├── debian ├── compat ├── source │ └── format ├── python3-anycast-healthchecker.install ├── patches │ ├── series │ ├── anycast-healthchecker.service.patch │ ├── anycast-healthchecker.conf.patch │ └── setup.cfg.patch ├── anycast-healthchecker.install ├── anycast-healthchecker.default ├── anycast-healthchecker.dirs ├── anycast-healthchecker.docs ├── anycast-healthchecker.postinst ├── rules ├── changelog ├── prepare-environment ├── HOWTO ├── copyright └── control ├── requirements.txt ├── contrib ├── SysVinit │ ├── anycast-healthchecker.sysconfig │ ├── README │ └── anycast-healthchecker.init ├── puppet │ └── anycast_healthchecker │ │ ├── templates │ │ ├── motd.erb │ │ ├── tmpfiles.conf.erb │ │ ├── check.conf.erb │ │ └── anycast-healthchecker.conf.erb │ │ ├── files │ │ └── anycast-healthchecker.sysconfig │ │ ├── manifests │ │ ├── install.pp │ │ ├── service.pp │ │ ├── motd.pp │ │ ├── sudo_access.pp │ │ ├── params.pp │ │ ├── config.pp │ │ ├── check.pp │ │ └── init.pp │ │ └── README.md ├── systemd │ └── anycast-healthchecker.service └── nagios │ └── check_anycast_healthchecker.py ├── anycast-receivers-example1.png ├── anycast-receivers-example2.png ├── anycast-receivers-example3.png ├── bird_daemon_rib_explained.png ├── bird_daemon_filter_explained.png ├── setup.py ├── TODO.rst ├── AUTHORS ├── .github └── workflows │ └── pylint.yml ├── anycast-healthchecker.conf ├── setup.cfg ├── pyproject.toml ├── anycast_healthchecker ├── __init__.py ├── main.py ├── healthchecker.py ├── servicecheck.py └── utils.py ├── local_run.sh ├── LICENSE.md ├── ChangeLog └── README.rst /debian/compat: -------------------------------------------------------------------------------- 1 | 10 2 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (quilt) 2 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | docopt 2 | prometheus-client 3 | python-json-logger 4 | -------------------------------------------------------------------------------- /contrib/SysVinit/anycast-healthchecker.sysconfig: -------------------------------------------------------------------------------- 1 | RUNASUSER="healthchecker" 2 | -------------------------------------------------------------------------------- /debian/python3-anycast-healthchecker.install: -------------------------------------------------------------------------------- 1 | usr/lib/python3.*/dist-packages/* 2 | -------------------------------------------------------------------------------- /contrib/SysVinit/README: -------------------------------------------------------------------------------- 1 | You can use this SysV init script only with versions < 0.8.0. 2 | -------------------------------------------------------------------------------- /debian/patches/series: -------------------------------------------------------------------------------- 1 | anycast-healthchecker.conf.patch 2 | anycast-healthchecker.service.patch 3 | setup.cfg.patch 4 | -------------------------------------------------------------------------------- /anycast-receivers-example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unixsurfer/anycast_healthchecker/HEAD/anycast-receivers-example1.png -------------------------------------------------------------------------------- /anycast-receivers-example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unixsurfer/anycast_healthchecker/HEAD/anycast-receivers-example2.png -------------------------------------------------------------------------------- /anycast-receivers-example3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unixsurfer/anycast_healthchecker/HEAD/anycast-receivers-example3.png -------------------------------------------------------------------------------- /bird_daemon_rib_explained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unixsurfer/anycast_healthchecker/HEAD/bird_daemon_rib_explained.png -------------------------------------------------------------------------------- /bird_daemon_filter_explained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unixsurfer/anycast_healthchecker/HEAD/bird_daemon_filter_explained.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import setuptools 4 | 5 | setuptools.setup( 6 | setup_requires=['pbr'], 7 | pbr=True) 8 | -------------------------------------------------------------------------------- /debian/anycast-healthchecker.install: -------------------------------------------------------------------------------- 1 | usr/bin/* 2 | anycast-healthchecker.conf etc/anycast-healthchecker/ 3 | debian/prepare-environment usr/share/anycast-healthchecker/ 4 | -------------------------------------------------------------------------------- /debian/anycast-healthchecker.default: -------------------------------------------------------------------------------- 1 | RUN_AS_USER="bird" 2 | RUN_AS_GROUP="bird" 3 | OPTIONS="-f /etc/anycast-healthchecker/anycast-healthchecker.conf -d /etc/anycast-healthchecker/check.d/" 4 | -------------------------------------------------------------------------------- /debian/anycast-healthchecker.dirs: -------------------------------------------------------------------------------- 1 | /etc/anycast-healthchecker/ 2 | /etc/anycast-healthchecker/check.d/ 3 | 
/usr/share/anycast-healthchecker/ 4 | /var/lib/anycast-healthchecker/6/ 5 | /var/log/anycast-healthchecker/ 6 | -------------------------------------------------------------------------------- /debian/anycast-healthchecker.docs: -------------------------------------------------------------------------------- 1 | AUTHORS 2 | README.rst 3 | TODO.rst 4 | anycast-receivers-example1.png 5 | anycast-receivers-example2.png 6 | anycast-receivers-example3.png 7 | bird_daemon_filter_explained.png 8 | bird_daemon_rib_explained.png 9 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/templates/motd.erb: -------------------------------------------------------------------------------- 1 | <% 2 | width=73 3 | 4 | if @motd_text && @motd_text.length > 0 -%> 5 | *<%= ''.center(width,' ') %>* 6 | <% @motd_text.each do |line| -%> 7 | * <%= line.ljust(width-2,' ') %> * 8 | <% end -%> 9 | <% end -%> 10 | -------------------------------------------------------------------------------- /debian/anycast-healthchecker.postinst: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | . /etc/default/anycast-healthchecker 6 | 7 | for dir in /var/log/anycast-healthchecker/ /var/lib/anycast-healthchecker/ /var/lib/anycast-healthchecker/6/ ; do 8 | chown "${RUN_AS_USER}" "${dir}" 9 | done 10 | 11 | #DEBHELPER# 12 | -------------------------------------------------------------------------------- /TODO.rst: -------------------------------------------------------------------------------- 1 | TODO 2 | ==== 3 | 4 | #. Improve the way we handle timeouts/errors when we run ip tool 5 | 6 | #. Consider switching from threads to asyncio, which requires dropping support for 7 | Python versions < 3.5. I can live with that. We should do that only when 8 | the number of service checks is higher than ~50. 
9 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/files/anycast-healthchecker.sysconfig: -------------------------------------------------------------------------------- 1 | ################################################### 2 | # File is managed by puppet # 3 | # Module: anycast_healthchecker # 4 | # Path: files/anycast-healthchecker.sysconfig # 5 | ################################################### 6 | RUNASUSER="healthchecker" 7 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/install.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker::install 2 | # 3 | # This class manages anycast_healthchecker parameters 4 | # 5 | class anycast_healthchecker::install { 6 | assert_private() 7 | package { 8 | $::anycast_healthchecker::package_name: 9 | ensure => $::anycast_healthchecker::package_version; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/templates/tmpfiles.conf.erb: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | # File is managed by puppet # 3 | # Module: anycast_healthchecker # 4 | # Path: templates/tmpfiles.conf.erb # 5 | ####################################################@ 6 | d /run/anycast-healthchecker 0755 <%= @user %> <%= @group %> 7 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/service.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker::service 2 | # 3 | # This class manages anycast-healthchecker service 4 | # 5 | class anycast_healthchecker::service { 6 | assert_private() 7 | service { 8 | 
$::anycast_healthchecker::service_name: 9 | ensure => $::anycast_healthchecker::service_ensure, 10 | enable => $::anycast_healthchecker::service_enable; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/templates/check.conf.erb: -------------------------------------------------------------------------------- 1 | [<%= @name%>] 2 | interface = <%= @interface %> 3 | check_cmd = <%= @check_cmd%> 4 | check_interval = <%= @check_interval%> 5 | check_timeout = <%= @check_timeout%> 6 | check_rise = <%= @check_rise%> 7 | check_fail = <%= @check_fail%> 8 | check_disabled = <%= @check_disabled%> 9 | on_disabled = <%= @on_disabled%> 10 | ip_prefix = <%= @ip_prefix%> 11 | <%- if @ip_check_disabled -%> 12 | ip_check_disabled = <%= @ip_check_disabled%> 13 | <%- end -%> 14 | -------------------------------------------------------------------------------- /contrib/systemd/anycast-healthchecker.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Anycast healthchecker 3 | After=network.target 4 | Requires=network.target 5 | Documentation=https://github.com/unixsurfer/anycast_healthchecker/blob/master/README.rst 6 | 7 | [Service] 8 | Type=simple 9 | EnvironmentFile=/etc/sysconfig/anycast-healthchecker 10 | TimeoutStartSec=0 11 | User=healthchecker 12 | Group=healthchecker 13 | ExecStart=/usr/bin/anycast-healthchecker $OPTIONS 14 | Restart=on-failure 15 | 16 | [Install] 17 | WantedBy=multi-user.target 18 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/motd.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker::motd 2 | # 3 | # This class installs a motd message. 
4 | # 5 | class anycast_healthchecker::motd { 6 | assert_private() 7 | $motd_text = [ 8 | "Anycast-healthchecker runs here", 9 | "- Configuration files: $::anycast_healthchecker::configuration_dir/", 10 | "- Log files: $::anycast_healthchecker::log_dir/", 11 | ] 12 | motd::fragment { 13 | "20-motd-$::anycast_healthchecker::service_name": 14 | ensure => $::anycast_healthchecker::motd_ensure, 15 | content => template('anycast_healthchecker/motd.erb'); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # See debhelper(7) (uncomment to enable) 3 | # output every command that modifies files on the build system. 4 | #DH_VERBOSE = 1 5 | 6 | # see EXAMPLES in dpkg-buildflags(1) and read /usr/share/dpkg/* 7 | DPKG_EXPORT_BUILDFLAGS = 1 8 | include /usr/share/dpkg/default.mk 9 | 10 | # main packaging script based on dh7 syntax 11 | %: 12 | dh $@ --with python3 --buildsystem=pybuild 13 | 14 | 15 | override_dh_auto_install: 16 | cp $(CURDIR)/contrib/systemd/anycast-healthchecker.service $(CURDIR)/debian/anycast-healthchecker.service 17 | 18 | dh_auto_install 19 | 20 | 21 | override_dh_installinit: 22 | dh_installinit -n 23 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Andras Temesvary 2 | Carlo Rengo 3 | Corubba <97832352+corubba@users.noreply.github.com> 4 | Daniel Hermann 5 | Greg Cox 6 | Jose Pedro Oliveira 7 | José Pedro Oliveira 8 | Kostis Fardelas 9 | Maximilian Wilhelm 10 | Miro Hrončok 11 | Pavlos Parissis 12 | Pavlos Parissis 13 | Ralf Ertzinger 14 | Shane Ramey 15 | ndemou 16 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | 
anycast-healthchecker (0.9.8-1) stable; urgency=medium 2 | 3 | * Release version 0.9.8. 4 | 5 | -- Pavlos Parissis Tue, 05 Dec 2023 20:04:44 +0100 6 | 7 | anycast-healthchecker (0.9.1-1) stable; urgency=medium 8 | 9 | * Release version 0.9.1. 10 | 11 | -- Pavlos Parissis Sun, 02 Aug 2020 09:13:49 +0200 12 | 13 | anycast-healthchecker (0.9.0-1) stable; urgency=medium 14 | 15 | * Release version 0.9.0. 16 | 17 | -- Pavlos Parissis Mon, 15 Jul 2019 17:30:48 +0200 18 | 19 | anycast-healthchecker (0.8.1-1) stable; urgency=medium 20 | 21 | * Initial release. 22 | 23 | -- Maximilian Wilhelm Wed, 07 Feb 2018 22:12:07 +0100 24 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install pylint 21 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 22 | - name: Analysing the code with pylint 23 | run: | 24 | pylint $(git ls-files '*.py') 25 | -------------------------------------------------------------------------------- /debian/prepare-environment: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # (C) 2018 Maximilian Wilhelm 4 | # -- Wed, 07 Feb 2018 22:13:35 +0100 5 | # 6 | 7 | set -eu 8 | 9 | RUN_DIR=/var/run/anycast-healthchecker 10 | . /etc/default/anycast-healthchecker 11 | 12 | 13 | mkdir --parents "${RUN_DIR}"; 14 | 15 | OWNER="" 16 | 17 | if [ "${RUN_AS_USER}" ]; then 18 | if ! 
getent passwd "${RUN_AS_USER}" >/dev/null; then 19 | echo "Configured user '$RUN_AS_USER' doesn't exist." 20 | exit 1 21 | fi 22 | 23 | OWNER="${RUN_AS_USER}" 24 | fi 25 | 26 | if [ "${RUN_AS_GROUP}" ]; then 27 | if ! getent group $RUN_AS_GROUP >/dev/null; then 28 | echo "Configured group '$RUN_AS_GROUP' doesn't exist." 29 | exit 1 30 | fi 31 | 32 | OWNER="${RUN_AS_USER}:${RUN_AS_GROUP}" 33 | fi 34 | 35 | if [ "${OWNER}" ]; then 36 | chown --silent "${OWNER}" "${RUN_DIR}" 37 | fi 38 | -------------------------------------------------------------------------------- /debian/HOWTO: -------------------------------------------------------------------------------- 1 | Building a Debian package for anycast healthchecker is rather easy. Just follow 2 | the steps below. Building the package from within GIT is not encouraged as some 3 | part of the building system is trying to be smart and does naughty things with 4 | the ChangeLog and AUTHORS file and stuff. 5 | 6 | Here you go: 7 | 8 | Create a pristine build dir 9 | 10 | mkdir /tmp/ah-build 11 | 12 | Create a tarball for the latest stable version 13 | 14 | git archive --format tar --prefix=anycast-healthchecker-0.9.8/ HEAD | xz > /tmp/ah-build/anycast-healthchecker_0.9.8.orig.tar.xz 15 | 16 | Extract the tarball and change into the build directory 17 | 18 | cd /tmp/ah-build && tar xf anycast-healthchecker_0.9.8.orig.tar.xz && cd anycast-healthchecker-0.9.8 19 | 20 | Build the package 21 | 22 | dpkg-buildpackage -uc -us -rfakeroot 23 | 24 | Enjoy :-) 25 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/sudo_access.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker::sudo_access 2 | # 3 | # This class configures sudo access for healthchecker account 4 | # 5 | class anycast_healthchecker::sudo_access { 6 | assert_private() 7 | sudo::access{ 8 | $::anycast_healthchecker::user: 
9 | commands => [ 10 | '/usr/sbin/birdcl configure', 11 | '/usr/sbin/birdcl6 configure', 12 | '/usr/sbin/birdc configure', 13 | '/usr/sbin/birdc6 configure', 14 | '/usr/local/bin/devkvmpuppet_anycast_healthchecker.sh', 15 | '/usr/local/bin/puppet_anycast_healthchecker.sh', 16 | '/usr/local/bin/puppetdb_anycast_healthchecker.sh', 17 | ]; 18 | 'nagios-anycast': 19 | group => 'nagios', 20 | commands => [ 21 | '/usr/lib64/nagios/plugins/check_anycast_healthchecker.py', 22 | '/usr/lib64/nagios/plugins/check_anycast_healthchecker_threads.py', 23 | ]; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /debian/patches/anycast-healthchecker.service.patch: -------------------------------------------------------------------------------- 1 | diff --git a/contrib/systemd/anycast-healthchecker.service b/contrib/systemd/anycast-healthchecker.service 2 | index fc9f044..aed8cb3 100644 3 | --- a/contrib/systemd/anycast-healthchecker.service 4 | +++ b/contrib/systemd/anycast-healthchecker.service 5 | @@ -1,17 +1,19 @@ 6 | [Unit] 7 | Description=Anycast healthchecker 8 | After=network.target 9 | Requires=network.target 10 | Documentation=https://github.com/unixsurfer/anycast_healthchecker/blob/master/README.rst 11 | 12 | [Service] 13 | Type=simple 14 | -EnvironmentFile=/etc/sysconfig/anycast-healthchecker 15 | +EnvironmentFile=/etc/default/anycast-healthchecker 16 | TimeoutStartSec=0 17 | -User=healthchecker 18 | -Group=healthchecker 19 | +User=bird 20 | +Group=bird 21 | +PermissionsStartOnly=true 22 | +ExecStartPre=/usr/share/anycast-healthchecker/prepare-environment 23 | ExecStart=/usr/bin/anycast-healthchecker $OPTIONS 24 | Restart=on-failure 25 | 26 | [Install] 27 | WantedBy=multi-user.target 28 | -------------------------------------------------------------------------------- /anycast-healthchecker.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | interface = lo 3 | 4 | 
[daemon] 5 | pidfile = /var/run/anycast-healthchecker/anycast-healthchecker.pid 6 | ipv4 = true 7 | ipv6 = false 8 | bird_conf = /var/lib/anycast-healthchecker/anycast-prefixes.conf 9 | bird6_conf = /var/lib/anycast-healthchecker/6/anycast-prefixes.conf 10 | bird_variable = ACAST_PS_ADVERTISE 11 | bird6_variable = ACAST6_PS_ADVERTISE 12 | bird_reconfigure_cmd = sudo /usr/sbin/birdc configure 13 | bird6_reconfigure_cmd = sudo /usr/sbin/birdc6 configure 14 | dummy_ip_prefix = 10.189.200.255/32 15 | dummy_ip6_prefix = 2001:db8::1/128 16 | bird_keep_changes = false 17 | bird6_keep_changes = false 18 | bird_changes_counter = 128 19 | bird6_changes_counter = 128 20 | purge_ip_prefixes = false 21 | loglevel = debug 22 | log_maxbytes = 104857600 23 | log_backups = 8 24 | json_stdout = false 25 | json_log_file = false 26 | json_log_server = false 27 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: anycast-healthchecker 3 | Source: https://github.com/unixsurfer/anycast_healthchecker 4 | 5 | Files: * 6 | Copyright: 2014-2018 Pavlos Parissis 7 | License: Apache-2.0 8 | 9 | Files: debian/* 10 | Copyright: 2018 Maximilian Wilhelm 11 | License: Apache-2.0 12 | 13 | License: Apache-2.0 14 | Licensed under the Apache License, Version 2.0 (the "License"); 15 | you may not use this file except in compliance with the License. 16 | You may obtain a copy of the License at 17 | . 18 | http://www.apache.org/licenses/LICENSE-2.0 19 | . 20 | Unless required by applicable law or agreed to in writing, software 21 | distributed under the License is distributed on an "AS IS" BASIS, 22 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 | See the License for the specific language governing permissions and 24 | limitations under the License. 
25 | . 26 | On Debian systems, the complete text of the Apache version 2.0 license 27 | can be found in "/usr/share/common-licenses/Apache-2.0". 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = anycast-healthchecker 3 | author = Pavlos Parissis 4 | author-email = pavlos.parissis@gmail.com 5 | maintainer = Pavlos Parissis 6 | maintainer-email = pavlos.parissis@gmail.com 7 | summary = A healthchecker for Anycasted Services 8 | home-page = https://github.com/unixsurfer/anycast_healthchecker 9 | license = Apache 2.0 10 | classifier = 11 | Development Status :: 5 - Production/Stable 12 | Environment :: Console 13 | Intended Audience :: Information Technology 14 | Intended Audience :: System Administrators 15 | Natural Language :: English 16 | Operating System :: POSIX 17 | Programming Language :: Python :: 3.4 18 | Programming Language :: Python :: 3.5 19 | Topic :: System :: Monitoring 20 | Topic :: Utilities 21 | install_requires = 22 | docopt 23 | prometheus-client 24 | python-json-logger 25 | keywords = healthchecker anycast ECMP 26 | 27 | [files] 28 | packages = 29 | anycast_healthchecker 30 | anycast 31 | healthchecker 32 | Equal-Cost Multi-Pathing 33 | monitor 34 | 35 | [entry_points] 36 | console_scripts = 37 | anycast-healthchecker = anycast_healthchecker.main:main 38 | -------------------------------------------------------------------------------- /debian/patches/anycast-healthchecker.conf.patch: -------------------------------------------------------------------------------- 1 | diff --git a/anycast-healthchecker.conf b/anycast-healthchecker.conf 2 | index 35f08bc..bc9fea7 100644 3 | --- a/anycast-healthchecker.conf 4 | +++ b/anycast-healthchecker.conf 5 | @@ -4,13 +4,13 @@ interface = lo 6 | [daemon] 7 | pidfile = /var/run/anycast-healthchecker/anycast-healthchecker.pid 8 | ipv4 = true 9 | -ipv6 = false 
10 | +ipv6 = true 11 | bird_conf = /var/lib/anycast-healthchecker/anycast-prefixes.conf 12 | bird6_conf = /var/lib/anycast-healthchecker/6/anycast-prefixes.conf 13 | bird_variable = ACAST_PS_ADVERTISE 14 | bird6_variable = ACAST6_PS_ADVERTISE 15 | -bird_reconfigure_cmd = sudo /usr/sbin/birdc configure 16 | -bird6_reconfigure_cmd = sudo /usr/sbin/birdc6 configure 17 | +bird_reconfigure_cmd = /usr/sbin/birdc configure 18 | +bird6_reconfigure_cmd = /usr/sbin/birdc6 configure 19 | dummy_ip_prefix = 10.189.200.255/32 20 | dummy_ip6_prefix = 2001:db8::1/128 21 | bird_keep_changes = false 22 | @@ -18,7 +18,7 @@ bird6_keep_changes = false 23 | bird_changes_counter = 128 24 | bird6_changes_counter = 128 25 | purge_ip_prefixes = false 26 | -loglevel = debug 27 | +loglevel = info 28 | log_maxbytes = 104857600 29 | log_backups = 8 30 | json_stdout = false 31 | -------------------------------------------------------------------------------- /debian/patches/setup.cfg.patch: -------------------------------------------------------------------------------- 1 | diff --git a/setup.cfg b/setup.cfg 2 | deleted file mode 100644 3 | index 8a1b56a..0000000 4 | --- a/setup.cfg 5 | +++ /dev/null 6 | @@ -1,37 +0,0 @@ 7 | -[metadata] 8 | -name = anycast-healthchecker 9 | -author = Pavlos Parissis 10 | -author-email = pavlos.parissis@gmail.com 11 | -maintainer = Pavlos Parissis 12 | -maintainer-email = pavlos.parissis@gmail.com 13 | -summary = A healthchecker for Anycasted Services 14 | -home-page = https://github.com/unixsurfer/anycast_healthchecker 15 | -license = Apache 2.0 16 | -classifier = 17 | - Development Status :: 5 - Production/Stable 18 | - Environment :: Console 19 | - Intended Audience :: Information Technology 20 | - Intended Audience :: System Administrators 21 | - Natural Language :: English 22 | - Operating System :: POSIX 23 | - Programming Language :: Python :: 3.4 24 | - Programming Language :: Python :: 3.5 25 | - Topic :: System :: Monitoring 26 | - Topic :: Utilities 27 
| -install_requires = 28 | - python-json-logger 29 | - docopt 30 | - prometheus_client 31 | -keywords = healthchecker anycast ECMP 32 | - 33 | -[files] 34 | -packages = 35 | - anycast_healthchecker 36 | - anycast 37 | - healthchecker 38 | - Equal-Cost Multi-Pathing 39 | - monitor 40 | - 41 | -[entry_points] 42 | -console_scripts = 43 | - anycast-healthchecker = anycast_healthchecker.main:main 44 | diff --git a/setup.py b/setup.py 45 | deleted file mode 100644 46 | index b96e399..0000000 47 | --- a/setup.py 48 | +++ /dev/null 49 | @@ -1,7 +0,0 @@ 50 | -#!/usr/bin/env python 51 | - 52 | -import setuptools 53 | - 54 | -setuptools.setup( 55 | - setup_requires=['pbr'], 56 | - pbr=True) 57 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/templates/anycast-healthchecker.conf.erb: -------------------------------------------------------------------------------- 1 | #################################################### 2 | # File is managed by puppet # 3 | # Module: anycast_healthchecker # 4 | # Path: templates/anycast-healthchecker.conf.erb # 5 | #################################################### 6 | [DEFAULT] 7 | interface = lo 8 | 9 | [daemon] 10 | pidfile = <%= @pidfile %> 11 | bird_conf = <%= @bird_conf %> 12 | bird_variable = <%= @bird_variable %> 13 | bird_reconfigure_cmd = <%= @bird_reconfigure_cmd %> 14 | bird_keep_changes = <%= @bird_keep_changes %> 15 | bird_changes_counter = <%= @bird_changes_counter %> 16 | loglevel = <%= @log_level %> 17 | log_maxbytes = <%= @log_maxbytes %> 18 | log_backups = <%= @log_backups %> 19 | log_file = <%= @log_file %> 20 | stderr_file = <%= @stderr_file %> 21 | stdout_file = <%= @stdout_file %> 22 | dummy_ip_prefix = <%= @dummy_ip_prefix %> 23 | <%- if @ipv6 -%> 24 | ipv6 = <%= @ipv6 %> 25 | bird6_conf = <%= @bird6_conf %> 26 | bird6_variable = <%= @bird6_variable %> 27 | bird6_reconfigure_cmd = <%= @bird6_reconfigure_cmd %> 28 | bird6_keep_changes = <%= 
@bird6_keep_changes %> 29 | bird6_changes_counter = <%= @bird6_changes_counter %> 30 | dummy_ip6_prefix = <%= @dummy_ip6_prefix %> 31 | <%- end -%> 32 | <%- if @json_logging -%> 33 | json_logging = true 34 | http_server = <%= @http_server %> 35 | http_server_protocol = <%= @http_server_protocol %> 36 | http_server_port = <%= @http_server_port %> 37 | http_server_timeout = <%= @http_server_timeout -%> 38 | <%- end %> 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "anycast-healthchecker" 3 | description = "A healthchecker for Anycasted Services" 4 | authors = [{name = "Pavlos Parissis", email = "pavlos.parissis@gmail.com"}] 5 | license = {text = "Apache 2.0"} 6 | keywords = ["healthchecker", "anycast", "ECMP"] 7 | classifiers = [ 8 | "Development Status :: 5 - Production/Stable", 9 | "Environment :: Console", 10 | "Intended Audience :: Information Technology", 11 | "Intended Audience :: System Administrators", 12 | "Natural Language :: English", 13 | "Operating System :: POSIX", 14 | "Programming Language :: Python :: 3", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Programming Language :: Python :: 3.13", 20 | "Topic :: System :: Monitoring", 21 | "Topic :: Utilities", 22 | ] 23 | dynamic = ["version", "readme"] 24 | dependencies = [ 25 | "docopt", 26 | "prometheus-client", 27 | "python-json-logger", 28 | ] 29 | 30 | [project.urls] 31 | documentation = "https://github.com/unixsurfer/anycast_healthchecker#readme" 32 | repository = "https://github.com/unixsurfer/anycast_healthchecker" 33 | 34 | [project.scripts] 35 | anycast-healthchecker = 'anycast_healthchecker.main:main' 36 | 37 | [tool.setuptools] 38 | packages = ["anycast_healthchecker"] 39 | 40 | 
[tool.setuptools.dynamic] 41 | version = {attr = "anycast_healthchecker.__version__"} 42 | readme = {file = ["README.rst"], content-type = "text/x-rst"} 43 | 44 | [build-system] 45 | requires = ["setuptools>=61", "wheel"] 46 | build-backend = "setuptools.build_meta" 47 | 48 | [tool.isort] 49 | # https://pycqa.github.io/isort/docs/configuration/profiles.html 50 | profile = "black" 51 | 52 | [tool.pylint.'MESSAGES CONTROL'] 53 | disable = [ 54 | "line-too-long", 55 | "missing-module-docstring", 56 | "too-many-arguments", 57 | "too-many-instance-attributes", 58 | "too-many-locals", 59 | "too-many-branches", 60 | ] 61 | -------------------------------------------------------------------------------- /anycast_healthchecker/__init__.py: -------------------------------------------------------------------------------- 1 | # vim:fenc=utf-8 2 | # 3 | """A healthchecker for Anycasted services.""" 4 | __title__ = 'anycast_healthchecker' 5 | __author__ = 'Pavlos Parissis' 6 | __license__ = 'Apache 2.0' 7 | __version__ = '0.9.10' 8 | __copyright__ = 'Copyright 2015-2025 Pavlos Parissis' 9 | 10 | PROGRAM_NAME = __title__.replace('_', '-') 11 | METRIC_PREFIX = __title__ 12 | 13 | 14 | DEFAULT_OPTIONS = { 15 | 'DEFAULT': { 16 | 'interface': 'lo', 17 | 'check_interval': 10, 18 | 'check_timeout': 2, 19 | 'check_rise': 2, 20 | 'check_fail': 2, 21 | 'check_disabled': 'true', 22 | 'on_disabled': 'withdraw', 23 | 'ip_check_disabled': 'false', 24 | 'custom_bird_reconfigure_cmd_timeout': 2, 25 | }, 26 | 'daemon': { 27 | 'ipv4': 'true', 28 | 'ipv6': 'false', 29 | 'bird_conf': '/var/lib/anycast-healthchecker/anycast-prefixes.conf', 30 | 'bird6_conf': '/var/lib/anycast-healthchecker/6/anycast-prefixes.conf', 31 | 'bird_variable': 'ACAST_PS_ADVERTISE', 32 | 'bird6_variable': 'ACAST6_PS_ADVERTISE', 33 | 'bird_reconfigure_cmd': 'sudo /usr/sbin/birdc configure', 34 | 'bird6_reconfigure_cmd': 'sudo /usr/sbin/birdc6 configure', 35 | 'dummy_ip_prefix': '10.189.200.255/32', 36 | 'dummy_ip6_prefix': 
#!/bin/sh
#
# healthchecker        A Python healthchecker for Anycasted services
#
# chkconfig: 2345 90 10
# description: A healthchecker daemon to run checks against multiple services \
#              and either advertise or withdraw a route for an IP_PREFIX
#              which is associated with the service.

# Source function library.
. /etc/rc.d/init.d/functions

exec="/usr/bin/anycast-healthchecker"
prog="anycast-healthchecker"

# OPTIONS, RUNASUSER and CONFIG may be overridden from /etc/sysconfig/$prog.
OPTIONS=
[ -e "/etc/sysconfig/$prog" ] && . "/etc/sysconfig/$prog"
runasuser=${RUNASUSER:-healthchecker}
# Configuration file whose presence is required before starting the daemon.
# The default matches the daemon's own default for --file; set CONFIG in the
# sysconfig file when OPTIONS points the daemon to another file with -f.
config=${CONFIG:-/etc/$prog.conf}

pidfile="/var/run/$prog/${prog}.pid"
lockfile="/var/lock/subsys/$prog"

# Start the daemon as $runasuser and create the subsys lock file on success.
# Exits 5 when the executable is missing and 6 when the configuration file is
# missing.
start() {
    [ -x "$exec" ] || exit 5
    [ -f "$config" ] || exit 6
    echo -n $"Starting $prog: "
    # $OPTIONS is intentionally unquoted so that it is split into arguments.
    daemon --user "$runasuser" --pidfile "$pidfile" "$exec" $OPTIONS
    retval=$?
    echo
    [ $retval -eq 0 ] && touch "$lockfile"
    return $retval
}

# Stop the daemon identified by the pid file and drop the lock file on success.
stop() {
    echo -n $"Stopping $prog: "
    killproc -p "$pidfile" "$prog"
    retval=$?
    echo
    [ $retval -eq 0 ] && rm -f "$lockfile"
    return $retval
}

restart() {
    stop
    start
}

# Reload is implemented as a full restart.
reload() {
    restart
}

force_reload() {
    restart
}

rh_status() {
    status -p "$pidfile" "$prog"
}

# Quiet variant of rh_status, used only for its exit status.
rh_status_q() {
    rh_status >/dev/null 2>&1
}


case "$1" in
    start)
        rh_status_q && exit 0
        $1
        ;;
    stop)
        rh_status_q || exit 0
        $1
        ;;
    restart)
        $1
        ;;
    reload)
        rh_status_q || exit 7
        $1
        ;;
    force-reload)
        force_reload
        ;;
    status)
        rh_status
        ;;
    condrestart|try-restart)
        rh_status_q || exit 0
        restart
        ;;
    *)
        echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}"
        exit 2
esac
exit $?
21 | Bird must be configured in a certain way to interface properly with 22 | anycast-healthchecker. 23 | . 24 | anycast-healthchecker is a Python program operated as a native 25 | systemd service. It uses threading to run multiple service checks in 26 | parallel. 27 | . 28 | This package installs the library for Python 3. 29 | 30 | Package: anycast-healthchecker 31 | Architecture: all 32 | Depends: ${python3:Depends}, ${misc:Depends}, bird, python3-anycast-healthchecker (= ${source:Version}) 33 | Description: Healthchecker for Anycasted services 34 | anycast-healthchecker monitors a service by doing periodic health checks and 35 | based on the result instructs Bird daemon to either advertise or withdraw the 36 | route to reach the monitored service. As a result Bird will only advertise 37 | routes for healthy services. Routes for IPv4 and IPv6 addresses are supported. 38 | . 39 | Bird must be configured in a certain way to interface properly with 40 | anycast-healthchecker. 41 | . 42 | anycast-healthchecker is a Python program operated as a native 43 | systemd service. It uses threading to run multiple service checks in 44 | parallel. 45 | . 46 | This package contains the files required to run the tool as 47 | a native systemd service. 
48 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/params.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker::params 2 | # 3 | # This class manages anycast_healthchecker parameters 4 | # 5 | class anycast_healthchecker::params { 6 | $service_name = 'anycast-healthchecker' 7 | $var_lib_dir = "/var/lib/${service_name}" 8 | $var_lib_dir6 = "/var/lib/${service_name}/6" 9 | $bird_conf = "${var_lib_dir}/anycast-prefixes.conf" 10 | $bird6_conf = "${var_lib_dir6}/anycast-prefixes.conf" 11 | $bird_variable = 'ACAST_PS_ADVERTISE' 12 | $bird6_variable = 'ACAST6_PS_ADVERTISE' 13 | $bird_reconfigure_cmd = 'sudo /usr/sbin/birdc configure' 14 | $bird6_reconfigure_cmd = 'sudo /usr/sbin/birdc6 configure' 15 | $bird_keep_changes = false 16 | $bird6_keep_changes = false 17 | $bird_changes_counter = 128 18 | $bird6_changes_counter = 128 19 | $configuration_dir = '/etc/anycast-healthchecker.d' 20 | $configuration_file = '/etc/anycast-healthchecker.conf' 21 | $dummy_ip_prefix = '10.189.200.255/32' 22 | $dummy_ip6_prefix = '2001:db8::1/128' 23 | $group = 'healthchecker' 24 | $http_server = '127.0.0.1' 25 | $http_server_port = 2813 26 | $http_server_protocol = 'http' 27 | $http_server_timeout = 0.2 28 | $ipv4 = true 29 | $ipv6 = false 30 | $json_logging = false 31 | $log_level = 'info' 32 | $log_maxbytes = 104857600 33 | $log_backups = 8 34 | $log_dir = '/var/log/anycast-healthchecker' 35 | $log_file = "${log_dir}/anycast-healthchecker.log" 36 | $motd_ensure = present 37 | $package_name = 'blue-python34-anycast-healthchecker' 38 | $package_version = "0.7.3-1.el${::facts['lsbmajdistrelease']}" 39 | $pid_dir = "/var/run/${service_name}" 40 | $pidfile = "${pid_dir}/${service_name}.pid" 41 | $purge_directory = true 42 | $purge_ip_prefixes = false 43 | $service_enable = true 44 | $service_ensure = true 45 | $stderr_file = 
"${log_dir}/stderr.log" 46 | $stdout_file = "${log_dir}/stdout.log" 47 | $user = 'healthchecker' 48 | } 49 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/config.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker::config 2 | # 3 | # This class configures anycast-healthchecker 4 | # 5 | class anycast_healthchecker::config { 6 | assert_private() 7 | file { 8 | $::anycast_healthchecker::log_dir: 9 | ensure => directory, 10 | owner => $::anycast_healthchecker::user, 11 | group => $::anycast_healthchecker::group, 12 | mode => '0755'; 13 | } 14 | file { 15 | $::anycast_healthchecker::var_lib_dir: 16 | ensure => directory, 17 | owner => $::anycast_healthchecker::user, 18 | group => $::anycast_healthchecker::group, 19 | mode => '0755'; 20 | } 21 | 22 | $var_lib_dir6_ensure = $::anycast_healthchecker::ipv6 ? { 23 | true => directory, 24 | false => absent, 25 | } 26 | file { 27 | $::anycast_healthchecker::var_lib_dir6: 28 | ensure => $var_lib_dir6_ensure, 29 | owner => $::anycast_healthchecker::user, 30 | group => $::anycast_healthchecker::group, 31 | force => true, 32 | mode => '0755'; 33 | } 34 | file { 35 | $::anycast_healthchecker::pid_dir: 36 | ensure => directory, 37 | owner => $::anycast_healthchecker::user, 38 | group => $::anycast_healthchecker::group, 39 | mode => '0755'; 40 | } 41 | file { 42 | $::anycast_healthchecker::configuration_dir: 43 | ensure => directory, 44 | purge => $::anycast_healthchecker::purge_directory, 45 | recurse => $::anycast_healthchecker::purge_directory, 46 | owner => root, 47 | group => root, 48 | mode => '0755'; 49 | } 50 | $python_ver = regsubst($::anycast_healthchecker::package_name, '^blue-python(\d)(\d)-.*', '\1.\2') 51 | $check_cmd = "/opt/blue-python/${python_ver}/bin/anycast-healthchecker" 52 | file { 53 | $::anycast_healthchecker::configuration_file: 54 | mode => '0444', 55 | 
owner => root, 56 | group => root, 57 | content => template('anycast_healthchecker/anycast-healthchecker.conf.erb'), 58 | validate_cmd => "su -s /bin/bash - $::anycast_healthchecker::user -c \'${check_cmd} -c -f %\'", 59 | require => File[$::anycast_healthchecker::configuration_dir]; 60 | } 61 | file { 62 | 'sysconfig': 63 | path => '/etc/sysconfig/anycast-healthchecker', 64 | mode => '0444', 65 | owner => root, 66 | group => root, 67 | source => 'puppet:///modules/anycast_healthchecker/anycast-healthchecker.sysconfig'; 68 | } 69 | $tmpfiles_config_ensure = $::facts['lsbmajdistrelease'] ? { 70 | 6 => absent, # RedHat 5 isn't supported anymore 71 | default => file, 72 | } 73 | file { 74 | '/usr/lib/tmpfiles.d/anycast-healthchecker.conf': 75 | ensure => $tmpfiles_config_ensure, 76 | owner => root, 77 | group => root, 78 | mode => '0444', 79 | content => template('anycast_healthchecker/tmpfiles.conf.erb'); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/README.md: -------------------------------------------------------------------------------- 1 | # anycast_healthchecker 2 | 3 | #### Table of Contents 4 | 5 | 1. [Overview](#overview) 6 | 2. [Module Description - What the module does and why it is useful](#module-description) 7 | 3. [Setup - The basics of getting started with anycast_healthchecker](#setup) 8 | * [What anycast_healthchecker affects](#what-anycast_healthchecker-affects) 9 | * [Setup requirements](#setup-requirements) 10 | * [Beginning with anycast_healthchecker](#beginning-with-anycast_healthchecker) 11 | 4. [Usage - Configuration options and additional functionality](#usage) 12 | 5. [Reference - An under-the-hood peek at what the module is doing and how](#reference) 13 | 5. [Limitations - OS compatibility, etc.](#limitations) 14 | 6. 
[Development - Guide for contributing to the module](#development) 15 | 16 | ## Overview 17 | 18 | A one-maybe-two sentence summary of what the module does/what problem it solves. 19 | This is your 30 second elevator pitch for your module. Consider including 20 | OS/Puppet version it works with. 21 | 22 | ## Module Description 23 | 24 | If applicable, this section should have a brief description of the technology 25 | the module integrates with and what that integration enables. This section 26 | should answer the questions: "What does this module *do*?" and "Why would I use 27 | it?" 28 | 29 | If your module has a range of functionality (installation, configuration, 30 | management, etc.) this is the time to mention it. 31 | 32 | ## Setup 33 | 34 | ### What anycast_healthchecker affects 35 | 36 | * A list of files, packages, services, or operations that the module will alter, 37 | impact, or execute on the system it's installed on. 38 | * This is a great place to stick any warnings. 39 | * Can be in list or paragraph form. 40 | 41 | ### Setup Requirements **OPTIONAL** 42 | 43 | If your module requires anything extra before setting up (pluginsync enabled, 44 | etc.), mention it here. 45 | 46 | ### Beginning with anycast_healthchecker 47 | 48 | The very basic steps needed for a user to get the module up and running. 49 | 50 | If your most recent release breaks compatibility or requires particular steps 51 | for upgrading, you may wish to include an additional section here: Upgrading 52 | (For an example, see http://forge.puppetlabs.com/puppetlabs/firewall). 53 | 54 | ## Usage 55 | 56 | Put the classes, types, and resources for customizing, configuring, and doing 57 | the fancy stuff with your module here. 58 | 59 | ## Reference 60 | 61 | Here, list the classes, types, providers, facts, etc contained in your module. 
# == Define: anycast_healthchecker::check
#
# A defined type to configure anycast-healthchecker for monitoring an anycasted
# service. It renders a configuration file for the service check under the
# configuration directory of anycast-healthchecker and validates it with
# the daemon's own sanity check (-c -F) before it is put in place.
#
# === Parameters:
#
# [*ip_prefix*]          - The ip_prefix associated with the
#                          service in a IP address/prefix_len format.
#
# [*interface*]          - Interface that the service IP resides on.
#                          Defaults to 'lo'.
#
# [*check_cmd*]          - Full path of the command to run for
#                          healthchecking the service.
#
# [*check_interval*]     - Interval in seconds between checks.
#                          Must be higher than zero.
#
# [*check_timeout*]      - Maximum time in seconds to wait for
#                          a check to finish. Must be higher than zero.
#
# [*check_rise*]         - Number of consecutive successful checks
#                          to consider the service healthy.
#
# [*check_fail*]         - Number of consecutive unsuccessful
#                          checks to consider the service dead.
#
# [*check_disabled*]     - Disables check for service.
#
# [*on_disabled*]        - Action to take when check is disabled
#                          -- withdraw  => withdraw the ip_prefix
#                          -- advertise => advertise the ip_prefix
#
# [*ip_check_disabled*]  - true disables the assignment check of
#                          ip_prefix to the interface set in interface,
#                          false enables it.
#
# This define requires the following templates
#
#   anycast_healthchecker/check.conf.erb
#
# === Actions:
#
# -- Perform sanity checks for all given parameters
#
# === Requires:
#
# anycast_healthchecker class
#
# === Sample Usage:
#
#   anycast_healthchecker::check {
#     'for.bar.com':
#       ip_prefix => '10.189.200.1/32',
#       check_cmd => '/usr/bin/curl -o /dev/null http://10.189.200.1/';
#   }
#
# === Authors
#
# Pavlos Parissis
#
define anycast_healthchecker::check (
  Variant[Stdlib::IP::Address::V4::CIDR,
          Stdlib::IP::Address::V6::CIDR] $ip_prefix,
  String[1]                     $interface         = 'lo',
  String[1]                     $check_cmd         = '/bin/false',
  Numeric                       $check_interval    = 10,
  Numeric                       $check_timeout     = 5,
  Integer[1]                    $check_rise        = 2,
  Integer[1]                    $check_fail        = 2,
  Boolean                       $check_disabled    = false,
  Enum['withdraw', 'advertise'] $on_disabled       = 'withdraw',
  Boolean                       $ip_check_disabled = false,
) {

  # Zero is rejected as well: the messages below promise "higher than zero".
  if $check_interval <= 0 {
    fail("anycast_healthchecker::check::${name} check_interval must be higher than zero")
  }
  if $check_timeout <= 0 {
    fail("anycast_healthchecker::check::${name} check_timeout must be higher than zero")
  }

  # Derive "X.Y" from a package name of the form blue-pythonXY-... to locate
  # the anycast-healthchecker executable used for validating the file.
  $python_ver = regsubst($::anycast_healthchecker::package_name, '^blue-python(\d)(\d)-.*', '\1.\2')
  $_cmd = "/opt/blue-python/${python_ver}/bin/anycast-healthchecker"
  file {
    "${::anycast_healthchecker::configuration_dir}/${name}.conf":
      mode         => '0444',
      owner        => root,
      group        => root,
      notify       => Service[$::anycast_healthchecker::service_name],
      validate_cmd => "su -s /bin/bash - ${::anycast_healthchecker::user} -c \'${_cmd} -c -F %\'",
      content      => template('anycast_healthchecker/check.conf.erb');
  }
}
in DEFAULT_OPTIONS[section].items(): 54 | print(f"{key} = {value}") 55 | print() 56 | sys.exit(0) 57 | 58 | try: 59 | config, bird_configuration = load_configuration(args['--file'], 60 | args['--dir'], 61 | args['--service-file']) 62 | except ValueError as exc: 63 | sys.exit('Invalid configuration: ' + str(exc)) 64 | 65 | if args['--check']: 66 | print("OK") 67 | sys.exit(0) 68 | 69 | if args['--print-conf']: 70 | for section in config: 71 | print(f"[{section}]") 72 | for key, value in config[section].items(): 73 | print(f"{key} = {value}") 74 | print() 75 | sys.exit(0) 76 | 77 | try: 78 | lock_socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) 79 | lock_socket.bind('\0' + f"{PROGRAM_NAME}") 80 | except socket.error as exc: 81 | sys.exit("failed to acquire a lock by creating an abstract namespace" 82 | " socket: {}".format(exc)) 83 | else: 84 | print("acquired a lock by creating an abstract namespace socket: {}" 85 | .format(lock_socket)) 86 | 87 | # Clean old pidfile, if it exists, and write PID to it. 88 | pidfile = config.get('daemon', 'pidfile') 89 | update_pidfile(pidfile) 90 | 91 | # Register our shutdown handler to various termination signals. 92 | shutdown_handler = partial(shutdown, pidfile) 93 | signal.signal(signal.SIGHUP, shutdown_handler) 94 | signal.signal(signal.SIGTERM, shutdown_handler) 95 | signal.signal(signal.SIGABRT, shutdown_handler) 96 | signal.signal(signal.SIGINT, shutdown_handler) 97 | 98 | # Set up loggers. 99 | logger = setup_logger(config) 100 | 101 | # Perform a sanity check on IP-Prefixes 102 | ip_prefixes_sanity_check(config, bird_configuration) 103 | 104 | # Create our master process. 105 | checker = healthchecker.HealthChecker(config, bird_configuration) 106 | 107 | # Register our SIGURG handler to immediately trigger all checks. 
108 | signal.signal(signal.SIGURG, lambda signum, frame: checker.run_all_checks_now()) 109 | 110 | # and start working 111 | logger.info("starting %s version %s", PROGRAM_NAME, __version__) 112 | checker.run() 113 | 114 | 115 | # This is the standard boilerplate that calls the main() function. 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /local_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROGRAM=anycast-healthchecker 3 | TEST_DIR="${PWD}/var" 4 | DOTDIR="${TEST_DIR}/etc/"${PROGRAM}".d" 5 | PROGRAMCONF="${TEST_DIR}/etc/"${PROGRAM}".conf" 6 | PIDIFILE="${TEST_DIR}/var/run/"${PROGRAM}"/"${PROGRAM}".pid" 7 | directories=("${DOTDIR}" \ 8 | "${TEST_DIR}"/var/log/"${PROGRAM}" \ 9 | "${TEST_DIR}"/var/lib/"${PROGRAM}" \ 10 | "${TEST_DIR}"/var/lib/"${PROGRAM}"/6 \ 11 | "${TEST_DIR}"/var/run/"${PROGRAM}") 12 | 13 | echo "------------------------------------------" 14 | echo "--------create directory structure--------" 15 | echo "------------------------------------------" 16 | for dir in ${directories[@]}; do 17 | if [ ! -d "${dir}" ]; then 18 | mkdir -v -p "${dir}" 19 | fi 20 | done 21 | 22 | echo "------------------------------------------" 23 | echo "---------------create config--------------" 24 | echo "------------------------------------------" 25 | if [ ! 
-e ${PROGRAMCONF} ]; then 26 | echo "${PROGRAMCONF}" 27 | cat < "${PROGRAMCONF}" 28 | [DEFAULT] 29 | interface = lo 30 | 31 | [daemon] 32 | pidfile = ${PIDIFILE} 33 | loglevel = debug 34 | log_maxbytes = 104857600 35 | log_backups = 8 36 | log_file = ${TEST_DIR}/var/log/anycast-healthchecker/anycast-healthchecker.log 37 | stderr_file = ${TEST_DIR}/var/log/anycast-healthchecker/stderr.log 38 | 39 | ipv4 = true 40 | bird_conf = ${TEST_DIR}/var/lib/anycast-healthchecker/anycast-prefixes.conf 41 | bird_variable = ACAST_PS_ADVERTISE 42 | bird_keep_changes = true 43 | bird_changes_counter = 6 44 | bird_reconfigure_cmd = /usr/bin/sudo /usr/sbin/birdc configure 45 | dummy_ip_prefix = 10.189.200.255/32 46 | 47 | ipv6 = true 48 | bird6_conf = ${TEST_DIR}/var/lib/anycast-healthchecker/6/anycast-prefixes.conf 49 | bird6_variable = ACAST6_PS_ADVERTISE 50 | dummy_ip6_prefix = 2001:db8::1/128 51 | bird6_reconfigure_cmd = sudo /usr/sbin/birdc6 configure 52 | bird6_keep_changes = true 53 | bird6_changes_counter = 6 54 | EOT 55 | fi 56 | 57 | echo "------------------------------------------" 58 | echo "--------create service checks-------------" 59 | echo "------------------------------------------" 60 | if [ ! -e ${DOTDIR}/foo.bar.com.conf ]; then 61 | cat < ${DOTDIR}/foo.bar.com.conf 62 | [foo.bar.com] 63 | check_cmd = curl -A 'anycast-healthchecker' --fail --silent -o /dev/null http://10.52.12.1:8888 64 | check_interval = 10 65 | check_timeout = 5 66 | check_rise = 2 67 | check_fail = 2 68 | check_disabled = false 69 | on_disabled = withdraw 70 | ip_prefix = 10.52.12.1/32 71 | EOT 72 | fi 73 | 74 | if [ ! 
-e ${DOTDIR}/foo1.bar.com.conf ]; then 75 | cat < ${DOTDIR}/foo1.bar.com.conf 76 | [foo1.bar.com] 77 | check_cmd = curl -A 'anycast-healthchecker' --fail --silent -o /dev/null http://10.52.12.2:8888 78 | check_interval = 10 79 | check_timeout = 5 80 | check_rise = 2 81 | check_fail = 2 82 | check_disabled = false 83 | on_disabled = withdraw 84 | ip_prefix = 10.52.12.2/32 85 | EOT 86 | fi 87 | 88 | if [ ! -e ${DOTDIR}/fooIPv6.bar.com.conf ]; then 89 | cat < ${DOTDIR}/fooIPv6.bar.com.conf 90 | [foo1IPv6.bar.com] 91 | check_cmd = /usr/bin/curl --fail -o /dev/null 'http://[fd12:aba6:57db:ffff::1]:8888' 92 | check_timeout = 5 93 | check_rise = 2 94 | check_fail = 2 95 | check_disabled = false 96 | on_disabled = withdraw 97 | ip_prefix = fd12:aba6:57db:ffff::1/128 98 | ip_check_disabled = true 99 | EOT 100 | fi 101 | 102 | if [ ! -e ${DOTDIR}/foo1IPv6.bar.com.conf ]; then 103 | cat < ${DOTDIR}/foo1IPv6.bar.com.conf 104 | [foo1IPv6.bar.com] 105 | check_cmd = /usr/bin/curl --fail -o /dev/null 'http://[fd12:aba6:57db:ffff::2]:8888' 106 | check_timeout = 5 107 | check_rise = 2 108 | check_fail = 2 109 | check_disabled = false 110 | on_disabled = withdraw 111 | ip_prefix = fd12:aba6:57db:ffff::2/128 112 | ip_check_disabled = false 113 | EOT 114 | fi 115 | 116 | echo "------------------------------------------" 117 | echo "--------installing software---------------" 118 | echo "------------------------------------------" 119 | python3 -m pip install . 
120 | 121 | echo "------------------------------------------" 122 | echo "--------Assign IPs in loopback------------" 123 | echo "------------------------------------------" 124 | found () { 125 | local query="$1" 126 | shift 127 | while [ -n "$1" ]; do 128 | [ "${query}" == "${1}" ] && return 0 129 | shift 130 | done 131 | return 1 132 | } 133 | 134 | get_ips () { 135 | /sbin/ip addr show dev lo|awk '/inet/ {print $2}' 136 | } 137 | 138 | loopback_ips=( $(get_ips) ) 139 | to_be_configured=(127.0.0.1/8 \ 140 | 10.52.12.1/32 \ 141 | 10.52.12.2/32 \ 142 | 10.52.12.3/32 \ 143 | 10.52.12.4/32 \ 144 | ::1/128 \ 145 | fd12:aba6:57db:ffff::1/128 \ 146 | fd12:aba6:57db:ffff::2/128) 147 | 148 | for ip_cidr in ${to_be_configured[@]}; do 149 | if ! found "${ip_cidr}" "${loopback_ips[@]}"; then 150 | sudo /sbin/ip addr add "${ip_cidr}" brd "${ip_cidr%%/*}" dev lo scope host && echo "Added ${ip_cidr} to loopback interface" 151 | fi 152 | done 153 | 154 | for ip_cidr in $(get_ips) ; do 155 | if ! found "${ip_cidr}" "${to_be_configured[@]}"; then 156 | sudo /sbin/ip addr del "${ip_cidr}" dev lo && echo "Removed ${ip_cidr} from loopback interface" 157 | fi 158 | done 159 | 160 | echo "------------------------------------------" 161 | echo "---------------bird status----------------" 162 | echo "------------------------------------------" 163 | BIRD_PROGRAMS=(bird bird6) 164 | for bird_daemon in ${BIRD_PROGRAMS[@]}; do 165 | bird_pid=$(pgrep -x "${bird_daemon}") 166 | if [ ! -z "${bird_pid}" ]; then 167 | echo "${bird_daemon} seems to be running pid:${bird_pid}" 168 | else 169 | echo "${bird_daemon} is down" 170 | fi 171 | done 172 | 173 | version=$("${PROGRAM}" -v) 174 | echo "------------------------------------------" 175 | echo "---------starting program-----------------" 176 | echo "------------------------------------------" 177 | pgrep -F "${PIDIFILE}" >/dev/null 2>&1 178 | if [ $? -eq 0 ]; then 179 | echo "Process $(cat "${PIDIFILE}") already running, killing it.." 
def get_processid(config):
    """Return process id of anycast-healthchecker.

    Arguments:
        config (obj): A configparser object with the configuration of
        anycast-healthchecker.

    Returns:
        The process id (int) found in the pid file.

    Raises:
        ValueError in the following cases
        - pidfile option is missing from the configuration
        - pid is lower than or equal to 1
        - stale pidfile, either with no data or invalid data
        - failure to read pidfile

    Note: when the pid file doesn't exist a CRITICAL message is printed and
    the program exits with status 2.

    """
    pidfile = config.get('daemon', 'pidfile', fallback=None)
    if pidfile is None:
        raise ValueError("Configuration doesn't have pidfile option!")

    try:
        with open(pidfile, 'r') as _file:
            data = _file.read().rstrip()
    except FileNotFoundError:
        print("CRITICAL: anycast-healthchecker could be down as pid file "
              "{} doesn't exist".format(pidfile))
        sys.exit(2)
    except OSError as exc:
        raise ValueError(
            "error while reading pid file:{}".format(exc)) from exc

    try:
        pid = int(data)
    except ValueError:
        raise ValueError(
            "stale pid file with invalid data:{}".format(data)) from None

    # From kill(2): pid 0 and negative pids address process groups (or every
    # process) rather than a single process, and pid 1 is init. None of them
    # can be the daemon, and probing them would wrongly report it as running.
    if pid <= 1:
        raise ValueError("invalid PID ({})".format(pid))

    return pid


def running(pid):
    """Check the validity of a process ID.

    Arguments:
        pid (int): Process ID number.

    Returns:
        True if process ID is found otherwise False. A process which exists
        but belongs to another user is reported as found.

    """
    try:
        # From kill(2)
        # If sig is 0 (the null signal), error checking is performed but no
        # signal is actually sent. The null signal can be used to check the
        # validity of pid
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists but we aren't allowed to signal it, which
        # happens when we don't run under the same user as the daemon.
        return True
    except OSError:
        return False

    return True
98 | services (list): A list of section names which holds configuration 99 | for each service check 100 | 101 | Returns: 102 | A number (int) of enabled service checks. 103 | 104 | """ 105 | enabled = 0 106 | for service in services: 107 | check_disabled = config.getboolean(service, 'check_disabled') 108 | if not check_disabled: 109 | enabled += 1 110 | 111 | return enabled 112 | 113 | 114 | def main(): 115 | """Run check. 116 | 117 | anycast-healthchecker is a multi-threaded software and for each 118 | service check it holds a thread. If a thread dies then the service 119 | is not monitored anymore and the route for the IP associated with service 120 | it wont be withdrawn in case service goes down in the meantime. 121 | """ 122 | arguments = docopt(__doc__) 123 | config_file = '/etc/anycast-healthchecker.conf' 124 | config_dir = '/etc/anycast-healthchecker.d' 125 | config = configparser.ConfigParser() 126 | config_files = [config_file] 127 | config_files.extend(glob.glob(os.path.join(config_dir, '*.conf'))) 128 | config.read(config_files) 129 | 130 | try: 131 | pid = get_processid(config) 132 | except ValueError as exc: 133 | print("UNKNOWN: {e}".format(e=exc)) 134 | sys.exit(3) 135 | else: 136 | process_up = running(pid) 137 | 138 | if not process_up: 139 | print("CRITICAL: anycast-healthchecker with pid ({p}) isn't running" 140 | .format(p=pid)) 141 | sys.exit(3) 142 | 143 | services = config.sections() 144 | services.remove('daemon') 145 | if not services: 146 | print("UNKNOWN: No service checks are configured") 147 | sys.exit(3) 148 | 149 | enabled_service_checks = parse_services(config, services) 150 | if enabled_service_checks == 0: 151 | print("OK: Number of service checks is zero, no threads are running") 152 | sys.exit(0) 153 | else: 154 | # parent process plus nummber of threads for each service check 155 | configured_threads = enabled_service_checks + 1 156 | 157 | cmd = ['/bin/ps', 'h', '-T', '-p', '{n}'.format(n=pid)] 158 | try: 159 | if 
arguments['-v']: 160 | print("running {}".format(' '.join(cmd))) 161 | out = subprocess.check_output(cmd, timeout=1) 162 | except subprocess.CalledProcessError as exc: 163 | print("UNKNOWN: running '{c}' failed with return code: {r}" 164 | .format(c=' '.join(cmd), r=exc.returncode)) 165 | sys.exit(3) 166 | except subprocess.TimeoutExpired: 167 | print("UNKNOWN: running '{}' timed out".format(' '.join(cmd))) 168 | sys.exit(3) 169 | else: 170 | output_lines = out.splitlines() 171 | if arguments['-v']: 172 | for line in output_lines: 173 | print(line) 174 | running_threads = len(output_lines) 175 | if running_threads == configured_threads: 176 | print("OK: UP (pid={p}) and all threads ({t}) are running" 177 | .format(p=pid, t=configured_threads - 1)) 178 | sys.exit(0) 179 | elif running_threads - 1 == 0: # minus parent process 180 | print("CRITICAL: No threads are running OpDocs ANYCAST-03") 181 | sys.exit(2) 182 | else: 183 | print("CRITICAL: Found {n} running threads while configured " 184 | "number of threads is {c} OpDocs ANYCAST-03" 185 | .format(n=running_threads - 1, c=configured_threads - 1)) 186 | sys.exit(2) 187 | 188 | 189 | # This is the standard boilerplate that calls the main() function. 190 | if __name__ == '__main__': 191 | main() 192 | -------------------------------------------------------------------------------- /contrib/puppet/anycast_healthchecker/manifests/init.pp: -------------------------------------------------------------------------------- 1 | # == Class: anycast_healthchecker 2 | # 3 | # Installs, configures and manages anycast-healthchecker daemon 4 | # 5 | # === Parameters 6 | # 7 | # Document parameters here. 8 | # 9 | # [*bird_conf*] 10 | # File with the list of IPv4 prefixes allowed to be exported. If this file is 11 | # a symbolic link then the destination and the link itself must be on the same 12 | # mounted filesystem. 13 | # 14 | # [*bird6_conf*] 15 | # File with the list of IPv6 prefixes allowed to be exported.
If this file is 16 | # a symbolic link then the destination and the link itself must be on the same 17 | # mounted filesystem. 18 | # 19 | # [*bird_variable*] 20 | # The name of the list defined in ``bird_conf`` 21 | # 22 | # [*bird6_variable*] 23 | # The name of the list defined in ``bird6_conf`` 24 | # 25 | # [*bird_reconfigure_cmd*] 26 | # Command to trigger a reconfiguration of IPv4 Bird daemon 27 | # 28 | # [*bird6_reconfigure_cmd*] 29 | # Command to trigger a reconfiguration of IPv6 Bird daemon 30 | # 31 | # [*bird_keep_changes*] 32 | # Keep a history of changes for ``bird_conf`` file by copying it to a directory. 33 | # During the startup of the daemon a directory with the name ``history`` is 34 | # created under the directory where ``bird_conf`` file resides. The daemon has to 35 | # have sufficient privileges to create that directory. 36 | # 37 | # [*bird6_keep_changes*] 38 | # Keep a history of changes for ``bird6_conf`` file by copying it to a directory. 39 | # During the startup of the daemon a directory with the name ``history`` is 40 | # created under the directory where ``bird6_conf`` file resides. The daemon has to 41 | # have sufficient privileges to create that directory. 42 | # WARNING: If keep changes is enabled for both IP protocols then the 43 | # ``bird_conf`` and ``bird6_conf`` **must** point to files which are stored in 44 | # two different directories. 45 | # 46 | # [*bird_changes_counter*] 47 | # How many ``bird_conf`` files to keep in the ``history`` directory. 48 | # 49 | # [*bird6_changes_counter*] 50 | # How many ``bird6_conf`` files to keep in the ``history`` directory. 51 | # 52 | # [*configuration_dir*] 53 | # Read settings for service checks from files under this directory 54 | # 55 | # [*configuration_file*] 56 | # Read settings for the daemon from this file 57 | # 58 | # [*dummy_ip_prefix*] 59 | # An IP prefix in the form IP/PREFIX_LENGTH which will always be available in 60 | # the list defined by ``bird_variable`` to avoid having an empty list.
61 | # The ``dummy_ip_prefix`` **must not** be used by any service or assigned to the 62 | # interface set with ``interface`` or configured anywhere on the network as 63 | # anycast-healthchecker **does not** perform any checks for it. 64 | # 65 | # [*dummy_ip6_prefix*] 66 | # An IPv6 prefix in the form IP/PREFIX_LENGTH which will always be 67 | # available in the list defined by ``bird6_variable`` to avoid having an empty 68 | # list. The ``dummy_ip6_prefix`` **must not** be used by any service or assigned 69 | # to the interface set with ``interface`` or configured anywhere on the network as 70 | # anycast-healthchecker **does not** perform any checks for it. 71 | # 72 | # [*group*] 73 | # Set the UNIX group that anycast-healthchecker is executed as. 74 | # WARNING: Group must exist in the system. 75 | # 76 | # [*http_server*] 77 | # Server name to send JSON logging to over HTTP protocol. 78 | # 79 | # [*http_server_port*] 80 | # Port to connect to 81 | # 82 | # [*http_server_protocol*] 83 | # HTTP protocol to use, either ``http`` or ``https`` 84 | # 85 | # [*http_server_timeout*] 86 | # How long to wait for the server to send data before giving up, as a float number. 87 | # JSON messages are sent using HTTP POST requests which are executed in blocking 88 | # mode, which means that possible long delays will cause the health checks to be 89 | # delayed as well. 90 | # ``http_server_timeout`` accepts floating point numbers as values which are 91 | # passed to the underlying request module as a single timeout which will be applied 92 | # to both the connect and the read timeouts. 93 | # 94 | # [*ipv4*] 95 | # Enable IPv4 support 96 | # 97 | # [*ipv6*] 98 | # Enable IPv6 support 99 | # 100 | # [*json_logging*] 101 | # ``true`` enables JSON logging, ``false`` disables it. 102 | # 103 | # [*user*] 104 | # Set the UNIX user that anycast-healthchecker is executed as. 105 | # WARNING: User must exist in the system.
106 | # 107 | # === Examples 108 | # 109 | # $user = 'healthchecker' 110 | # $group = 'healthchecker' 111 | # $bird_variable = 'ACAST_PS_ADVERTISE' 112 | # $bird6_variable = 'ACAST_PS_ADVERTISE_IPV6' 113 | # realize ( Group[$group] ) 114 | # realize ( User[$user] ) 115 | # class { 'anycast_healthchecker': 116 | # package_version => '0.7.0-1.el7', 117 | # bird_conf => '/etc/bird.d/4/anycast-prefixes.conf', 118 | # bird6_conf => '/etc/bird.d/6/anycast-prefixes.conf', 119 | # bird_variable => $bird_variable, 120 | # bird6_variable => $bird6_variable, 121 | # user => $user, 122 | # group => $group, 123 | # json_logging => true, 124 | # ipv6 => true, 125 | # var_lib_dir => '/var/lib/anycast-healthchecker', 126 | # var_lib_dir6 => '/var/lib/anycast-healthchecker/6', 127 | # require => [ 128 | # User[$user], 129 | # Group[$group], 130 | # ], 131 | # } 132 | # ::bird2::config::variable{ 133 | # $bird_variable: 134 | # scope => 'ipv4', 135 | # replace => false, 136 | # file_name => 'anycast-prefixes', 137 | # value => [ '10.189.200.255/32', ]; 138 | # } 139 | # ::bird2::config::variable{ 140 | # $bird6_variable: 141 | # scope => 'ipv6', 142 | # replace => false, 143 | # file_name => 'anycast-prefixes', 144 | # value => [ '2001:db8::1/128', ]; 145 | # } 146 | # } 147 | # 148 | # 149 | # === Authors 150 | # 151 | # Pavlos Parissis 152 | # 153 | # === Copyright 154 | # 155 | # Copyright 2016 Pavlos Parissis, unless otherwise noted.
156 | # 157 | class anycast_healthchecker ( 158 | $bird_conf = $::anycast_healthchecker::params::bird_conf, 159 | $bird6_conf = $::anycast_healthchecker::params::bird6_conf, 160 | $bird_variable = $::anycast_healthchecker::params::bird_variable, 161 | $bird6_variable = $::anycast_healthchecker::params::bird6_variable, 162 | $bird_reconfigure_cmd = $::anycast_healthchecker::params::bird_reconfigure_cmd, 163 | $bird6_reconfigure_cmd = $::anycast_healthchecker::params::bird6_reconfigure_cmd, 164 | $bird_keep_changes = $::anycast_healthchecker::params::bird_keep_changes, 165 | $bird6_keep_changes = $::anycast_healthchecker::params::bird6_keep_changes, 166 | $bird_changes_counter = $::anycast_healthchecker::params::bird_changes_counter, 167 | $bird6_changes_counter = $::anycast_healthchecker::params::bird6_changes_counter, 168 | $configuration_dir = $::anycast_healthchecker::params::configuration_dir, 169 | $configuration_file = $::anycast_healthchecker::params::configuration_file, 170 | $dummy_ip_prefix = $::anycast_healthchecker::params::dummy_ip_prefix, 171 | $dummy_ip6_prefix = $::anycast_healthchecker::params::dummy_ip6_prefix, 172 | $group = $::anycast_healthchecker::params::group, 173 | $http_server = $::anycast_healthchecker::params::http_server, 174 | $http_server_port = $::anycast_healthchecker::params::http_server_port, 175 | $http_server_protocol = $::anycast_healthchecker::params::http_server_protocol, 176 | $http_server_timeout = $::anycast_healthchecker::params::http_server_timeout, 177 | $ipv4 = $::anycast_healthchecker::params::ipv4, 178 | $ipv6 = $::anycast_healthchecker::params::ipv6, 179 | $json_logging = $::anycast_healthchecker::params::json_logging, 180 | $log_level = $::anycast_healthchecker::params::log_level, 181 | $log_maxbytes = $::anycast_healthchecker::params::log_maxbytes, 182 | $log_backups = $::anycast_healthchecker::params::log_backups, 183 | $log_dir = $::anycast_healthchecker::params::log_dir, 184 | $log_file = 
$::anycast_healthchecker::params::log_file, 185 | $motd_ensure = $::anycast_healthchecker::params::motd_ensure, 186 | $package_name = $::anycast_healthchecker::params::package_name, 187 | $package_version = $::anycast_healthchecker::params::package_version, 188 | $pid_dir = $::anycast_healthchecker::params::pid_dir, 189 | $pidfile = $::anycast_healthchecker::params::pidfile, 190 | $purge_directory = $::anycast_healthchecker::params::purge_directory, 191 | $purge_ip_prefixes = $::anycast_healthchecker::params::purge_ip_prefixes, 192 | $service_enable = $::anycast_healthchecker::params::service_enable, 193 | $service_ensure = $::anycast_healthchecker::params::service_ensure, 194 | $service_name = $::anycast_healthchecker::params::service_name, 195 | $stderr_file = $::anycast_healthchecker::params::stderr_file, 196 | $stdout_file = $::anycast_healthchecker::params::stdout_file, 197 | $user = $::anycast_healthchecker::params::user, 198 | $var_lib_dir = $::anycast_healthchecker::params::var_lib_dir, 199 | $var_lib_dir6 = $::anycast_healthchecker::params::var_lib_dir6, 200 | ) inherits anycast_healthchecker::params { 201 | 202 | 203 | contain '::anycast_healthchecker::install' 204 | contain '::anycast_healthchecker::config' 205 | contain '::anycast_healthchecker::service' 206 | contain '::anycast_healthchecker::sudo_access' 207 | contain '::anycast_healthchecker::motd' 208 | 209 | Class['::anycast_healthchecker::install'] ~> 210 | Class['::anycast_healthchecker::config'] ~> 211 | Class['::anycast_healthchecker::service'] -> 212 | Class['::anycast_healthchecker::sudo_access'] -> 213 | Class['::anycast_healthchecker::motd'] 214 | 215 | } 216 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 
| 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /anycast_healthchecker/healthchecker.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=too-few-public-methods 2 | 3 | """A library which provides the HealthChecker class.""" 4 | import logging 5 | import os 6 | import sys 7 | import threading 8 | from configparser import NoOptionError 9 | from queue import Queue 10 | 11 | from prometheus_client import CollectorRegistry, Counter, Gauge 12 | 13 | from anycast_healthchecker import METRIC_PREFIX, PROGRAM_NAME 14 | from anycast_healthchecker.servicecheck import ServiceCheck 15 | from anycast_healthchecker.utils import ( 16 | SERVICE_OPTIONS_TYPE, 17 | MainExporter, 18 | ServiceCheckDiedError, 19 | archive_bird_conf, 20 | get_ip_prefixes_from_bird, 21 | get_ip_prefixes_from_config, 22 | reconfigure_bird, 23 | run_custom_bird_reconfigure, 24 | write_temp_bird_conf, 25 | ) 26 | 27 | 28 | class HealthChecker: 29 | """Launch threads for each service check and reconfigure BIRD daemon. 30 | 31 | It starts a thread for each service check we have in the configuration and 32 | then waits for reconfiguring Bird daemon based on the results of the 33 | service checks. 34 | 35 | It uses a queue as a way to communicate with all threads. Each thread will 36 | add an item in the queue, which contains the IP prefix to remove from or to 37 | add to BIRD configuration file. When an item is added we pick it up, adjust 38 | BIRD configuration and then reload BIRD. 39 | 40 | This class should be instantiated once. 41 | 42 | Arguments: 43 | config (configparser obj): A configparser object with the configuration 44 | bird_configuration (dict): A dictionary with Bird settings. 45 | 46 | Methods: 47 | run(): Launches checks and updates BIRD configuration based on 48 | the result of the check.
49 | 50 | """ 51 | 52 | def __init__(self, config, bird_configuration): 53 | """Initialization.""" 54 | self.log = logging.getLogger(PROGRAM_NAME) 55 | self.config = config 56 | # A queue with IP prefixes and their action to be taken based on the 57 | # state of health check. An item is a tuple of 3 elements: 58 | # 1st: name of the thread. 59 | # 2nd: IP prefix. 60 | # 3nd: IP version, either '4' or '6'. 61 | self.action = Queue() 62 | self.bird_configuration = bird_configuration 63 | self.log.debug(self.bird_configuration) 64 | 65 | # A list of service checks 66 | self.services = config.sections() 67 | self.services.remove('daemon') 68 | 69 | # Holds IP prefixes per IP version for which we have a service check 70 | self.ip_prefixes = {} 71 | for ip_version in self.bird_configuration: 72 | _ip_prefixes = get_ip_prefixes_from_config( 73 | self.config, 74 | self.services, 75 | ip_version) 76 | 77 | _ip_prefixes.add( 78 | self.bird_configuration[ip_version]['dummy_ip_prefix'] 79 | ) 80 | self.ip_prefixes[ip_version] = _ip_prefixes 81 | 82 | self._urgent_event = threading.Event() 83 | 84 | self.log.info('initialize healthchecker') 85 | 86 | def _update_bird_conf_file(self, operation): 87 | """Update BIRD configuration. 88 | 89 | It adds to or removes IP prefix from BIRD configuration. It also 90 | updates generation time stamp in the configuration file. 91 | 92 | Main program will exit if configuration file cant be read/written. 93 | 94 | Arguments: 95 | operation (obj): Either an AddOperation or DeleteOperation object 96 | 97 | Returns: 98 | True if BIRD configuration was updated otherwise False. 
99 | 100 | """ 101 | conf_updated = False 102 | prefixes = [] 103 | ip_version = operation.ip_version 104 | config_file = self.bird_configuration[ip_version]['config_file'] 105 | variable_name = self.bird_configuration[ip_version]['variable_name'] 106 | changes_counter =\ 107 | self.bird_configuration[ip_version]['changes_counter'] 108 | dummy_ip_prefix =\ 109 | self.bird_configuration[ip_version]['dummy_ip_prefix'] 110 | 111 | try: 112 | prefixes = get_ip_prefixes_from_bird(config_file) 113 | except OSError as error: 114 | self.log.error("failed to open Bird configuration %s, this is a " 115 | "FATAL error, thus exiting main program", error) 116 | sys.exit(1) 117 | 118 | if not prefixes: 119 | self.log.error("found empty bird configuration %s, this is a FATAL" 120 | " error, thus exiting main program", config_file) 121 | sys.exit(1) 122 | 123 | if dummy_ip_prefix not in prefixes: 124 | self.log.warning("dummy IP prefix %s wasn't found in bird " 125 | "configuration, adding it. This shouldn't have " 126 | "happened!", dummy_ip_prefix) 127 | prefixes.insert(0, dummy_ip_prefix) 128 | conf_updated = True 129 | 130 | ip_prefixes_without_check = set(prefixes).difference( 131 | self.ip_prefixes[ip_version]) 132 | if ip_prefixes_without_check: 133 | self.log.warning("found %s IP prefixes in Bird configuration but " 134 | "we aren't configured to run health checks on " 135 | "them. Either someone modified the configuration " 136 | "manually or something went horrible wrong. We " 137 | "remove them from Bird configuration", 138 | ','.join(ip_prefixes_without_check)) 139 | # This is faster than using lambda and filter. 140 | # NOTE: We don't use remove method as we want to remove more than 141 | # occurrences of the IP prefixes without check. 142 | prefixes[:] = (ip for ip in prefixes 143 | if ip not in ip_prefixes_without_check) 144 | conf_updated = True 145 | 146 | # Update the list of IP prefixes based on the status of health check. 
147 | if operation.update(prefixes): 148 | conf_updated = True 149 | 150 | if not conf_updated: 151 | self.log.info('no updates for bird configuration') 152 | return conf_updated 153 | 154 | if self.bird_configuration[ip_version]['keep_changes']: 155 | archive_bird_conf(config_file, changes_counter) 156 | 157 | # some IP prefixes are either removed or added, create 158 | # configuration with new data. 159 | tempname = write_temp_bird_conf( 160 | dummy_ip_prefix, 161 | config_file, 162 | variable_name, 163 | prefixes 164 | ) 165 | try: 166 | os.rename(tempname, config_file) 167 | except OSError as error: 168 | self.log.critical("failed to create Bird configuration %s, this " 169 | "is a FATAL error, thus exiting main program", 170 | error) 171 | sys.exit(1) 172 | else: 173 | self.log.info("Bird configuration for IPv%s is updated", 174 | ip_version) 175 | 176 | # dummy_ip_prefix is always there 177 | if len(prefixes) == 1: 178 | self.log.warning("Bird configuration doesn't have IP prefixes for " 179 | "any of the services we monitor! 
def run(self):
    """Start every service check thread, then apply their requests to BIRD.

    Four Prometheus metrics are created on a private registry and handed to
    each ``ServiceCheck`` thread.  The exporter thread is started only when
    the ``prometheus_exporter`` option of the ``daemon`` section is true.

    Once the threads are running this method blocks forever on the action
    queue.  Every operation taken from the queue is written to the BIRD
    configuration and, when that changed the file, BIRD is reconfigured
    either with the per-IP-version command or with the custom command
    carried by the operation.  A ``ServiceCheckDiedError`` taken from the
    queue is fatal: it is logged and the process exits with code 1.
    """
    registry = CollectorRegistry()

    def new_metric(kind, name, documentation):
        # All metrics share the namespace, the label names and the registry.
        return kind(
            name=name,
            documentation=documentation,
            labelnames=['service_name', 'ip_prefix'],
            namespace=f"{METRIC_PREFIX}",
            registry=registry,
        )

    metric_state = new_metric(
        Gauge,
        "service_state",
        'The status of the service check: 0 = healthy, any other value = unhealthy',
    )
    metric_check_duration = new_metric(
        Gauge,
        'service_check_duration_milliseconds',
        'Service check duration in milliseconds',
    )
    metric_check_ip_assignment = new_metric(
        Gauge,
        'service_check_ip_assignment',
        'Service IP assignment check: 0 = not assigned, 1 = assigned',
    )
    metric_check_timeout = new_metric(
        Counter,
        'service_check_timeout',
        'The number of times a service check timed out',
    )

    if self.config.getboolean('daemon', 'prometheus_exporter'):
        MainExporter(
            registry=registry,
            services=self.services,
            config=self.config,
        ).start()

    if self.services:
        self.log.info("going to launch %s threads", len(self.services))
        splay_startup = (
            self.config.getfloat('daemon', 'splay_startup')
            if self.config.has_option('daemon', 'splay_startup')
            else None
        )
        # Arguments which are the same for every service check thread.
        shared_args = (
            self.action,
            splay_startup,
            metric_state,
            metric_check_duration,
            metric_check_ip_assignment,
            metric_check_timeout,
            self._urgent_event,
        )

        # One thread per configured service.
        for service in self.services:
            self.log.debug("launching thread for %s", service)
            service_config = {}
            for option, getter in SERVICE_OPTIONS_TYPE.items():
                try:
                    value = getattr(self.config, getter)(service, option)
                except NoOptionError:
                    continue  # optional setting which isn't configured
                service_config[option] = value

            ServiceCheck(service, service_config, *shared_args).start()
    else:
        self.log.warning("no service checks are configured")

    # Serve the action queue until the process is stopped.
    while True:
        operation = self.action.get(block=True)

        if isinstance(operation, ServiceCheckDiedError):
            self.log.critical(operation)
            self.log.critical("This is a fatal error and the only way to "
                              "recover is to restart, thus exiting with a "
                              "non-zero code and let systemd act by "
                              "triggering a restart")
            sys.exit(1)

        self.log.info("returned an item from the queue for %s with IP "
                      "prefix %s and action to %s Bird configuration",
                      operation.name,
                      operation.ip_prefix,
                      operation)
        bird_updated = self._update_bird_conf_file(operation)
        self.action.task_done()
        if not bird_updated:
            # Configuration file is unchanged, BIRD needs no reload.
            continue

        if operation.bird_reconfigure_cmd is not None:
            run_custom_bird_reconfigure(operation)
        else:
            bird_settings = self.bird_configuration[operation.ip_version]
            reconfigure_bird(bird_settings['reconfigure_cmd'])
class ServiceCheck(Thread):
    """Handle the health checking for a service.

    The thread periodically verifies that the IP prefix of the service is
    assigned to the configured interface and runs the check command. Based
    on the outcome it places an add or a delete operation in the action
    queue.

    Arguments:
        service (str): The name of the service to monitor.
        config (dict): A dictionary with the configuration of the service.
        action (Queue obj): A queue object to place actions based on the result
            of the health check.
        splay_startup: (float): The maximum time to delay the startup, or
            None for no delay.
        metric_state: Metric on which ``labels(...).set()`` is called with
            the state of the service check.
        metric_check_duration: Metric on which ``labels(...).set()`` is
            called with the duration of the check in milliseconds.
        metric_check_ip_assignment: Metric on which ``labels(...).set()`` is
            called with 1 when the IP prefix is assigned and 0 otherwise.
        metric_check_timeout: Metric on which ``labels(...).inc()`` is called
            every time the check times out.
        urgent_event: An object with a ``wait(timeout=...)`` method, used to
            sleep between two checks.

    Methods:
        run(): Run method of the thread.

    """

    # Seconds to wait for a killed check command to exit, so that it can be
    # reaped without blocking the check loop for long.
    _REAP_TIMEOUT = 1

    def __init__(self, service, config, action, splay_startup, metric_state,
                 metric_check_duration, metric_check_ip_assignment,
                 metric_check_timeout, urgent_event):
        """Set the name of thread to be the name of the service."""
        super().__init__()
        self.name = service  # Used by Thread()
        self.daemon = True  # Used by Thread()
        self.config = config
        self.action = action
        self.splay_startup = splay_startup
        # sanity check has already been done, so the following *should* not
        # raise an exception
        _ip_prefix = ipaddress.ip_network(self.config['ip_prefix'])
        # NOTE: When subnetmask isn't provided ipaddress module creates an
        # object with a mask of /32 for IPv4 addresses and mask of /128 for
        # IPv6 addresses. As a result the prefix length is either 32 or 128
        # and we can get the IP address by looking at the network_address
        # attribute.
        self.ip_address = str(_ip_prefix.network_address)
        self.prefix_length = _ip_prefix.prefixlen
        self.ip_with_prefixlen = _ip_prefix.with_prefixlen
        self.ip_version = _ip_prefix.version
        self.ip_check_disabled = self.config['ip_check_disabled']
        self.log = logging.getLogger(PROGRAM_NAME)
        # Extra fields attached to most of the log records of this check.
        self.extra = {
            'ip_address': self.ip_address,
            'prefix_length': self.prefix_length,
            'ip_check_disabled': self.ip_check_disabled,
            'status': 'unknown',
        }
        # Both operations are built once and reused for the whole life of
        # the thread.
        self.add_operation = AddOperation(
            name=self.name,
            ip_prefix=self.ip_with_prefixlen,
            ip_version=self.ip_version,
            bird_reconfigure_timeout=(
                config['custom_bird_reconfigure_cmd_timeout']
            ),
            bird_reconfigure_cmd=config.get('custom_bird_reconfigure_cmd',
                                            None)
        )
        self.del_operation = DeleteOperation(
            name=self.name,
            ip_prefix=self.ip_with_prefixlen,
            ip_version=self.ip_version,
            bird_reconfigure_timeout=(
                config['custom_bird_reconfigure_cmd_timeout']
            ),
            bird_reconfigure_cmd=config.get('custom_bird_reconfigure_cmd',
                                            None)
        )
        self.urgent_event = urgent_event
        self.log.info("loading check for %s", self.name, extra=self.extra)

        self.metric_state = metric_state
        self.metric_check_duration = metric_check_duration
        self.metric_check_ip_assignment = metric_check_ip_assignment
        self.metric_check_timeout = metric_check_timeout
        # Label values used for every metric of this service check.
        self.labels = {
            "service_name": self.name,
            "ip_prefix": self.ip_with_prefixlen
        }

    def _run_check(self):
        """Execute a check command.

        The command is killed when it doesn't finish within 'check_timeout'
        seconds.

        Returns:
            The exit code of the command, or 126 when the command timed out.

        """
        cmd = shlex.split(self.config['check_cmd'])
        self.log.info("running %s", ' '.join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)

        start_time = time.time()
        try:
            outs, errs = proc.communicate(timeout=self.config['check_timeout'])
        except subprocess.TimeoutExpired:
            self.log.error("check timed out")
            self.metric_check_timeout.labels(**self.labels).inc()
            self._stop_timed_out_check(proc)

            return 126

        duration = (time.time() - start_time) * 1000
        self.log.info("check duration %.3fms", duration)
        self.metric_check_duration.labels(**self.labels).set(duration)

        if proc.returncode != 0:
            # Output of the check is only interesting when the check fails.
            if errs:
                self.log.info("stderr from the check %s", errs)
            if outs:
                self.log.info("stdout from the check %s", outs)

        return proc.returncode

    def _stop_timed_out_check(self, proc):
        """Kill a check command which timed out and release its resources.

        communicate() doesn't kill the child when the timeout expires, so the
        child is killed here and then waited for, otherwise it stays around
        as a zombie with its pipes open.

        Arguments:
            proc (Popen obj): The process of the check command.

        """
        if proc.poll() is None:
            try:
                proc.kill()
            except PermissionError:
                self.log.warning("failed to kill check due to inadequate "
                                 "access rights, check could be running "
                                 "under another user(root) via sudo")
                # The check is still running, leave its pipes alone.
                return

            try:
                proc.wait(timeout=self._REAP_TIMEOUT)
            except subprocess.TimeoutExpired:
                self.log.warning("check didn't exit within %ssecs after "
                                 "it was killed", self._REAP_TIMEOUT)
                return

        # The process has exited and is reaped, nothing will read its output.
        for stream in (proc.stdout, proc.stderr):
            if stream is not None:
                stream.close()

    def _ip_assigned(self):
        """Check if IP prefix is assigned to the interface.

        Any failure to run the ip tool is treated as if the IP prefix is
        assigned. The outcome is also stored in the IP assignment metric,
        unless the check for IP assignment is disabled.

        Returns:
            True if IP prefix found assigned otherwise False.

        """
        if self.ip_check_disabled:
            self.log.info("checking for IP assignment on interface %s is "
                          "disabled", self.config['interface'])
            return True

        cmd = [
            '/sbin/ip',
            'address',
            'show',
            'dev',
            self.config['interface'],
            'to',
            self.ip_with_prefixlen,
        ]

        self.log.debug("running %s", ' '.join(cmd))
        try:
            output = subprocess.check_output(
                cmd,
                universal_newlines=True,
                timeout=1)
        except subprocess.CalledProcessError as error:
            self.log.error("error checking IP-PREFIX %s: %s",
                           cmd, error.output)
            # Because it is unlikely to ever get an error we return True
            result = True
        except subprocess.TimeoutExpired:
            self.log.error("timeout running %s", ' '.join(cmd))
            # Because it is unlikely to ever get a timeout we return True
            result = True
        except ValueError as error:
            # We have been getting intermittent ValueErrors, see here
            # gist.github.com/unixsurfer/67db620d87f667423f6f6e3a04e0bff5
            # It has happened ~5 times and this code is executed from multiple
            # threads and every ~10secs on several (~40) production servers for
            # more than 18months.
            # It could be a bug in Python or system returns corrupted data.
            # As a consequence of the raised exception thread dies and the
            # service isn't monitored anymore!. So, we now catch the exception.
            # While checking if an IP is assigned, we get an error unrelated to
            # that prevents us from knowing if it's assigned. We simply don't
            # know. A retry logic could be a more proper solution.
            self.log.error("running %s raised ValueError exception:%s",
                           ' '.join(cmd), error)
            result = True
        else:
            result = self.ip_with_prefixlen in output  # pylint: disable=E1135
            if result:
                self.log.debug("%s assigned to %s interface",
                               self.ip_with_prefixlen,
                               self.config['interface'])
            else:
                self.log.warning("%s isn't assigned to %s interface",
                                 self.ip_with_prefixlen,
                                 self.config['interface'])

        self.metric_check_ip_assignment.labels(**self.labels).set(
            1 if result else 0
        )

        return result

    def _check_disabled(self):
        """Check if health check is disabled.

        It logs a message if health check is disabled and it also adds an item
        to the action queue based on 'on_disabled' setting.

        Returns:
            True if check is disabled otherwise False.

        """
        if not self.config['check_disabled']:
            return False

        if self.config['on_disabled'] == 'withdraw':
            self.log.info("Check is disabled and ip_prefix will be "
                          "withdrawn")
            self.log.info("adding %s in the queue", self.ip_with_prefixlen)
            self.action.put(self.del_operation)
            self.log.info("Check is now permanently disabled")
        elif self.config['on_disabled'] == 'advertise':
            self.log.info("check is disabled, ip_prefix wont be withdrawn")
            self.log.info("adding %s in the queue", self.ip_with_prefixlen)
            self.action.put(self.add_operation)
            self.log.info('check is now permanently disabled')

        return True

    def run(self):
        """Wrap _run method.

        Any exception raised by _run is reported to the parent through the
        action queue as a ServiceCheckDiedError, which carries the name of
        the service and the formatted traceback.
        """
        # Catch all possible exceptions raised by the running thread
        # and let parent process know about it.
        try:
            self._run()
        except Exception:  # pylint: disable=broad-except
            self.action.put(
                ServiceCheckDiedError(self.name, traceback.format_exc())
            )

    def _run(self):
        """Discovers the health of a service.

        Runs until it is being killed from main program and is responsible to
        put an item into the queue based on the status of the health check.
        The status of service is consider UP after a number of consecutive
        successful health checks, in that case it asks main program to add the
        IP prefix associated with service to BIRD configuration, otherwise ask
        for a removal.
        Rise and fail options prevent unnecessary configuration changes when
        check is flapping.
        """
        up_cnt = 0
        down_cnt = 0
        # The current established state of the service check, it can be
        # either UP or DOWN but only after a number of consecutive successful
        # or unsuccessful health checks.
        check_state = 'Unknown'

        for key, value in self.config.items():
            self.log.debug("%s=%s:%s", key, value, type(value))

        # Service check will abort if it is disabled.
        if self._check_disabled():
            return

        if self.splay_startup is not None:
            sleep_time = float("%.3f" % random.uniform(0, self.splay_startup))
            self.log.info("delaying startup for %ssecs", sleep_time)
            time.sleep(sleep_time)

        interval = self.config['check_interval']
        # Position of the first check within the interval; every next check
        # is scheduled at the same position to avoid drifting.
        start_offset = time.time() % interval
        # Go in a loop until we are told to stop
        while True:
            timestamp = time.time()
            if not self._ip_assigned():
                up_cnt = 0
                self.extra['status'] = 'down'
                self.log.warning("status DOWN because %s isn't assigned to "
                                 "%s interface.",
                                 self.ip_with_prefixlen,
                                 self.config['interface'],
                                 extra=self.extra)
                self.metric_state.labels(**self.labels).set(1)
                if check_state != 'DOWN':
                    check_state = 'DOWN'
                    self.log.info("adding %s in the queue",
                                  self.ip_with_prefixlen,
                                  extra=self.extra)
                    self.action.put(self.del_operation)
            else:
                check_status = self._run_check()
                if check_status == 0:
                    if up_cnt == (self.config['check_rise'] - 1):
                        self.extra['status'] = 'up'
                        self.metric_state.labels(**self.labels).set(
                            check_status
                        )
                        # Service exceeded all consecutive checks. Set its
                        # state accordingly and put an item in queue. But do
                        # it only if previous state was different, to prevent
                        # unnecessary bird reloads when a service flaps
                        # between states.
                        if check_state != 'UP':
                            check_state = 'UP'
                            self.log.info("changed to UP", extra=self.extra)
                            self.log.info("adding %s in the queue",
                                          self.ip_with_prefixlen,
                                          extra=self.extra)
                            self.action.put(self.add_operation)
                        else:
                            self.log.info("status UP", extra=self.extra)
                    elif up_cnt < self.config['check_rise']:
                        up_cnt += 1
                        self.log.info("going up %s/%s",
                                      up_cnt,
                                      self.config['check_rise'],
                                      extra=self.extra)
                    else:
                        self.log.error("up_cnt is higher %s, it's a BUG!",
                                       up_cnt,
                                       extra=self.extra)
                    # A successful check restarts the count of failures.
                    down_cnt = 0
                else:
                    if down_cnt == (self.config['check_fail'] - 1):
                        self.extra['status'] = 'down'
                        # Service exceeded all consecutive checks.
                        # Set its state accordingly and put an item in queue.
                        # But do it only if previous state was different, to
                        # prevent unnecessary bird reloads when a service flaps
                        # between states
                        self.metric_state.labels(**self.labels).set(
                            check_status
                        )
                        if check_state != 'DOWN':
                            check_state = 'DOWN'
                            self.log.info("changed to DOWN", extra=self.extra)
                            self.log.info("adding %s in the queue",
                                          self.ip_with_prefixlen,
                                          extra=self.extra)
                            self.action.put(self.del_operation)
                        else:
                            self.log.info("status DOWN", extra=self.extra)
                    elif down_cnt < self.config['check_fail']:
                        down_cnt += 1
                        self.log.info("going down %s/%s",
                                      down_cnt,
                                      self.config['check_fail'],
                                      extra=self.extra)
                    else:
                        self.log.error("down_cnt is higher %s, it's a BUG!",
                                       down_cnt,
                                       extra=self.extra)
                    # A failed check restarts the count of successes.
                    up_cnt = 0

            self.log.info("wall clock time %.3fms",
                          (time.time() - timestamp) * 1000,
                          extra=self.extra)

            # calculate sleep time
            sleep = start_offset - time.time() % interval
            if sleep < 0:
                sleep += interval
            self.log.debug("sleeping for %.3fsecs", sleep, extra=self.extra)
            # Sleeping on the event allows the wait to end before the timeout
            # when the event is set.
            self.urgent_event.wait(timeout=sleep)
-------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | CHANGES 2 | ======= 3 | 4 | 0.9.10 5 | ------ 6 | * RELEASE 0.9.10 version 7 | * Adding journalctl format handler 8 | 9 | 0.9.9 10 | ----- 11 | 12 | * RELEASE 0.9.9 version 13 | * Honor appropriately the value of ip\_check\_disabled (#51) 14 | * Convert several .format strings to f-strings (#48) 15 | * pylint workflow tweaks (#47) 16 | * Sort imports (#46) 17 | * Add prometheus-client to the requirements.txt file (#45) 18 | * Run all checks with a signal (#44) 19 | * Update pylint.yml 20 | * Create pylint.yml github workflow 21 | * Improve logging (#43) 22 | * Fix default settings in README (#42) 23 | * Update debian packaging (#41) 24 | * Update README 25 | 26 | 0.9.8 27 | ----- 28 | 29 | * RELEASE 0.9.8 version 30 | * Use a more meaningful value for uptime metric 31 | * Pass the exit code of check\_cmd to service\_state metric 32 | 33 | 0.9.7 34 | ----- 35 | 36 | * RELEASE 0.9.7 version 37 | * Add a metric to capture the return code of the check 38 | 39 | 0.9.6 40 | ----- 41 | 42 | * RELEASE 0.9.6 version 43 | * Use install\_requires instead of requires-dist 44 | * RELEASE 0.9.5 version 45 | * Restore setup.cfg and setup.py 46 | * Require setuptools 61+ for [project] metadata 47 | 48 | 0.9.4 49 | ----- 50 | 51 | * RELEASE 0.9.4 version 52 | * Migrating from setup.py to pyproject.toml 53 | * Fix a typo in the doc of Prometheus metric 54 | 55 | 0.9.3 56 | ----- 57 | 58 | * RELEASE 0.9.3 version 59 | * Adjust docstring of a method 60 | * Set thread name for the Prometheus exporter 61 | * Customize the interval that Prometheus exporters runs 62 | * Make sure the process shutdowns cleanly 63 | 64 | 0.9.2 65 | ----- 66 | 67 | * RELEASE 0.9.2 version 68 | * Remove references to loopback interface 69 | * Add support for exporting Prometheus metrics 70 | * Remove deprecated validate\_ functions 
from puppet check 71 | * Add an interface parameter to anycast\_healthchecker::check puppet type 72 | * Set an interface on each check.conf file in puppet 73 | * Use the network target on the Systemd unit file 74 | * Add docopt module to requirements.txt 75 | 76 | 0.9.1 77 | ----- 78 | 79 | * RELEASE 0.9.1 version 80 | * Update year in \_\_copyright\_\_ 81 | * Rephrase a sentence to eliminate the usege of black word 82 | * Typo fixes 83 | * Fix misleading docstring 84 | * Update Debian Changelog 85 | * Exit if configuration file passed with '-f' doesn't exist 86 | * Document what exception load\_configuration() raises 87 | 88 | 0.9.0 89 | ----- 90 | 91 | * RELEASE version 0.9.0 version 92 | * Update year in \_\_copyright\_\_ 93 | * Simplify conditional statement 94 | * Add support for custom Bird reconfigure per service 95 | * Ensure correct variable name in Bird configuration 96 | * Make docstring more clear 97 | * Simplify return code from a function 98 | * Add docopt in the requires-dist 99 | * More clear log message 100 | * Report information about the lock 101 | * Fix typo in nagios check 102 | * PEP257 compatible docstring for nagios check 103 | * PEP257 compatible docstring for nagios check 104 | * Log stack-trace when a thread dies 105 | * Change the log severity to critical 106 | * Call super in ServiceCheckDiedError 107 | * Rename exception ServiceCheckDied to ServiceCheckDiedError 108 | * Exit main program when a thread dies 109 | * Update Copyright 110 | * Update docstring/comment 111 | * Remove unnecessary coding config 112 | * Add Maximilian Wilhelm to AUTHORS 113 | * Switch to integer for some default values 114 | * Simplify loading default configuration 115 | * Add support for delaying the 1st check 116 | * Fix typo in README 117 | * Rephrase a sentence 118 | * Remove the dependencies to daemon and lockfile 119 | * Disable pylint warnings 120 | * Add configuration for building Debian packages 121 | * Remove unused method \_update\_status 122 | 123 | 
0.8.1 124 | ----- 125 | 126 | * RELEASE 0.8.1 version 127 | * Make the JSON formatted logs more compatible with syslog 128 | 129 | 0.8.0 130 | ----- 131 | 132 | * RELEASE 0.8.0 version 133 | * Fix a typo in README 134 | * Remove trailing whitespace character 135 | * Add Carlo Rengo in the contributers list 136 | * Grammar fixes to improve readability 137 | * Remove unnecessary sub-classing 138 | * Remove unnecessary pylint config 139 | * Dynamically set width in log formatter 140 | * Fix typo in comment 141 | * Print a more meaningful message 142 | * Catch the case where pid is higher than allowed 143 | * Rephrase some docstrings 144 | * Remove from local\_run.sh redundant settings 145 | * Refuse to start if one IP address is used by multiple service checks 146 | * MAJOR: Drop support for daemonization 147 | * Make docstrings compatible with EP257 148 | 149 | 0.7.4 150 | ----- 151 | 152 | * RELEASE 0.7.4 version 153 | * Update copyright 154 | * Make sure we return the prefix length 155 | * Add a 2nd IPv6 service check 156 | * Reorder functions in utils module 157 | * Refactor the way we check our bird settings 158 | * Move the check for a directory to sanity check function 159 | * Fail sanity check when config file and dir don't exist 160 | * Fix incorrect format for enumerated list in README 161 | * fix typo reqeusts->requests (issue #9) 162 | * Mention that IPv6 or v4 is disabled for our software 163 | * Fix type in on\_disabled setting 164 | * Add puppet module 165 | * Mention that parent directories for log and pifile files must be present 166 | 167 | 0.7.3 168 | ----- 169 | 170 | * RELEASE 0.7.3 version 171 | * Refuse to start when parent directory of pidfile is missing 172 | * Turn on bird6\_keep\_changes in example config 173 | * Make local.sh to create directories under var/lib 174 | * Remove the logic, which creates parent directories 175 | * Add Returns and Returns section in docstring 176 | * Update example configuration 177 | 178 | 0.7.2 179 | ----- 180 
| 181 | * RELEASE 0.7.2 version 182 | * Add empty line for proper parsing 183 | * Create anycast-prefixes.conf configuration 184 | * PEP257 compatible docstring 185 | * Use os.path.join to build the file path 186 | * Better handling of symbolic links for bird config 187 | * Make docstrings compatible with EP257 188 | * Add ipv4 setting in the configuration generated by local.sh 189 | * Tiny updates on README 190 | 191 | 0.7.1 192 | ----- 193 | 194 | * RELEASE 0.7.1 version 195 | * Update local\_run.sh with latest config 196 | * Don't handle bird configuration in local\_run.sh 197 | * Allow floating numbers for check\_[timeout,interval] 198 | * Instruct Systemd to restart daemon upon failure 199 | * Use standard path rather our custom one 200 | * Mention that prefix length is optional 201 | * Fetch the correct parameter name 202 | * Return boolean value instead of a string 203 | * Update example anycast-healthchecker.conf 204 | 205 | 0.7.0 206 | ----- 207 | 208 | * RELEASE 0.7.0 version 209 | * Drop 3.4 version of python for the local\_run 210 | * Rephrase usage of CLI 211 | * MAJOR: Add support for IPv6 212 | 213 | 0.6.3 214 | ----- 215 | 216 | * RELEASE 0.6.3 version 217 | * Add a note for bird\_conf setting 218 | * Keep a history of changes for bird configuration 219 | * Add support for specifying a single service check 220 | 221 | 0.6.2 222 | ----- 223 | 224 | * RELEASE 0.6.2 version 225 | * Remove die parameter from get\_ip\_prefixes\_from\_bird 226 | * Remove unnecessary call to rstrip function 227 | * Use ConfigParser method to fetch configuration 228 | * Don't exit when there aren't any service checks 229 | * Make the use of format consistent in multi-line 230 | * Add support for removing IP-Prefixes without a check 231 | * Move functions to utils module 232 | * Use double quotes silence error with bash 4.4 version 233 | 234 | 0.6.1 235 | ----- 236 | 237 | * RELEASE 0.6.1 version 238 | * Include IP address and prefix length in JSON blob 239 | 240 | 0.6.0 241 
| ----- 242 | 243 | * RELEASE 0.6.0 version 244 | * Rephrase sentences and fix typos in README 245 | * Restructure documentation about settings in README 246 | * Add option to disable the IP assignment check 247 | * Simplify sanity check on configuration 248 | * Log stderr and stdout of check cmd when it fails 249 | * Rephrase some comments 250 | * Add Documentation option in the Systemd unit file 251 | 252 | 0.5.10 253 | ------ 254 | 255 | * RELEASE 0.5.10 version 256 | * Set more accurate sleep time to avoid interval drifting 257 | * Add contrib dir and move files there 258 | 259 | 0.5.9 260 | ----- 261 | 262 | * RELEASE 0.5.9 version 263 | * Remove inaccurate note from README 264 | * Catch the case where check\_cmd can't be killed 265 | * Skip string formatting operation 266 | 267 | 0.5.8 268 | ----- 269 | 270 | * RELEASE 0.5.8 version 271 | * Put keywords in 1 line 272 | * Remove empty line 273 | * Work around intermittent ValueErrors 274 | * Add more keywords in setup.cfg 275 | 276 | 0.5.7 277 | ----- 278 | 279 | * RELEASE 0.5.7 version 280 | * Change the order of logging a message 281 | * Update bird config when IP isn't assigned to lo 282 | * Update installation instructions 283 | * Update setup.cfg 284 | * Make pylint happy 285 | 286 | 0.5.6 287 | ----- 288 | 289 | * RELEASE 0.5.6 version 290 | * Add 'status' key in the JSON blob 291 | * Fix line break before binary operator 292 | * Rewrite docstring of LoggerExt class 293 | * Another set of docstring updates 294 | * Update TODO 295 | * Update docstrings and comments 296 | * Use a more proper name for a variable 297 | * Remove unused reference from README 298 | * Update copyright 299 | 300 | 0.5.5 301 | ----- 302 | 303 | * RELEASE 0.5.5 version 304 | * Update TODO.rst 305 | * Enforce HTTP keep-alive on requests 306 | 307 | 0.5.4 308 | ----- 309 | 310 | * RELEASE 0.5.4 version 311 | * Fix an indentation issue 312 | * Fix a crash with python-daemon>=2.1 313 | 314 | 0.5.3 315 | ----- 316 | 317 | * RELEASE 
0.5.3 version 318 | * Use lowercase for the status when is in transition 319 | * Change log level when IP isn't assigned to lo 320 | * Do not send JSON blob for debug messages 321 | * Log only failures when sending JSON blobs 322 | * Reorder the message about zero IP prefixes 323 | * Include the process ID in the header 324 | 325 | 0.5.2 326 | ----- 327 | 328 | * RELEASE 0.5.2 version 329 | * Update TODO 330 | * Use lowercase everywhere to simplify log parsing 331 | * Print also the frame stuck 332 | * Remove newline character from the error output 333 | * Implement a more atomic way to update bird conf 334 | * Disable pylint warnings for too-many-arguments 335 | * Store anycast-prefixes.conf under /var/lib/ 336 | 337 | 0.5.1 338 | ----- 339 | 340 | * RELEASE 0.5.1 version 341 | * Add a module docstring 342 | * Add a warning when all IP prefixes are removed 343 | * Shorten some lines to make pylint happy 344 | * Add module and classes docstrings 345 | * Add support of populating JSON structure 346 | * Rename variable for shorting the length of a line 347 | * Make possible to disable the sending of JSON blobs 348 | * Disable pylint warning attribute-defined-outside-init 349 | * Remove the JSON blob from the log messages 350 | * Wait a bit before starting again the daemon 351 | * Add docstring for a function 352 | * Remove string interpolation within logging calls 353 | * Add version of the daemon in the JSON blob 354 | * Rename a variable for shorting the length of lines 355 | * Add requests module in requirements.txt 356 | 357 | 0.5.0 358 | ----- 359 | 360 | * RELEASE 0.5.0 version 361 | * Add support for JSON logging to a remote point 362 | 363 | 0.4.5 364 | ----- 365 | 366 | * RELEASE 0.4.5 version 367 | * Catch the error on bird reload due to missing cmd 368 | * Remove unnecessary call to close() 369 | 370 | 0.4.4 371 | ----- 372 | 373 | * RELEASE 0.4.4 version 374 | * Update docstring of ip\_prefixes\_check() 375 | * Remove import of an unused exception 376 | * 
Change the way we handle IP prefixes without check 377 | * Add Ralf Ertzinger in the contributers list 378 | * Add some corrections in README 379 | * Rewrite routing protcol overview 380 | * Add contributers section 381 | 382 | 0.4.3 383 | ----- 384 | 385 | * RELEASE 0.4.3 version 386 | * Fix a regression introduced by 168f9aab 387 | 388 | 0.4.2 389 | ----- 390 | 391 | * RELEASE 0.4.2 version 392 | * Change the way threads pass operations 393 | * Fix a regression introduced by 37aa574b1159 394 | * Simplify the update of bird configuration 395 | * Call anycast-healthchecker from the virtualenvironment 396 | * Remove unnecessary catch of Empty exception 397 | * Remove unnecessary block on threads 398 | * Modify README based on the feedback received 399 | * ServiceCheck: simplify code by removing superfluous lines 400 | * switch to split from shlex 401 | * tiny changes on modifications from @ndemou 402 | * New Introduction + minor spelling, grammar syntax 403 | * More clear and simple introduction 404 | 405 | 0.4.1 406 | ----- 407 | 408 | * RELEASE 0.4.1 version 409 | * mention the use of virtualenvwrapper tool in Testing section 410 | * another set of small updates on README 411 | * fix broken reference in README 412 | * several changes on README 413 | * fix typo in README 414 | * use correct statement to include images 415 | * add drawings to illustrate how anycasted traffic is routed 416 | * tiny fixes in README 417 | * local\_run: get only the list of IPv4 addresses 418 | * tiny fixes on README 419 | * fix typos in README 420 | * add details in README on how someone can test the software 421 | * fix typo in README 422 | * local\_run: report if bird is running 423 | * several clean ups on local\_run 424 | 425 | 0.4.0 426 | ----- 427 | 428 | * RELEASE 0.4.0 version 429 | * ServiceCheck: rearrange the check of IPs in lo interface 430 | * correct the directory path which contains the service checks 431 | * add/remove items from README 432 | * remove OPTIONS variable 
as its settings aren't supported anymore 433 | * change path of the daemon 434 | * change the default directory location for services check configs 435 | * another set of updates on README 436 | * fix for real the typos in README 437 | * fix typos in README 438 | * some tiny updates on README 439 | * add support for bird\_reconfigure\_cmd 440 | * reStructured text is fun:-( 441 | * Revert "align left the parameter names" 442 | * align left the parameter names 443 | * small corrections on README 444 | * update diagram about RIB in bird 445 | * update diagram about RIB in bird 446 | * add proper description/examples/configution 447 | * add example and working bird configuration 448 | * use a more reasonable default value for log\_backups option 449 | * ServiceCheck: remove unnecessary check for wrong value for on\_disabled option 450 | * ServiceCheck: updates on docstrings/comments 451 | * HealthChecker: docstrings/comment updates 452 | * add an item to TODO 453 | * more useful logging on startup 454 | * make pylint happy by shorten length of lines 455 | * remove items from TODO list 456 | * include an example anycast-healthchecker.conf file 457 | * healthchecker: remove unnecessary import of re module 458 | * remove unnecessary empty line 459 | * healthchecker: use get\_ip\_prefixes\_from\_bird to fetch IP prefixes 460 | * ServiceCheck: remove unnecessary code 461 | * local\_run: drop sleep as it isn't needed anymore 462 | * MAJOR: drop event system as it is not needed anymore 463 | * local\_run: use correct file extension 464 | * utils: get\_ip\_prefixes\_from\_bird returns a list 465 | * HealthChecker: docstring updates 466 | * utils: remove unnecessary commented lines 467 | * utils: fix typo in variable name 468 | * utils: docstrings updates 469 | * BUG: always perform sanity checks 470 | * BUG:perform sanity check before we check if daemon is running 471 | * add support for showing configuration 472 | * add items in TODO list 473 | * remove items from TODO which 
are completed 474 | * MAJOR:switch to INI files with configparser and docopt for CLI 475 | * add item in TODO 476 | * use the 'to' option in ip tool to list addresses matching the prefix 477 | 478 | 0.3.0 479 | ----- 480 | 481 | * RELEASE 0.3.0 version 482 | * no need to close file handler as 'with' statement does it 483 | * remove another item from TODO 484 | * one item from TODO is finished 485 | * sys.exit can print as well:-) 486 | * remove unused constant 487 | * detect the case where Ddummy ip prefix is missing from bird conf 488 | * detect prefixes in bird without a configuration 489 | * serviceCheck and healthchecker classes get a config option 490 | * configuration\_check accept a config object rather a dir with conf files 491 | * provide a get\_config() to parse json files and return a dict 492 | * add get\_config\_files() to return the absolute path of json files 493 | * utils:configuration\_check:mention the value for invalid types 494 | * move running function to utils module 495 | * allocate a pid file after all checks has been passed 496 | * refactor configuration\_check function 497 | * remove items from TODO which are completed 498 | * exit if we can't write to bird configuration file 499 | * don't print filename as it is mentioned in exception message 500 | * add support for supplying dummy IP prefix in CLI 501 | * make sure log files exists before we start 502 | * add a touch function in utils module 503 | * introduce utils module with valid\_ip\_prefix function 504 | * remove unused code from local\_run 505 | * one item from TODO is implemented 506 | * add support for version in CLI, print it to log as well 507 | * update TODO 508 | 509 | 0.2.2 510 | ----- 511 | 512 | * RELEASE 0.2.2 version 513 | * proper shebang 514 | 515 | 0.2.1 516 | ----- 517 | 518 | * RELEASE 0.2.1 version 519 | * polish some log messages, no code changes 520 | * reset always counters for states to zero when state is in transition 521 | * more updates on local\_run 522 | * 
updates on TODO 523 | * permission changes 524 | * use birdc rather birdcl to reconfigure bird 525 | * adjust interval in service checks 526 | * updates on local\_run 527 | * updates on local\_run 528 | * local\_run assign IPs and install bird configuration 529 | * some updates on local\_run 530 | * exit when parsing bird configuration results to an empty list of IP prefixes 531 | * a script to run the daemon on a development node 532 | * update AUTHORS 533 | * proper handling of invalid data on pid file 534 | * update TODO 535 | * update TODO 536 | * add Systemd unit file 537 | * more updates on TODO 538 | * update TODO 539 | * add TODO file 540 | * proper RST 541 | * add a proper README file 542 | * replace domain in an example conf 543 | * use gmail mail address 544 | * remove old readme 545 | * better catch of IPs in bird config 546 | * include license to \_\_init\_\_ 547 | * change License to Apache v2.0 548 | 549 | 0.2.0 550 | ----- 551 | 552 | * RELEASE 0.2.0 version 553 | * add some useful info in \_\_init\_\_ 554 | * servicecheck.py: less versbose warning 555 | * servicecheck.py: check if we hare received stop signal before perform IP check 556 | * ServiceCheck.py: check was unnecessarily executed twice 557 | * main.py: call configuration checking 558 | * add requirements file 559 | * add a function to perform a sanity check on the configuration 560 | * servicecheck: Tiny rewrite of comment 561 | * servicecheck: Renamed previou\_state to check\_state 562 | * Fixed argument name collision 563 | * Fixed typo 564 | * Made configurable the rotation policy for log files 565 | * \_update\_bird\_prefix\_conf(): Doc string update 566 | * Report valid values for on\_disabled setting 567 | * Update ChangeLog 568 | 569 | 0.1.1 570 | ----- 571 | 572 | * Tiny change on the comment we place on BIRD config file 573 | * More docstrings and comment updates 574 | * Tiny docstring update 575 | * More pythonic way of loading module 576 | * Spread logic from main.py to 
individual module files 577 | * Update docstrings and add some comments 578 | * Handle misconfiguration better 579 | * More PEP8 friendly indentation 580 | * Tiny refactoring 581 | * print on stdout and on stderr loggers that we are daemonized 582 | * added pylint filtering 583 | * set umask to 022 to avoid write access for others on log files 584 | * Cosmetic change: Report check duration earlier 585 | * Updated ChangeLog 586 | * fixed path README 587 | 588 | 0.1.0 589 | ----- 590 | 591 | * added git push --tags step in README 592 | * Added (finally) some text in README 593 | * Renamed variable is\_updated -> bird\_updated 594 | * another small doc update 595 | * Various small docstrings updates 596 | * Rewrote ServiceCheck class docstring and changed log severity to error 597 | * Fixed indentation issue and rewrote message when IP-PREFIX is configured 598 | * Catch the case where conf says check disabled but value for on\_disabled is wrong 599 | * Changelog updated 600 | 601 | 0.0.9 602 | ----- 603 | 604 | * Better handling of errors when we reload bird 605 | * Fixed typos 606 | * Changelog updated 607 | 608 | 0.0.8 609 | ----- 610 | 611 | * Added program name in the header of bird configuration 612 | * Detect if IP-PREFIX is assigned to loopback interface 613 | * changelog 614 | 615 | 0.0.7 616 | ----- 617 | 618 | * mpla 619 | 620 | 0.0.6 621 | ----- 622 | 623 | * Constant name is configurable, sysconfig file 624 | * Cosmetic change 625 | * removed unused script 626 | 627 | 0.0.4 628 | ----- 629 | 630 | * mpla 631 | 632 | 0.0.3 633 | ----- 634 | 635 | * mpla 636 | 637 | 0.0.2 638 | ----- 639 | 640 | * trying to make it to build 641 | 642 | 0.0.1 643 | ----- 644 | 645 | * Initial 646 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. anycast_healthchecker 2 | .. 
README.rst 3 | 4 | ===================== 5 | anycast-healthchecker 6 | ===================== 7 | 8 | *A healthchecker for Anycasted services.* 9 | 10 | .. contents:: 11 | 12 | 13 | Introduction 14 | ------------ 15 | 16 | **anycast-healthchecker** monitors a service by doing periodic health checks and, based on the result, instructing `Bird`_ daemon to either advertise or withdraw the route to reach it. As a result Bird will only advertise routes for healthy services. Routes for both IPv4 and IPv6 addresses are supported. 17 | 18 | Bird must be configured in a certain way to interface properly with anycast-healthchecker. The configuration is detailed later in this document. 19 | 20 | anycast-healthchecker is a Python program, which runs in foreground and uses threading to run multiple service checks in parallel. 21 | In older versions ( < 0.8.0 ), anycast-healthchecker used the `daemon`_ library to implement a well-behaved Unix daemon process. This changed when 0.8.0 was released and the daemonization of the process is now a task of systemd. 22 | 23 | What is Anycast 24 | --------------- 25 | 26 | Anycast is a network addressing scheme where traffic from a sender has more than one potential receivers, but only one of them receives it. 27 | Routing protocols decide which one of the potential receivers will actually receive traffic, according to the topology of the network. The main attribute contributing to this decision is the cost of the network path between a sender and a receiver. 28 | 29 | Cost is a protocol specific value (usually an integer) that has meaning only within the domain of the protocol itself, and it is used as a metric of distance. 
30 | Routing protocols provide default values for common topologies (`BGP`_ associates the cost of a path with the number of autonomous systems between the sender and the receiver, `OSPF`_ calculates the default cost based on the bandwidth of links), but its main use is to allow administrative control over traffic flow by specifying a cost according to business needs. 31 | 32 | The closest receiver to a sender always receives the traffic; this changes only if something changes on the network, i.e. another receiver with a better path to the sender shows up or the current receiver disappears. If multiple receivers share the same distance from the sender, more than one might receive traffic, depending on how the routing protocol is configured. 33 | 34 | The three pictures below show how traffic is routed between a sender and multiple potential receivers when something changes on network. In this example BGP routing protocol is used: 35 | 36 | .. image:: anycast-receivers-example1.png 37 | :scale: 60% 38 | .. image:: anycast-receivers-example2.png 39 | :scale: 60% 40 | .. image:: anycast-receivers-example3.png 41 | :scale: 60% 42 | 43 | These potential receivers use `BGP`_ or `OSPF`_ and simultaneously announce the same destination IP address from different places on the network. Due to the nature of Anycast, receivers can be located on any location across a global 44 | network infrastructure. 45 | 46 | Anycast doesn't balance traffic, as only one receiver attracts traffic from senders. For instance, if there are two receivers announcing the same destination IP address in different locations, traffic will be distributed between these two receivers unevenly, as senders can be spread across the network in an uneven way. 
47 | 48 | Anycast is being used as a mechanism to switch traffic between and within data-centers for the following main reasons: 49 | 50 | * the switch of traffic occurs without the need to enforce a change on clients 51 | 52 | In case of a service failure in one location, traffic to that location will be switched to another data-center without any manual intervention and, most importantly, without pushing a change to clients, which you don't have always 53 | control on. 54 | 55 | * the switch happens within few milliseconds 56 | 57 | The same technology can be used for balancing traffic using `Equal-Cost Multi-Path`_. 58 | 59 | ECMP routing is a network technology where traffic can be routed over multiple paths. In the context of routing protocols, path is the route a packet has to take in order to be delivered to a destination. Because these multiple paths share the same cost, traffic is balanced across them. 60 | 61 | This grants the possibility to perform traffic load-balancing across multiple servers. Routers distribute traffic in a deterministic fashion, usually by selecting the next hop and looking at the following four properties of IP packets: 62 | 63 | * source IP 64 | * source PORT 65 | * destination IP 66 | * destination PORT 67 | 68 | Each unique combination of these four properties is called network flow. For each different network flow a different destination is selected so that traffic is evenly balanced across all servers. These nodes run an Internet Routing software in the same way as in the Anycast case, but with the major difference that all servers receive traffic at the 69 | same time. 70 | 71 | The main characteristic of this type of load-balancing is that it is stateless. Router balances traffic to a destination IP address based on the quadruple network flow without the need to understand and inspect protocols above Layer3. 72 | As a result, it is very cheap in terms of resources and very fast at the same time. 
This is commonly advertised as traffic balancing at "wire-speed". 73 | 74 | **anycast-healthchecker** can be utilized in Anycast and ECMP environments. 75 | 76 | How anycast-healthchecker works 77 | ------------------------------- 78 | 79 | The current release of anycast-healthchecker supports only the Bird daemon, which has to be configured in a specific way. Therefore, it is useful to explain very briefly how Bird handles advertisements for routes. 80 | 81 | Bird maintains a routing information base (`RIB`_) and various protocols import/export routes to/from it. The diagram below illustrates how Bird advertises IP routes, assigned on the loopback interface, to the rest of the network using BGP protocol. Bird can also import routes learned via BGP/OSPF protocols, but this part of the routing process is irrelevant to the functionality of anycast-healthchecker. 82 | 83 | 84 | .. image:: bird_daemon_rib_explained.png 85 | :scale: 60% 86 | 87 | A route is always associated with a service that runs locally on the box. The Anycasted service is a daemon (HAProxy, Nginx, Bind etc) that processes incoming traffic and listens to an IP (Anycast Service Address) for which a route exists in the RIB and is advertised by Bird. 88 | 89 | As shown in the above picture, a route is advertised only when: 90 | 91 | #. The IP is assigned to the loopback interface. 92 | #. `direct`_ protocol from Bird imports a route for that IP in the RIB. 93 | #. BGP/OSPF protocols export that route from the RIB to a network peer. 94 | 95 | The route associated with the Anycasted service must be either advertised or withdrawn based on the health of the service, otherwise traffic will always be routed to the local node regardless of the status of the service. 96 | 97 | Bird provides `filtering`_ capabilities with the help of a simple programming language. A filter can be used to either accept or reject routes before they are exported from the RIB to the network. 
A list of IP prefixes (<IP>/<prefix length>) is stored in a text file.
117 | 118 | Bird configuration 119 | ################## 120 | 121 | The logic described in `How anycast-healthchecker works`_ can be accomplished by configuring: 122 | 123 | #. an ``include`` statement to source other configuration files in 124 | ``bird.conf`` 125 | #. a function, ``match_route``, as an export filter for the routing 126 | protocol (BGP or OSPF) 127 | #. a list of IP prefixes for routes which allowed to be exported by Bird 128 | 129 | anycast-healthchecker **does not** install any of the aforementioned files. 130 | 131 | bird.conf 132 | ********* 133 | 134 | The most important parts are the lines ``include "/etc/bird.d/*.conf";`` and ``export where match_route();``. The former statement causes inclusion of other configuration files while the latter forces all routes to pass from the ``match_route`` function before they are exported. BGP protocol is used in the below example but OSPF protocol can be used as well:: 135 | 136 | include "/etc/bird.d/*.conf"; 137 | protocol device { 138 | scan time 10; 139 | } 140 | protocol direct direct1 { 141 | interface "lo"; 142 | export none; 143 | import all; 144 | } 145 | template bgp bgp_peers { 146 | import none; 147 | export where match_route(); 148 | local as 64815; 149 | } 150 | protocol bgp BGP1 from bgp_peers { 151 | disabled no; 152 | neighbor 10.248.7.254 as 64814; 153 | } 154 | 155 | match-route.conf 156 | **************** 157 | 158 | ``match-route.conf`` file configures the ``match_route`` function, which performs the allow and deny of IP prefixes by looking at the IP prefix of the route in a list and exports it if it matches entry:: 159 | 160 | function match_route() 161 | { 162 | return net ~ ACAST_PS_ADVERTISE; 163 | } 164 | 165 | This is the equivalent function for IPv6:: 166 | 167 | function match_route6() 168 | { 169 | return net ~ ACAST6_PS_ADVERTISE; 170 | } 171 | 172 | anycast-prefixes.conf 173 | ********************* 174 | 175 | ``anycast-prefixes.conf`` file defines a list of IP prefixes 
which is stored in a variable named ``ACAST_PS_ADVERTISE``. The name of the variable can be anything meaningful but ``bird_variable`` setting **must** be changed accordingly. 176 | 177 | :: 178 | 179 | define ACAST_PS_ADVERTISE = 180 | [ 181 | 10.189.200.255/32 182 | ]; 183 | 184 | anycast-healthchecker removes IP prefixes from the list for which a service check is not configured. But, the IP prefix set in ``dummy_ip_prefix`` does not need a service check configuration. 185 | 186 | This the equivalent list for IPv6 prefixes:: 187 | 188 | define ACAST6_PS_ADVERTISE = 189 | [ 190 | 2001:db8::1/128 191 | ]; 192 | 193 | anycast-healthchecker creates ``anycast-prefixes.conf`` file for both IP versions upon startup if those file don't exist. After the launch **no other process(es) should** modify those files. 194 | 195 | Use daemon settings ``bird_conf`` and ``bird6_conf`` to control the location of the files. 196 | 197 | With the default settings those files are located under ``/var/lib/anycast-healthchecker`` and ``/var/lib/anycast-healthchecker/6``. Administrators must create those two directories with permissions ``755`` and user/group ownership to the account under which anycast-healthchecker runs. 198 | 199 | Bird daemon loads configuration files by using the ``include`` statement in the main Bird configuration (`bird.conf`_). By default such ``include`` statement points to a directory under ``/etc/bird.d``, while ``anycast-prefixes.conf`` files are located under ``/var/lib/anycast-healthchecker`` directories. Therefore, 200 | a link for each file must be created under ``/etc/bird.d`` directory. Administrators must also create those two links. 
Here is an example from a production server: 201 | 202 | :: 203 | 204 | % ls -ls /etc/bird.d/anycast-prefixes.conf 205 | 4 lrwxrwxrwx 1 root root 105 Dec 2 16:08 /etc/bird.d/anycast-prefixes.conf -> 206 | /var/lib/anycast-healthchecker/anycast-prefixes.conf 207 | 208 | % ls -ls /etc/bird.d/6/anycast-prefixes.conf 209 | 4 lrwxrwxrwx 1 root root 107 Jan 10 10:33 /etc/bird.d/6/anycast-prefixes.conf 210 | -> /var/lib/anycast-healthchecker/6/anycast-prefixes.conf 211 | 212 | Configuring anycast-healthchecker 213 | ################################# 214 | 215 | anycast-healthchecker uses the popular `INI`_ format for its configuration files. This is an example configuration file(/etc/anycast-healthchecker.conf) for configuring anycast-healthchecker:: 216 | 217 | [DEFAULT] 218 | interface = lo 219 | 220 | [daemon] 221 | pidfile = /var/run/anycast-healthchecker/anycast-healthchecker.pid 222 | ipv4 = true 223 | ipv6 = false 224 | bird_conf = /var/lib/anycast-healthchecker/anycast-prefixes.conf 225 | bird6_conf = /var/lib/anycast-healthchecker/6/anycast-prefixes.conf 226 | bird_variable = ACAST_PS_ADVERTISE 227 | bird6_variable = ACAST6_PS_ADVERTISE 228 | bird_reconfigure_cmd = sudo /usr/sbin/birdc configure 229 | bird6_reconfigure_cmd = sudo /usr/sbin/birdc6 configure 230 | dummy_ip_prefix = 10.189.200.255/32 231 | dummy_ip6_prefix = 2001:db8::1/128 232 | bird_keep_changes = false 233 | bird6_keep_changes = false 234 | bird_changes_counter = 128 235 | bird6_changes_counter = 128 236 | purge_ip_prefixes = false 237 | loglevel = debug 238 | log_maxbytes = 104857600 239 | log_backups = 8 240 | log_server_port = 514 241 | json_stdout = false 242 | json_log_file = false 243 | json_log_server = false 244 | prometheus_exporter = false 245 | prometheus_collector_textfile_dir = /var/cache/textfile_collector/ 246 | prometheus_exporter_interval = 10 247 | 248 | The above settings are used as defaults when anycast-healthchecker is launched without a configuration file. 
anycast-healthchecker **does not** need to run as root as long as it has sufficient privileges to modify the Bird configuration set in ``bird_conf`` or ``bird6_conf``, and trigger a reconfiguration of Bird by running the command configured in ``bird_reconfigure_cmd`` or ``bird6_reconfigure_cmd``. In the above example ``sudo`` is used for that purpose (``sudoers`` file has been modified for that purpose). 249 | 250 | DEFAULT section 251 | *************** 252 | 253 | Below are the default settings for all service checks, see `Configuring checks for services`_ for an explanation of the parameters. Settings in this section can be overwritten in other sections. 254 | 255 | :interface: lo 256 | :check_interval: 10 257 | :check_timeout: 2 258 | :check_rise: 2 259 | :check_fail: 2 260 | :check_disabled: true 261 | :on_disabled: withdraw 262 | :ip_check_disabled: false 263 | :custom_bird_reconfigure_cmd_timeout: 2 264 | 265 | Daemon section 266 | ************** 267 | 268 | Settings for anycast-healthchecker itself 269 | 270 | * **pidfile** Defaults to **/var/run/anycast-healthchecker/anycast-healthchecker.pid** 271 | 272 | File to store the process id. The parent directory must be created prior the initial launch. 273 | 274 | * **ipv4** Defaults to **true** 275 | 276 | ``true`` enables IPv4 support and ``false`` disables it. 277 | NOTE: anycast-healthchecker **will not** start if IPv4 support is disabled while there is an service check configured for IPv4 prefix. 278 | 279 | * **ipv6** Defaults to **false** 280 | 281 | ``true`` enables IPv6 support and ``false`` disables it 282 | NOTE: anycast-healthchecker **will not** start if IPv6 support is disabled while there is an service check configured for IPv6 prefix. 283 | 284 | * **bird_conf** Defaults to **/var/lib/anycast-healthchecker/anycast-prefixes.conf** 285 | 286 | File with the list of IPv4 prefixes allowed to be exported. 
If this file is a symbolic link then the destination and the link itself must be on the same mounted filesystem. 287 | 288 | * **bird6_conf** Defaults to **/var/lib/anycast-healthchecker/6/anycast-prefixes.conf** 289 | 290 | File with the list of IPv6 prefixes allowed to be exported. If this file is a symbolic link then the destination and the link itself must be on the same mounted filesystem. 291 | 292 | * **bird_variable** Defaults to **ACAST_PS_ADVERTISE** 293 | 294 | The name of the list defined in ``bird_conf`` 295 | 296 | * **bird6_variable** Defaults to **ACAST6_PS_ADVERTISE** 297 | 298 | The name of the list defined in ``bird6_conf`` 299 | 300 | * **bird_reconfigure_cmd** Defaults to **sudo /usr/sbin/birdc configure** 301 | 302 | Command to trigger a reconfiguration of IPv4 Bird daemon 303 | 304 | * **bird6_reconfigure_cmd** Defaults to **sudo /usr/sbin/birdc6 configure** 305 | 306 | Command to trigger a reconfiguration of IPv6 Bird daemon 307 | 308 | * **dummy_ip_prefix** Defaults to **10.189.200.255/32** 309 | 310 | An IP prefix in the form / which will be always available in the list defined by ``bird_variable`` to avoid having an empty list. The ``dummy_ip_prefix`` **must not** be used by any service or assigned to the interface set with ``interface`` or configured anywhere on the network as anycast-healthchecker **does not** perform any checks for it. 311 | 312 | * **dummy_ip6_prefix** Defaults to **2001:db8::1/128** 313 | 314 | An IPv6 prefix in the form / which will be always available in the list defined by ``bird6_variable`` to avoid having an empty list. The ``dummy_ip6_prefix`` **must not** be used by any service or assigned to the interface set with ``interface`` or configured anywhere on the network as anycast-healthchecker **does not** perform any checks for it. 315 | 316 | * **bird_keep_changes** Defaults to **false** 317 | 318 | Keep a history of changes for ``bird_conf`` file by copying it to a directory. 
During the startup of anycast-healthchecker a directory with the name ``history`` is created under the directory where ``bird_conf`` file resides. The daemon has to have sufficient privileges to create that directory. 319 | 320 | * **bird6_keep_changes** Defaults to **false** 321 | 322 | Keep a history of changes for ``bird6_conf`` file by copying it to a directory. During the startup of anycast-healthchecker a directory with the name ``history`` is created under the directory where ``bird6_conf`` file resides. The daemon has to have sufficient privileges to create that directory. 323 | WARNING: When keeping a history of changes is enabled for both IP versions then configuration files set in ``bird_conf`` and ``bird6_conf`` settings **must** be stored on two different directories. 324 | 325 | * **bird_changes_counter** Defaults to **128** 326 | 327 | How many ``bird_conf`` files to keep in the ``history`` directory. 328 | 329 | * **bird6_changes_counter** Defaults to **128** 330 | 331 | How many ``bird6_conf`` files to keep in the ``history`` directory. 332 | 333 | * **purge_ip_prefixes** Defaults to **false** 334 | 335 | During start-up purge IP-Prefixes from configuration files set in ``bird_conf`` and ``bird6_conf``, which don't have a service check associated with them. 336 | 337 | NOTE: Those IP-Prefixes are always removed from the configuration files set in ``bird_conf`` and in ``bird6_conf`` settings when anycast-healthchecker updates those files. ``purge_ip_prefixes`` is considered only during start-up and was introduced in order to be compatible with the behavior of previous releases, which didn't remove those IP-Prefixes on start-up. 338 | 339 | * **loglevel** Defaults to **debug** 340 | 341 | Log level to use, possible values are: debug, info, warning, error, critical 342 | 343 | * **log_file** Unset by default 344 | 345 | File to log messages to. The parent directory must be created prior the initial 346 | launch. 
If unset, log messages are written to stdout. 347 | 348 | * **log_maxbytes** Defaults to **104857600** (bytes, equals 100MiB) 349 | 350 | Maximum size in bytes for log files. It is only used if **log_file** is set to 351 | a file. 352 | 353 | * **log_backups** Defaults to **8** 354 | 355 | Number of old log files to maintain. It is only used if **log_file** is set to 356 | a file. 357 | 358 | * **stderr_file** Unset by default 359 | 360 | File to redirect standard error to. The parent directory must be created prior the initial launch. 361 | If unset, stderr is not redirected. 362 | 363 | * **log_server** Unset by default 364 | 365 | Either the IP address or the hostname of an UDP syslog server to forward logging messages. 366 | 367 | * **log_server_port** Defaults to **514** 368 | 369 | The port on the remote syslog server to forward logging messages over UDP. 370 | 371 | * **json_stdout** Defaults to **false** 372 | 373 | ``true`` enables structured logging for STDOUT. 374 | 375 | * **json_log_file** Defaults to **false** 376 | 377 | ``true`` enables structured logging when **log_file** is set to a file. 378 | 379 | * **json_log_server** Defaults to **false** 380 | 381 | ``true`` enables structured logging when **log_server** is set to a remote UDP 382 | syslog server. 383 | 384 | * **prometheus_exporter** Defaults to **false** 385 | 386 | ``true`` enables prometheus exporter. 387 | 388 | * **prometheus_collector_textfile_dir** Defaults to **/var/cache/textfile_collector/** 389 | 390 | The directory to store the exported statistics. 391 | 392 | * **prometheus_exporter_interval** Defaults to **10** seconds 393 | 394 | How often to export Prometheus metrics. 395 | 396 | * **splay_startup** Unset by default 397 | 398 | The maximum time to delay the startup of service checks. You can use either integer or floating-point number as a value. 
In order to avoid launching all checks at the same time, after anycast-healthchecker is started, we can delay the 1st check in a random way. This can be useful in cases where we have a lot of service checks and launching all of them at the same time can overload the system. We randomize the delay of the 1st check for each service and **splay_startup** sets the maximum time we can delay that 1st check.
not assigned, 1 = assigned 420 | # TYPE anycast_healthchecker_service_check_ip_assignment gauge 421 | anycast_healthchecker_service_check_ip_assignment{ip_prefix="10.52.12.1/32",service_name="foo.bar.com"} 1.0 422 | anycast_healthchecker_service_check_ip_assignment{ip_prefix="fd12:aba6:57db:ffff::1/128",service_name="foo1IPv6.bar.com"} 0.0 423 | anycast_healthchecker_service_check_ip_assignment{ip_prefix="10.52.12.2/32",service_name="foo1.bar.com"} 1.0 424 | # HELP anycast_healthchecker_service_check_timeout_total The number of times a service check timed out 425 | # TYPE anycast_healthchecker_service_check_timeout_total counter 426 | anycast_healthchecker_service_check_timeout_total{ip_prefix="10.52.12.2/32",service_name="foo1.bar.com"} 3.0 427 | # HELP anycast_healthchecker_service_check_timeout_created The number of times a service check timed out 428 | # TYPE anycast_healthchecker_service_check_timeout_created gauge 429 | anycast_healthchecker_service_check_timeout_created{ip_prefix="10.52.12.2/32",service_name="foo1.bar.com"} 1.698693786243282e+09 430 | # HELP anycast_healthchecker_uptime Uptime of the process in seconds since the epoch 431 | # TYPE anycast_healthchecker_uptime gauge 432 | anycast_healthchecker_uptime 1.6986938162371802e+09 433 | # HELP anycast_healthchecker_state The current state of the process: 0 = down, 1 = up 434 | # TYPE anycast_healthchecker_state gauge 435 | anycast_healthchecker_state 1.0 436 | # HELP anycast_healthchecker_version_info Version of the software 437 | # TYPE anycast_healthchecker_version_info gauge 438 | anycast_healthchecker_version_info{version="0.9.1"} 1.0 439 | # HELP anycast_healthchecker_service The configured service checks 440 | # TYPE anycast_healthchecker_service gauge 441 | anycast_healthchecker_service{ip_prefix="10.52.12.1/32",service_name="foo.bar.com"} 1.0 442 | anycast_healthchecker_service{ip_prefix="fd12:aba6:57db:ffff::1/128",service_name="foo1IPv6.bar.com"} 1.0 443 | 
anycast_healthchecker_service{ip_prefix="10.52.12.2/32",service_name="foo1.bar.com"} 1.0 444 | 445 | 446 | How to configure logging 447 | ************************ 448 | 449 | By default anycast-healtchecker logs messages to STDOUT, while messages related to unhandled exceptions or crashes go to STDERR. But it is possible to log such messages to a file and/or to a remote UDP syslog server. 450 | 451 | anycast-healthchecker doesn't log to STDOUT/STDERR when either log file or a remote UDP syslog server is configured. 452 | 453 | You can configure it to use a log file and a remote UDP syslog server at the same time, so logging messages can be stored locally and remotely. This is convenient when remote log server is in trouble and loses log messages. 454 | 455 | The best logging configuration in terms of resiliency is to enable logging only to a remote UDP syslog server. Sending data over UDP protocol is done in no-blocking mode and therefore anycast-healthchecker isn't blocked in any way 456 | when it logs messages. Furthermore, when it logs to a log file and there isn't any more space available on the filesystem, the software will crash. You can easily avoid this failure by using UDP syslog server. 457 | 458 | Last but not least, anycast-healthchecker handles the rotation of old log files, so you don't need to configure any other tools(logrotate) for that. 459 | 460 | JSON logging 461 | ************ 462 | 463 | You can configure anycast-healthchecker to send structured logging messages. This is quite important in environments with a lot of servers and Anycasted services. 464 | 465 | You can enable structured logging for STDOUT, log file and remote UDP syslog server. Currently, it isn't possible to add/remove keys from the structured logging data. The followings are the keys that are present in the structure: 466 | 467 | 468 | * asctime: Human-readable time when the log message was created, example value 2017-07-23 09:43:28,995. 
469 | 470 | * levelname: Text logging level for the message, example value WARNING. 471 | 472 | * process: Process ID, example value 23579 473 | 474 | * message: The logged message. 475 | 476 | * prefix_length: The prefix length of the Anycast Address associated with the logged message, example value 128. 477 | This key isn't present for messages, which were logged by the parent thread. 478 | 479 | * status: The status of the service when message was logged, possible values are down, up and unknown. 480 | This key isn't present for messages, which were logged by the parent thread. 481 | 482 | * ip_address: The Anycast IP address of the monitored service for which the message was logged, example value fd12:aba6:57db:ffff::2 483 | This key isn't present for messages, which were logged by the parent thread. 484 | 485 | * ip_check_disabled: Either ``true`` when the assignment check of ``ip_prefix`` to the interface is disabled, otherwise ``false``. 486 | This key isn't present for messages, which were logged by the parent thread. 487 | 488 | * version: The running version of anycast-healthchecker, example value 0.7.4. 489 | 490 | * program: The process name, defaults to anycast-healthchecker. 491 | 492 | * service_name: The name of the service defined in configuration for which the message was logged, example value foo1IPv6.bar.com. Logging messages from the parent thread will have value "MainThread". 493 | 494 | Journalctl logging 495 | ****************** 496 | 497 | If you run the daemon via systemd, you might prefer using journalctl for logging. 498 | To enable this, make sure ``log_file`` and ``log_server`` and ``json_stdout`` options are **NOT set** , and ``log_format_journalctl`` is set to ``true``. 499 | 500 | Configuring checks for services 501 | ############################### 502 | 503 | The configuration for a single service check is defined in one section. 
504 | Here are a few examples:: 505 | 506 | [foo.bar.com] 507 | check_cmd = /usr/bin/curl --fail --silent http://10.52.12.1/ 508 | check_interval = 10 509 | check_timeout = 2 510 | check_fail = 2 511 | check_rise = 2 512 | check_disabled = false 513 | on_disabled = withdraw 514 | ip_prefix = 10.52.12.1/32 515 | 516 | [foo6.bar.com] 517 | check_cmd = /usr/bin/curl --fail 'http://[fd12:aba6:57db:ffff::1]:8888' 518 | check_timeout = 5 519 | check_rise = 2 520 | check_fail = 2 521 | check_disabled = false 522 | on_disabled = withdraw 523 | ip_prefix = fd12:aba6:57db:ffff::1/128 524 | ip_check_disabled = false 525 | 526 | The name of the section becomes the name of the service check and appears in the log files for easier searching of error/warning messages. 527 | 528 | * **check_cmd** Unset by default 529 | 530 | The command to run to determine the status of the service based **on the return code**. Complex health checking should be wrapped in a script. When the check command fails, its stdout and stderr appear in the log file. 531 | 532 | * **check_interval** Defaults to **10** (seconds) 533 | 534 | How often to run the check 535 | 536 | * **check_timeout** Defaults to **2** (seconds) 537 | 538 | Maximum time in seconds for the check command to complete. anycast-healthchecker will try to kill the check if it doesn't return after *check_timeout* seconds. If *check_cmd* runs under another user account (root) via sudo then it won't be killed. anycast-healthchecker could run as root to overcome this problem, but it is highly recommended to run it as a normal user.
539 | 540 | * **check_fail** Defaults to **2** 541 | 542 | A service is considered DOWN after these many consecutive unsuccessful health checks 543 | 544 | * **check_rise** Defaults to **2** 545 | 546 | A service is considered HEALTHY after these many consecutive successful health checks 547 | 548 | * **check_disabled** Defaults to **true** 549 | 550 | ``true`` disables the check, ``false`` enables it 551 | 552 | * **on_disabled** Defaults to **withdraw** 553 | 554 | What to do when check is disabled, either ``withdraw`` or ``advertise`` 555 | 556 | * **ip_prefix** Unset by default 557 | 558 | IP prefix associated with the service. It **must be** assigned to the interface set in ``interface`` parameter unless ``ip_check_disabled`` is set to ``true``. Prefix length is optional and defaults to 32 for IPv4 addresses and to 128 for IPv6 addresses. 559 | 560 | * **ip_check_disabled** Defaults to **false** 561 | 562 | ``true`` disables the assignment check of ``ip_prefix`` to the interface set in ``interface``, ``false`` enables it. 563 | 564 | If the ``check_cmd`` checks the availability of the service by sending a request to the Anycasted IP address then this request may be served by another node that advertises the same IP address on the network. This usually happens 565 | when the Anycasted IP address is not assigned to loopback or any other interface on the local node. 566 | 567 | Therefore, it should be only enabled in environments where the network or the network configuration of the local node prevents the request from ``check_cmd`` to be forwarded to another node. 568 | 569 | * **interface** Defaults to **lo** 570 | 571 | The name of the interface that ``ip_prefix`` is assigned to 572 | 573 | * **custom_bird_reconfigure_cmd** Unset by default 574 | 575 | A custom command to trigger a reconfiguration of Bird daemon. This overwrites the value of **bird_reconfigure_cmd** and **bird6_reconfigure_cmd** settings. 
This setting allows the use of a custom command to trigger a reconfiguration of Bird daemon after an IP prefix is either added to or removed from Bird configuration. If the return code is not zero then an error is logged together with STDERR of the command, if there is any. anycast-healthchecker passes one argument to the command, which is *up* when the IP prefix is added or *down* when it is removed, so the command can perform different things depending on the status of the service. 576 | 577 | * **custom_bird_reconfigure_cmd_timeout** Defaults to **2** (seconds) 578 | 579 | Maximum time in seconds for the **custom_bird_reconfigure_cmd** to complete. anycast-healthchecker will try to kill the command if it doesn't return after **custom_bird_reconfigure_cmd_timeout** seconds. If **custom_bird_reconfigure_cmd** runs under another user account (root) via sudo then it won't be killed. anycast-healthchecker could run as root to overcome this problem, but it is highly recommended to run it as a normal user. 580 | 581 | 582 | Multiple sections may be combined in one file, or you can provide one file per section. Files must be stored under one directory and their names should use ``.conf`` as suffix (foo.bar.com.conf). 583 | 584 | Starting anycast-healthchecker 585 | ############################## 586 | 587 | CLI usage:: 588 | 589 | anycast-healthchecker --help 590 | A simple healthchecker for Anycasted services.
591 | 592 | Usage: 593 | anycast-healthchecker [ -f <file> -c -p -P ] [ -d <dir> | -F <file> ] 594 | 595 | Options: 596 | -f, --file=<file> read settings from <file> 597 | [default: /etc/anycast-healthchecker.conf] 598 | -d, --dir=<dir> read settings for service checks from files 599 | under <dir> directory 600 | [default: /etc/anycast-healthchecker.d] 601 | -F, --service-file=<file> read <file> for settings of a single service 602 | check 603 | -c, --check perform a sanity check on configuration 604 | -p, --print show default settings for anycast-healthchecker 605 | and service checks 606 | -P, --print-conf show running configuration with default settings 607 | applied 608 | -v, --version show version 609 | -h, --help show this screen 610 | 611 | You can launch it by supplying a configuration file and a directory with configuration files for service checks:: 612 | 613 | anycast-healthchecker -f ./anycast-healthchecker.conf -d ./anycast-healthchecker.d 614 | 615 | At the root of the project there is System V init and a Systemd unit file for proper integration with OS startup tools. 616 | 617 | Sending a ``SIGURG`` signal to a running anycast-healthchecker process will trigger an immediate, additional (not changing the regular interval) execution of all active checks. For services with ``check_rise`` and/or ``check_fail`` set to ``1``, this can be used to make external events advertise and/or withdraw their prefixes faster. 618 | 619 | Systemd and SysVinit integration 620 | ################################ 621 | 622 | Under contrib/systemd and contrib/SysVinit directories there are the necessary startup files that can be used to start anycast-healthchecker on boot. 623 | 624 | **IMPORTANT:** Version 0.8.0 dropped support for daemonization and therefore you can't use the System V init script stored under contrib/SysVinit directory with newer versions. If you want to use version 0.8.0 and higher on Operating Systems that don't support Systemd then you have to use a tool like supervisord.
625 | 626 | Nagios check 627 | ############ 628 | 629 | Under contrib/nagios directory there is a nagios plugin to check if the program is up and if all threads are running. 630 | 631 | Installation 632 | ------------ 633 | 634 | Use pip:: 635 | 636 | pip install anycast-healthchecker 637 | 638 | From Source:: 639 | 640 | sudo python -m pip install . 641 | 642 | Build a python wheel for manual installation:: 643 | 644 | python -m pip install build; python -m build --wheel 645 | 646 | 647 | Release 648 | ------- 649 | 650 | #. Bump version in anycast_healthchecker/__init__.py 651 | 652 | #. Commit above change with:: 653 | 654 | git commit -av -m'RELEASE 0.1.3 version' 655 | 656 | #. Create a signed tag, pbr will use this for the version number:: 657 | 658 | git tag -s 0.1.3 -m 'bump release' 659 | 660 | #. Create the package wheel (the whl file will be placed in the **dist** directory):: 661 | 662 | python -m pip install build; python -m build --wheel 663 | 664 | #. pbr will update ChangeLog file and we want to squash them into the previous commit thus we run:: 665 | 666 | git commit -av --amend 667 | 668 | #. Move current tag to the last commit:: 669 | 670 | git tag -fs 0.1.3 -m 'bump release' 671 | 672 | #. Push changes:: 673 | 674 | git push; git push --tags 675 | 676 | 677 | Development 678 | ----------- 679 | I would love to hear what other people think about **anycast_healthchecker** and provide feedback. Please post your comments, bug reports and wishes on my `issues page <https://github.com/unixsurfer/anycast_healthchecker/issues>`_. 680 | 681 | Testing 682 | ####### 683 | 684 | At the root of the project there is a ``local_run.sh`` script which you can use 685 | for testing purposes. It does the following: 686 | 687 | #. Creates the necessary directory structure under $PWD/var to store 688 | configuration and log files 689 | 690 | #. Generates configuration for the daemon and for 2 service checks 691 | 692 | #. Generates bird configuration(anycast-prefixes.conf) 693 | 694 | #.
Installs anycast-healthchecker with ``python3 -m pip install .`` 695 | 696 | #. Assigns 4 IPv4 addresses and 2 IPv6 addresses to loopback interface 697 | 698 | #. Checks if bird daemon runs but it does not try to start if it is down 699 | 700 | #. Starts the daemon as normal user and not as root 701 | 702 | Requirements for running ``local_run.sh`` 703 | 704 | #. python3 installation 705 | 706 | #. A working python virtual environment, use the excellent tool virtualenvwrapper 707 | 708 | #. Bird installed and configured as it is mentioned in `Bird configuration`_ 709 | 710 | #. sudo access to run ``birdc configure`` and ``birdc6 configure`` 711 | 712 | #. sudo access to assign IPs on the loopback interface using ``ip`` tool 713 | 714 | Contributors 715 | ############ 716 | 717 | The following people have contributed to project with feedback, commits and code reviews 718 | 719 | - Károly Nagy (@charlesnagy) 720 | - Nick Demou (@ndemou) 721 | - Ralf Ertzinger (@alufu) 722 | - Carlo Rengo (@sevencastles) 723 | 724 | Licensing 725 | --------- 726 | 727 | Apache 2.0 728 | 729 | Acknowledgement 730 | --------------- 731 | This program was originally developed for Booking.com. With approval from Booking.com, the code was generalised and published as Open Source on github, for which the author would like to express his gratitude. 732 | 733 | Contacts 734 | -------- 735 | 736 | **Project website**: https://github.com/unixsurfer/anycast_healthchecker 737 | 738 | **Author**: Pavlos Parissis 739 | 740 | .. _Bird: http://bird.network.cz/ 741 | .. _BGP: https://en.wikipedia.org/wiki/Border_Gateway_Protocol 742 | .. _OSPF: https://en.wikipedia.org/wiki/Open_Shortest_Path_First 743 | .. _Equal-Cost Multi-Path: https://en.wikipedia.org/wiki/Equal-cost_multi-path_routing 744 | .. _direct: http://bird.network.cz/?get_doc&f=bird-6.html#ss6.4 745 | .. _filtering: http://bird.network.cz/?get_doc&f=bird-5.html 746 | .. _RIB: https://en.wikipedia.org/wiki/Routing_table 747 | .. 
_INI: https://en.wikipedia.org/wiki/INI_file 748 | .. _daemon: https://pypi.python.org/pypi/python-daemon/ 749 | .. _requests: https://github.com/kennethreitz/requests 750 | -------------------------------------------------------------------------------- /anycast_healthchecker/utils.py: -------------------------------------------------------------------------------- 1 | # vim:fenc=utf-8 2 | # 3 | # pylint: disable=too-many-arguments 4 | # pylint: disable=too-many-locals 5 | # pylint: disable=too-many-branches 6 | # pylint: disable=too-few-public-methods 7 | # pylint: disable=too-many-lines 8 | """Provide functions and classes that are used within anycast_healthchecker.""" 9 | import configparser 10 | import datetime 11 | import glob 12 | import ipaddress 13 | import logging 14 | import logging.handlers 15 | import os 16 | import re 17 | import shlex 18 | import shutil 19 | import signal 20 | import subprocess 21 | import sys 22 | import time 23 | from collections import Counter 24 | from threading import Thread 25 | 26 | from prometheus_client import CollectorRegistry, Gauge, Info, write_to_textfile 27 | from pythonjsonlogger import jsonlogger 28 | 29 | from anycast_healthchecker import ( 30 | DEFAULT_OPTIONS, 31 | METRIC_PREFIX, 32 | PROGRAM_NAME, 33 | __version__, 34 | ) 35 | 36 | SERVICE_OPTIONS_TYPE = { 37 | 'check_cmd': 'get', 38 | 'check_interval': 'getfloat', 39 | 'check_timeout': 'getfloat', 40 | 'check_rise': 'getint', 41 | 'check_fail': 'getint', 42 | 'check_disabled': 'getboolean', 43 | 'on_disabled': 'get', 44 | 'ip_prefix': 'get', 45 | 'interface': 'get', 46 | 'ip_check_disabled': 'getboolean', 47 | 'custom_bird_reconfigure_cmd_timeout': 'getfloat', 48 | 'custom_bird_reconfigure_cmd': 'get', 49 | } 50 | SERVICE_OPTIONAL_OPTIONS = { 51 | 'custom_bird_reconfigure_cmd_timeout', 52 | 'custom_bird_reconfigure_cmd', 53 | } 54 | 55 | DAEMON_OPTIONS_TYPE = { 56 | 'pidfile': 'get', 57 | 'bird_conf': 'get', 58 | 'bird6_conf': 'get', 59 | 'bird_variable': 'get', 
60 | 'bird6_variable': 'get', 61 | 'log_maxbytes': 'getint', 62 | 'log_backups': 'getint', 63 | 'log_file': 'get', 64 | 'stderr_file': 'get', 65 | 'stderr_log_server': 'getboolean', 66 | 'log_server': 'get', 67 | 'log_server_port': 'getint', 68 | 'json_stdout': 'getboolean', 69 | 'json_log_server': 'getboolean', 70 | 'json_log_file': 'getboolean', 71 | 'purge_ip_prefixes': 'getboolean', 72 | 'bird_keep_changes': 'getboolean', 73 | 'bird6_keep_changes': 'getboolean', 74 | 'bird_changes_counter': 'getint', 75 | 'bird6_changes_counter': 'getint', 76 | 'bird_reconfigure_cmd': 'get', 77 | 'bird6_reconfigure_cmd': 'get', 78 | 'splay_startup': 'getfloat', 79 | 'prometheus_exporter': 'getboolean', 80 | 'prometheus_collector_textfile_dir': 'get', 81 | 'prometheus_exporter_interval': 'getint', 82 | } 83 | DAEMON_OPTIONAL_OPTIONS = [ 84 | 'stderr_log_server', 85 | 'stderr_file', 86 | 'log_server', 87 | 'log_file', 88 | 'splay_startup', 89 | ] 90 | 91 | 92 | def valid_ip_prefix(ip_prefix): 93 | """Perform a sanity check on ip_prefix. 94 | 95 | Arguments: 96 | ip_prefix (str): The IP-Prefix to validate 97 | 98 | Returns: 99 | True if ip_prefix is a valid IPv4 address with prefix length 32 or a 100 | valid IPv6 address with prefix length 128, otherwise False 101 | 102 | """ 103 | try: 104 | ip_prefix = ipaddress.ip_network(ip_prefix) 105 | except ValueError: 106 | return False 107 | else: 108 | if ip_prefix.version == 4 and ip_prefix.max_prefixlen != 32: 109 | return False 110 | if ip_prefix.version == 6 and ip_prefix.max_prefixlen != 128: 111 | return False 112 | return True 113 | 114 | 115 | def touch(file_path): 116 | """Touch a file in the same way as touch tool does. 117 | 118 | NOTE: 119 | If file_path doesn't exist it will be created. 
120 | 121 | Arguments: 122 | file_path (str): The absolute file path 123 | 124 | Returns: 125 | None 126 | 127 | Raises: 128 | OSError exception 129 | 130 | """ 131 | with open(file_path, 'a'): 132 | os.utime(file_path, None) 133 | 134 | 135 | def get_ip_prefixes_from_config(config, services, ip_version): 136 | """Build a set of IP prefixes found in service configuration files. 137 | 138 | Arguments: 139 | config (obg): A configparser object which holds our configuration. 140 | services (list): A list of section names which are the name of the 141 | service checks. 142 | ip_version (int): IP protocol version 143 | 144 | Returns: 145 | A set of IP prefixes. 146 | 147 | """ 148 | ip_prefixes = set() 149 | 150 | for service in services: 151 | ip_prefix = ipaddress.ip_network(config.get(service, 'ip_prefix')) 152 | if ip_prefix.version == ip_version: 153 | ip_prefixes.add(ip_prefix.with_prefixlen) 154 | 155 | return ip_prefixes 156 | 157 | 158 | def ip_prefixes_sanity_check(config, bird_configuration): 159 | """Sanity check on IP prefixes. 160 | 161 | Arguments: 162 | config (obg): A configparser object which holds our configuration. 163 | bird_configuration (dict): A dictionary, which holds Bird configuration 164 | per IP protocol version. 165 | 166 | """ 167 | for ip_version in bird_configuration: 168 | modify_ip_prefixes(config, 169 | bird_configuration[ip_version]['config_file'], 170 | bird_configuration[ip_version]['variable_name'], 171 | bird_configuration[ip_version]['dummy_ip_prefix'], 172 | bird_configuration[ip_version]['reconfigure_cmd'], 173 | bird_configuration[ip_version]['keep_changes'], 174 | bird_configuration[ip_version]['changes_counter'], 175 | ip_version) 176 | 177 | 178 | def modify_ip_prefixes( 179 | config, 180 | config_file, 181 | variable_name, 182 | dummy_ip_prefix, 183 | reconfigure_cmd, 184 | keep_changes, 185 | changes_counter, 186 | ip_version): 187 | """Modify IP prefixes in Bird configuration. 
188 | 189 | Depending on the configuration either removes or reports IP prefixes found 190 | in Bird configuration for which we don't have a service check associated 191 | with them. Moreover, it adds the dummy IP prefix if it isn't present and 192 | ensures that the correct variable name is set. 193 | 194 | Arguments: 195 | config (obg): A configparser object which holds our configuration. 196 | config_file (str): The file name of bird configuration 197 | variable_name (str): The name of the variable set in bird configuration 198 | dummy_ip_prefix (str): The dummy IP prefix, which must be always 199 | reconfigure_cmd (str): The command to run to trigger a reconfiguration 200 | on Bird daemon upon successful configuration update 201 | keep_changes (boolean): To enable keeping a history of changes applied 202 | to bird configuration 203 | changes_counter (int): The number of configuration changes to keep 204 | ip_version (int): IP protocol version of Bird configuration 205 | 206 | """ 207 | log = logging.getLogger(PROGRAM_NAME) 208 | services = config.sections() 209 | services.remove('daemon') # not needed during sanity check for IP-Prefixes 210 | update_bird_conf = False 211 | try: 212 | ip_prefixes_in_bird = get_ip_prefixes_from_bird(config_file) 213 | except OSError as error: 214 | log.error("failed to open Bird configuration %s, this is a FATAL " 215 | "error, thus exiting main program", error) 216 | sys.exit(1) 217 | 218 | _name = get_variable_name_from_bird(config_file) 219 | if _name is None: 220 | log.warning("failed to find variable name in %s, going to add it", 221 | config_file) 222 | update_bird_conf = True 223 | elif _name != variable_name: 224 | log.warning("found incorrect variable name in %s, going to add the " 225 | "correct one %s", _name, variable_name) 226 | update_bird_conf = True 227 | 228 | if dummy_ip_prefix not in ip_prefixes_in_bird: 229 | log.warning("dummy IP prefix %s is missing from bird configuration " 230 | "%s, adding it", 
dummy_ip_prefix, config_file) 231 | ip_prefixes_in_bird.insert(0, dummy_ip_prefix) 232 | update_bird_conf = True 233 | 234 | # Find IP prefixes in Bird configuration without a check. 235 | ip_prefixes_with_check = get_ip_prefixes_from_config( 236 | config, 237 | services, 238 | ip_version) 239 | # dummy_ip_prefix doesn't have a config by design 240 | ip_prefixes_with_check.add(dummy_ip_prefix) 241 | 242 | ip_prefixes_without_check = set(ip_prefixes_in_bird).difference( 243 | ip_prefixes_with_check) 244 | 245 | if ip_prefixes_without_check: 246 | if config.getboolean('daemon', 'purge_ip_prefixes'): 247 | log.warning("removing IP prefix(es) %s from %s because they don't " 248 | "have a service check configured", 249 | ','.join(ip_prefixes_without_check), 250 | config_file) 251 | ip_prefixes_in_bird[:] = (ip for ip in ip_prefixes_in_bird 252 | if ip not in ip_prefixes_without_check) 253 | update_bird_conf = True 254 | else: 255 | log.warning("found IP prefixes %s in %s without a service " 256 | "check configured", 257 | ','.join(ip_prefixes_without_check), 258 | config_file) 259 | 260 | if update_bird_conf: 261 | if keep_changes: 262 | archive_bird_conf(config_file, changes_counter) 263 | tempname = write_temp_bird_conf( 264 | dummy_ip_prefix, 265 | config_file, 266 | variable_name, 267 | ip_prefixes_in_bird 268 | ) 269 | try: 270 | os.rename(tempname, config_file) 271 | except OSError as error: 272 | msg = ("CRITICAL: failed to create Bird configuration {e}, " 273 | "this is FATAL error, thus exiting main program" 274 | .format(e=error)) 275 | sys.exit(f"{msg}") 276 | else: 277 | log.info("Bird configuration for IPv%s is updated", ip_version) 278 | reconfigure_bird(reconfigure_cmd) 279 | 280 | 281 | def load_configuration(config_file, config_dir, service_file): 282 | """Build configuration objects. 
283 | 284 | If all sanity checks against daemon and service check settings are passed 285 | then it builds a ConfigParser object which holds all our configuration 286 | and a dictionary data structure which holds Bird configuration per IP 287 | protocol version. 288 | 289 | Arguments: 290 | config_file (str): The file name which holds daemon settings 291 | config_dir (str): The directory name which has configuration files 292 | for each service check 293 | service_file (str): A file which contains configuration for a single 294 | service check 295 | 296 | Returns: 297 | A tuple with 1st element a ConfigParser object and 2nd element 298 | a dictionary. 299 | Raises: 300 | ValueError if a sanity check fails. 301 | 302 | """ 303 | config_files = [config_file] 304 | config = configparser.ConfigParser() 305 | config.read_dict(DEFAULT_OPTIONS) 306 | 307 | if not os.path.isfile(config_file): 308 | raise ValueError("{f} configuration file either isn't readable or " 309 | "doesn't exist".format(f=config_file)) 310 | if service_file is not None: 311 | if not os.path.isfile(service_file): 312 | raise ValueError("{f} configuration file for a service check " 313 | "doesn't exist".format(f=service_file)) 314 | else: 315 | config_files.append(service_file) 316 | elif config_dir is not None: 317 | if not os.path.isdir(config_dir): 318 | raise ValueError("{d} directory with configuration files for " 319 | "service checks doesn't exist" 320 | .format(d=config_dir)) 321 | else: 322 | config_files.extend(glob.glob(os.path.join(config_dir, '*.conf'))) 323 | 324 | try: 325 | config.read(config_files) 326 | except configparser.Error as exc: 327 | raise ValueError(exc) 328 | 329 | configuration_check(config) 330 | bird_configuration = build_bird_configuration(config) 331 | create_bird_config_files(bird_configuration) 332 | 333 | return config, bird_configuration 334 | 335 | 336 | def configuration_check(config): 337 | """Perform a sanity check on configuration. 
338 | 339 | First it performs a sanity check against settings for daemon 340 | and then against settings for each service check. 341 | 342 | Arguments: 343 | config (obj): A configparser object which holds our configuration. 344 | 345 | Returns: 346 | None if all checks are successfully passed otherwise raises a 347 | ValueError exception. 348 | 349 | """ 350 | log_level = config.get('daemon', 'loglevel') 351 | num_level = getattr(logging, log_level.upper(), None) 352 | pidfile = config.get('daemon', 'pidfile') 353 | 354 | # Catch the case where the directory, under which we store the pid file, is 355 | # missing. 356 | if not os.path.isdir(os.path.dirname(pidfile)): 357 | raise ValueError(f"{os.path.dirname(pidfile)} doesn't exist") 358 | 359 | if not isinstance(num_level, int): 360 | raise ValueError(f'Invalid log level: {log_level}') 361 | 362 | for _file in 'log_file', 'stderr_file': 363 | if config.has_option('daemon', _file): 364 | try: 365 | touch(config.get('daemon', _file)) 366 | except OSError as exc: 367 | raise ValueError(exc) 368 | 369 | for option, getter in DAEMON_OPTIONS_TYPE.items(): 370 | try: 371 | getattr(config, getter)('daemon', option) 372 | except configparser.NoOptionError as error: 373 | if option not in DAEMON_OPTIONAL_OPTIONS: 374 | raise ValueError(error) 375 | except configparser.Error as error: 376 | raise ValueError(error) 377 | except ValueError as exc: 378 | msg = ("invalid data for '{opt}' option in daemon section: {err}" 379 | .format(opt=option, err=exc)) 380 | raise ValueError(msg) 381 | 382 | service_configuration_check(config) 383 | 384 | 385 | def service_configuration_check(config): 386 | """Perform a sanity check against options for each service check. 387 | 388 | Arguments: 389 | config (obj): A configparser object which holds our configuration. 390 | 391 | Returns: 392 | None if all sanity checks are successfully passed otherwise raises a 393 | ValueError exception.
394 | 395 | """ 396 | ipv4_enabled = config.getboolean('daemon', 'ipv4') 397 | ipv6_enabled = config.getboolean('daemon', 'ipv6') 398 | services = config.sections() 399 | # we don't need it during sanity check for services check 400 | services.remove('daemon') 401 | ip_prefixes = [] 402 | 403 | for service in services: 404 | for option, getter in SERVICE_OPTIONS_TYPE.items(): 405 | try: 406 | getattr(config, getter)(service, option) 407 | except configparser.NoOptionError as error: 408 | if option not in SERVICE_OPTIONAL_OPTIONS: 409 | raise ValueError(error) 410 | except configparser.Error as error: 411 | raise ValueError(error) 412 | except ValueError as exc: 413 | msg = ("invalid data for '{opt}' option in service check " 414 | "{name}: {err}" 415 | .format(opt=option, name=service, err=exc)) 416 | raise ValueError(msg) 417 | 418 | if (config.get(service, 'on_disabled') != 'withdraw' and 419 | config.get(service, 'on_disabled') != 'advertise'): 420 | msg = ("'on_disabled' option has invalid value ({val}) for " 421 | "service check {name}, 'on_disabled option should be set " 422 | "either to 'withdraw' or to 'advertise'" 423 | .format(name=service, 424 | val=config.get(service, 'on_disabled'))) 425 | raise ValueError(msg) 426 | 427 | ip_prefixes.append(config.get(service, 'ip_prefix')) 428 | 429 | if not valid_ip_prefix(config.get(service, 'ip_prefix')): 430 | msg = ("invalid value ({val}) for 'ip_prefix' option in service " 431 | "check {name}. It should be an IP PREFIX in form of " 432 | "ip/prefixlen." 
433 | .format(name=service, val=config.get(service, 'ip_prefix'))) 434 | raise ValueError(msg) 435 | 436 | _ip_prefix = ipaddress.ip_network(config.get(service, 'ip_prefix')) 437 | if not ipv6_enabled and _ip_prefix.version == 6: 438 | raise ValueError("IPv6 support is disabled in " 439 | "anycast-healthchecker while there is an IPv6 " 440 | "prefix configured for {name} service check" 441 | .format(name=service)) 442 | if not ipv4_enabled and _ip_prefix.version == 4: 443 | raise ValueError("IPv4 support is disabled in " 444 | "anycast-healthchecker while there is an IPv4 " 445 | "prefix configured for {name} service check" 446 | .format(name=service)) 447 | 448 | cmd = shlex.split(config.get(service, 'check_cmd')) 449 | try: 450 | proc = subprocess.Popen(cmd) 451 | proc.kill() 452 | except (OSError, subprocess.SubprocessError) as exc: 453 | msg = ("failed to run check command '{cmd}' for service check " 454 | "{name}: {err}" 455 | .format(name=service, 456 | cmd=config.get(service, 'check_cmd'), 457 | err=exc)) 458 | raise ValueError(msg) 459 | 460 | occurrences_of_ip_prefixes = Counter(ip_prefixes) 461 | for ip_prefix, counter in occurrences_of_ip_prefixes.items(): 462 | if counter > 1: 463 | raise ValueError("{ip} is used by {c} service checks" 464 | .format(ip=ip_prefix, c=counter)) 465 | 466 | 467 | def build_bird_configuration(config): 468 | """Build bird configuration structure. 469 | 470 | First it performs a sanity check against bird settings and then builds a 471 | dictionary structure with bird configuration per IP version. 472 | 473 | Arguments: 474 | config (obj): A configparser object which holds our configuration. 475 | 476 | Returns: 477 | A dictionary 478 | 479 | Raises: 480 | ValueError if sanity check fails. 
481 | 482 | """ 483 | bird_configuration = {} 484 | 485 | if config.getboolean('daemon', 'ipv4'): 486 | if os.path.islink(config.get('daemon', 'bird_conf')): 487 | config_file = os.path.realpath(config.get('daemon', 'bird_conf')) 488 | print("'bird_conf' is set to a symbolic link ({s} -> {d}, but we " 489 | "will use the canonical path of that link" 490 | .format(s=config.get('daemon', 'bird_conf'), d=config_file)) 491 | else: 492 | config_file = config.get('daemon', 'bird_conf') 493 | 494 | dummy_ip_prefix = config.get('daemon', 'dummy_ip_prefix') 495 | if not valid_ip_prefix(dummy_ip_prefix): 496 | raise ValueError("invalid dummy IPv4 prefix: {i}" 497 | .format(i=dummy_ip_prefix)) 498 | 499 | bird_configuration[4] = { 500 | 'config_file': config_file, 501 | 'variable_name': config.get('daemon', 'bird_variable'), 502 | 'dummy_ip_prefix': dummy_ip_prefix, 503 | 'reconfigure_cmd': config.get('daemon', 'bird_reconfigure_cmd'), 504 | 'keep_changes': config.getboolean('daemon', 'bird_keep_changes'), 505 | 'changes_counter': config.getint('daemon', 'bird_changes_counter') 506 | } 507 | if config.getboolean('daemon', 'ipv6'): 508 | if os.path.islink(config.get('daemon', 'bird6_conf')): 509 | config_file = os.path.realpath(config.get('daemon', 'bird6_conf')) 510 | print("'bird6_conf' is set to a symbolic link ({s} -> {d}, but we " 511 | "will use the canonical path of that link" 512 | .format(s=config.get('daemon', 'bird6_conf'), d=config_file)) 513 | else: 514 | config_file = config.get('daemon', 'bird6_conf') 515 | 516 | dummy_ip_prefix = config.get('daemon', 'dummy_ip6_prefix') 517 | if not valid_ip_prefix(dummy_ip_prefix): 518 | raise ValueError("invalid dummy IPv6 prefix: {i}" 519 | .format(i=dummy_ip_prefix)) 520 | bird_configuration[6] = { 521 | 'config_file': config_file, 522 | 'variable_name': config.get('daemon', 'bird6_variable'), 523 | 'dummy_ip_prefix': dummy_ip_prefix, 524 | 'reconfigure_cmd': config.get('daemon', 'bird6_reconfigure_cmd'), 525 | 
'keep_changes': config.getboolean('daemon', 'bird6_keep_changes'), 526 | 'changes_counter': config.getint('daemon', 'bird6_changes_counter') 527 | } 528 | 529 | return bird_configuration 530 | 531 | 532 | def get_variable_name_from_bird(bird_conf): 533 | """Return the variable name set in Bird configuration. 534 | 535 | The variable name in Bird configuration is set with the keyword 'define', 536 | here is an example: 537 | 538 | define ACAST_PS_ADVERTISE = 539 | 540 | and we exract the string between the word 'define' and the equals sign. 541 | 542 | Arguments: 543 | bird_conf (str): The absolute file name path of Bird configuration. 544 | 545 | Returns: 546 | The variable name as a string or None if it isn't found. 547 | 548 | """ 549 | bird_variable_pattern = re.compile( 550 | r''' 551 | ^\s* 552 | define\s+ 553 | (?P\S+\b) 554 | \s+ 555 | = 556 | ''', re.VERBOSE 557 | ) 558 | 559 | with open(bird_conf, 'r') as content: 560 | for line in content.readlines(): 561 | variable_match = bird_variable_pattern.search(line) 562 | if variable_match: 563 | return variable_match.group('name') 564 | 565 | return None 566 | 567 | 568 | def create_bird_config_files(bird_configuration): 569 | """Create bird configuration files per IP version. 570 | 571 | Creates bird configuration files if they don't exist. It also creates the 572 | directories where we store the history of changes, if this functionality is 573 | enabled. 574 | 575 | Arguments: 576 | bird_configuration (dict): A dictionary with settings for bird. 577 | 578 | Returns: 579 | None 580 | 581 | Raises: 582 | ValueError if we can't create bird configuration files and the 583 | directory to store the history of changes in bird configuration file. 584 | 585 | """ 586 | for ip_version in bird_configuration: 587 | # This creates the file if it doesn't exist. 
def create_bird_config_files(bird_configuration):
    """Create bird configuration files per IP version.

    Creates bird configuration files if they don't exist. It also creates the
    directories where we store the history of changes, if this functionality is
    enabled.

    Arguments:
        bird_configuration (dict): A dictionary with settings for bird. Every
        value is a dictionary which holds at least the keys 'config_file' and
        'keep_changes'.

    Returns:
        None

    Raises:
        ValueError if we can't create bird configuration files and the
        directory to store the history of changes in bird configuration file.
        The OSError that triggered the failure is chained as the cause.

    """
    for settings in bird_configuration.values():
        # This creates the file if it doesn't exist.
        config_file = settings['config_file']
        try:
            touch(config_file)
        except OSError as exc:
            raise ValueError("failed to create {f}:{e}"
                             .format(f=config_file, e=exc)) from exc

        if not settings['keep_changes']:
            continue

        # The history lives next to the configuration file.
        history_dir = os.path.join(os.path.dirname(config_file), 'history')
        try:
            os.mkdir(history_dir)
        except FileExistsError:
            # Left over from a previous run, nothing to do.
            pass
        except OSError as exc:
            raise ValueError("failed to make directory {d} for keeping a "
                             "history of changes for {b}:{e}"
                             .format(d=history_dir, b=config_file, e=exc)
                             ) from exc
        else:
            print(f"{history_dir} is created")


def running(processid):
    """Check the validity of a process ID.

    Arguments:
        processid (int): Process ID number.

    Returns:
        True if process ID is found otherwise False. A process that exists
        but which we aren't allowed to signal counts as found.

    Notice:
        It exits main program if the process ID is too large to be passed to
        the operating system.

    """
    try:
        # From kill(2)
        # If sig is 0 (the null signal), error checking is performed but no
        # signal is actually sent. The null signal can be used to check the
        # validity of pid
        os.kill(processid, 0)
    except OverflowError as exc:
        print("checking validity of pid ({p}) failed with: {e}"
              .format(p=processid, e=exc))
        sys.exit(1)
    except PermissionError:
        # EPERM is only returned for an existing process which is owned by
        # someone else, thus the process ID is valid.
        return True
    except OSError:
        return False
    else:
        return True
def get_ip_prefixes_from_bird(filename):
    """Collect the IP prefixes listed in a Bird configuration file.

    Arguments:
        filename (str): The absolute path of the Bird configuration file.

    Notes:
        It can only parse a file with the following format

            define ACAST_PS_ADVERTISE =
                [
                    10.189.200.155/32,
                    10.189.200.255/32
                ];

    Returns:
        A list of IP prefixes, in the order they appear in the file.

    """
    with open(filename, 'r') as bird_conf:
        content = bird_conf.read()

    # Every entry sits on its own line, surrounded by blanks and optionally
    # followed by a comma. Lines which aren't a prefix are dropped.
    candidates = (entry.strip(', ') for entry in content.splitlines())

    return [entry for entry in candidates if valid_ip_prefix(entry)]


class BaseOperation:
    """Hold what is needed to run an operation on a list of IP prefixes.

    Arguments:
        name (string): The name of the service for the given ip_prefix
        ip_prefix (string): The value to run the operation
        ip_version: IP protocol version
        bird_reconfigure_cmd (string): A custom command to trigger
        reconfiguration to Bird Daemon.
        bird_reconfigure_timeout (float): Maximum time to wait for command
        to complete.
    """

    def __init__(
            self,
            name,
            ip_prefix,
            ip_version,
            bird_reconfigure_cmd,
            bird_reconfigure_timeout):  # noqa:D102
        self.log = logging.getLogger(PROGRAM_NAME)
        self.name = name
        self.ip_prefix = ip_prefix
        self.ip_version = ip_version
        self.bird_reconfigure_cmd = bird_reconfigure_cmd
        self.bird_reconfigure_timeout = bird_reconfigure_timeout


class AddOperation(BaseOperation):
    """Add a value to a list."""

    def __str__(self):
        """Return a short label for the operation."""
        return 'add to'

    def update(self, prefixes):
        """Append our IP prefix to the list unless it is already there.

        Arguments:
            prefixes(list): A list to add the value

        Returns:
            True if the list was modified otherwise False.
        """
        if self.ip_prefix in prefixes:
            return False

        prefixes.append(self.ip_prefix)
        self.log.info("announcing %s for %s", self.ip_prefix, self.name)

        return True
class DeleteOperation(BaseOperation):
    """Remove a value from a list."""

    def __str__(self):
        """Return a short label for the operation."""
        return 'delete from'

    def update(self, prefixes):
        """Drop our IP prefix from the list if it is present.

        Arguments:
            prefixes(list): A list to remove the value

        Returns:
            True if the list was modified otherwise False.
        """
        try:
            prefixes.remove(self.ip_prefix)
        except ValueError:
            # Our IP prefix isn't in the list, nothing to withdraw.
            return False

        self.log.info("withdrawing %s for %s", self.ip_prefix, self.name)

        return True
def reconfigure_bird(cmd, timeout=2):
    """Reconfigure BIRD daemon.

    Arguments:
        cmd (string): A command to trigger a reconfiguration of Bird daemon
        timeout (float): Maximum time in seconds to wait for the command to
        complete.

    Returns:
        None, the outcome is only reported in the log.

    Notes:
        Runs 'birdc configure' to reconfigure BIRD. Some useful information on
        how birdc tool works:
            -- Returns a non-zero exit code only when it can't access BIRD
               daemon via the control socket (/var/run/bird.ctl). This happens
               when BIRD daemon is either down or when the caller of birdc
               doesn't have access to the control socket.
            -- Returns zero exit code when reconfigure fails due to invalid
               configuration. Thus, we catch this case by looking at the output
               and not at the exit code.
            -- Returns zero exit code when reconfigure was successful.
            -- Should never timeout, if it does then it is a bug.

    """
    log = logging.getLogger(PROGRAM_NAME)
    cmd = shlex.split(cmd)
    if not cmd:
        # An empty argument list can't be executed, report it in the same
        # way as any other failure to start the command.
        log.error("reconfiguring BIRD failed with: empty command")
        return

    log.info("reconfiguring BIRD by running %s", ' '.join(cmd))
    try:
        output = subprocess.check_output(
            cmd,
            timeout=timeout,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except subprocess.TimeoutExpired:
        log.error("reconfiguring bird timed out")
        return
    except subprocess.CalledProcessError as error:
        # birdc returns 0 even when it fails due to invalid config,
        # but it returns 1 when BIRD is down.
        log.error("reconfiguring BIRD failed, either BIRD daemon is down or "
                  "we don't have privileges to reconfigure it (sudo problems?)"
                  ":%s", error.output.strip())
        return
    except OSError as error:
        # The command could not be started at all: it is missing, it isn't
        # executable and so on.
        log.error("reconfiguring BIRD failed with: %s", error)
        return

    # 'Reconfigured' string will be in the output if and only if conf is valid.
    if re.search('^Reconfigured$', output, re.MULTILINE):
        log.info('reconfigured BIRD daemon')
    else:
        # We will end up here only if we generated an invalid conf
        # or someone broke bird.conf.
        log.error("reconfiguring BIRD returned error, most likely we generated"
                  " an invalid configuration file or Bird configuration in is "
                  "broken:%s", output)
def write_temp_bird_conf(dummy_ip_prefix,
                         config_file,
                         variable_name,
                         prefixes):
    """Dump the list of IP-Prefixes to a temporary file.

    A failure to create and write the temporary file will exit main program.

    Arguments:
        dummy_ip_prefix (str): The dummy IP prefix, it is only mentioned in
        the comment written at the top of the file
        config_file (str): The file name of bird configuration
        variable_name (str): The name of the variable set in bird configuration
        prefixes (list): The list of IP-Prefixes to write

    Returns:
        The filename of the temporary file

    """
    log = logging.getLogger(PROGRAM_NAME)

    # the temporary file must be on the same filesystem as the bird config
    # as we use os.rename to perform an atomic update on the bird config.
    # Thus, we create it in the same directory that bird config is stored.
    tm_file = os.path.join(os.path.dirname(config_file), str(time.time()))
    log.debug("going to write to %s", tm_file)

    indent = 4 * ' '
    try:
        with open(tm_file, 'w') as tmpf:
            # all entries of the array need a trailing comma except the last
            # one. A single element array doesn't need a trailing comma.
            entries = ',\n'.join(2 * indent + prefix for prefix in prefixes)
            tmpf.writelines([
                f"# Generated {datetime.datetime.now()} by {PROGRAM_NAME} "
                f"(pid={os.getpid()})\n",
                f"# {dummy_ip_prefix} is a dummy IP Prefix. It should NOT be "
                "used and REMOVED from the constant.\n",
                f"define {variable_name} =\n",
                f"{indent}[\n",
                entries,
                f"\n{indent}];\n",
            ])
    except OSError as error:
        log.critical("failed to write temporary file %s: %s. This is a FATAL "
                     "error, this exiting main program", tm_file, error)
        sys.exit(1)

    return tm_file
def archive_bird_conf(config_file, changes_counter):
    """Keep a history of Bird configuration files.

    A copy of the configuration file is stored in the 'history' directory,
    which lives next to the configuration file. The copy is named after the
    current time. Before the copy is made the oldest files are removed, so
    that the history never holds more than changes_counter files.

    Arguments:
        config_file (str): file name of bird configuration
        changes_counter (int): number of configuration files to keep in the
        history

    Notice:
        Failures to list, remove or copy files are logged and never raised.
    """
    log = logging.getLogger(PROGRAM_NAME)
    history_dir = os.path.join(os.path.dirname(config_file), 'history')
    dst = os.path.join(history_dir, str(time.time()))
    log.debug("coping %s to %s", config_file, dst)

    try:
        history = [x for x in os.listdir(history_dir)
                   if os.path.isfile(os.path.join(history_dir, x))]
    except OSError as exc:
        log.warning("failed to list %s: %s", history_dir, exc)
        history = []

    # We are about to add one file, so only changes_counter - 1 of the
    # existing ones can stay. File names are timestamps, a reverse sort puts
    # the most recent files first.
    keep = max(changes_counter - 1, 0)
    stale = sorted(history, reverse=True)[keep:]
    if stale:
        log.info("threshold of %s is reached, removing old files",
                 changes_counter)
        for _file in stale:
            _path = os.path.join(history_dir, _file)
            try:
                os.remove(_path)
            except OSError as exc:
                log.warning("failed to remove %s: %s", _file, exc)
            else:
                log.info("removed %s", _path)

    try:
        shutil.copy2(config_file, dst)
    except OSError as exc:
        log.warning("failed to copy %s to %s: %s", config_file, dst, exc)
def update_pidfile(pidfile):
    """Make the pidfile point to this process.

    Notice:
        We should call this function only after we have successfully acquired
        a lock and never before. It exits main program if it fails to parse
        and/or write pidfile.

    Arguments:
        pidfile (str): pidfile to update

    """
    try:
        with open(pidfile, mode='r') as handle:
            content = handle.read(1024).rstrip()

        try:
            old_pid = int(content)
        except ValueError:
            old_pid = None

        if old_pid is None:
            print(f"cleaning stale pidfile with invalid data:'{content}'")
            write_pid(pidfile)
        elif running(old_pid):
            # This is to catch migration issues from 0.7.x to 0.8.x
            # version, where old process is still around as it failed to
            # be stopped. Since newer version has a different locking
            # mechanism, we can end up with both versions running.
            # In order to avoid this situation we refuse to startup.
            sys.exit(f"process {old_pid} is already running")
        else:
            # pidfile exists with a PID for a process that is not running.
            # Let's update PID.
            print(f"updating stale processID({old_pid}) in pidfile")
            write_pid(pidfile)
    except FileNotFoundError:
        # Either it's 1st time we run or previous run was terminated
        # successfully.
        print(f"creating pidfile {pidfile}")
        write_pid(pidfile)
    except OSError as exc:
        sys.exit(f"failed to update pidfile:{exc}")


def write_pid(pidfile):
    """Store the ID of this process in the pidfile.

    Notice:
        It exits main program if it fails to write pidfile.

    Arguments:
        pidfile (str): pidfile to update

    """
    own_pid = str(os.getpid())
    try:
        with open(pidfile, mode='w') as handle:
            print(f"writing processID {own_pid} to pidfile")
            handle.write(own_pid)
    except OSError as exc:
        sys.exit(f"failed to write pidfile:{exc}")


def shutdown(pidfile, signalnb=None, frame=None):
    """Remove the pidfile and exit the program.

    Notice:
        We should register this function as signal handler for the following
        termination signals:
            SIGHUP
            SIGTERM
            SIGABRT
            SIGINT

    Arguments:
        pidfile (str): pidfile to remove
        signalnb (int): The ID of signal
        frame (obj): Frame object at the time of receiving the signal

    """
    log = logging.getLogger(PROGRAM_NAME)
    log.info("received %s at %s", signalnb, frame)
    log.info("going to remove pidfile %s", pidfile)
    # no point to catch possible errors when we delete the pid file
    os.unlink(pidfile)
    log.info('shutdown is complete')
    sys.exit(0)
def setup_logger(config):
    """Prepare the logger of the program and the destination of STDERR.

    Notice:
        By default logging will go to STDOUT and messages for unhandled
        exceptions or crashes will go to STDERR. If log_file and/or log_server
        is set then we don't log to STDOUT. Messages for unhandled exceptions
        or crashes can only go to either STDERR or to stderr_file or to
        stderr_log_server.

    Arguments:
        config (obj): A configparser object which holds our configuration.

    Returns:
        A logger with all possible handlers configured.

    """
    logger = logging.getLogger(PROGRAM_NAME)
    level_name = config.get('daemon', 'loglevel')
    logger.setLevel(
        getattr(logging, level_name.upper(), None)  # pylint: disable=no-member
    )

    # The thread name column is one character wider than the longest section
    # name of the configuration.
    width = max(len(section) for section in config) + 1

    json_keys = ('asctime', 'levelname', 'process', 'threadName', 'message')
    json_formatter = CustomJsonFormatter(
        ' '.join(f'%({key:s})' for key in json_keys),
        prefix=PROGRAM_NAME + ': ',
    )
    formatter = logging.Formatter(
        f'%(asctime)s {PROGRAM_NAME}[%(process)d] %(levelname)-8s '
        f'%(threadName)-{width}s %(message)s'
    )

    # Register logging handlers based on configuration.
    log_to_file = config.has_option('daemon', 'log_file')
    log_to_server = config.has_option('daemon', 'log_server')

    if log_to_file:
        file_handler = logging.handlers.RotatingFileHandler(
            config.get('daemon', 'log_file'),
            maxBytes=config.getint('daemon', 'log_maxbytes'),
            backupCount=config.getint('daemon', 'log_backups')
        )
        file_handler.setFormatter(
            json_formatter if config.getboolean('daemon', 'json_log_file')
            else formatter
        )
        logger.addHandler(file_handler)

    if log_to_server:
        server_address = (
            config.get('daemon', 'log_server'),
            config.getint('daemon', 'log_server_port'),
        )
        udp_handler = logging.handlers.SysLogHandler(server_address)
        udp_handler.setFormatter(
            json_formatter if config.getboolean('daemon', 'json_log_server')
            else formatter
        )
        logger.addHandler(udp_handler)

    # Log to STDOUT if and only if log_file and log_server aren't enabled
    if not (log_to_file or log_to_server):
        stream_handler = logging.StreamHandler()
        if config.getboolean('daemon', 'json_stdout'):
            stream_formatter = json_formatter
        elif config.getboolean('daemon', 'log_format_journalctl'):
            stream_formatter = JournalFormatter('%(threadName)s - %(message)s')
        else:
            stream_formatter = formatter
        stream_handler.setFormatter(stream_formatter)
        logger.addHandler(stream_handler)

    # We can redirect STDERR only to one destination.
    if config.has_option('daemon', 'stderr_file'):
        sys.stderr = CustomRotatingFileLogger(
            filepath=config.get('daemon', 'stderr_file'),
            maxbytes=config.getint('daemon', 'log_maxbytes'),
            backupcount=config.getint('daemon', 'log_backups')
        )
    elif config.has_option('daemon', 'stderr_log_server'):
        # The messages go to the server which is set in log_server.
        sys.stderr = CustomUdpLogger(
            server=config.get('daemon', 'log_server'),
            port=config.getint('daemon', 'log_server_port')
        )
    else:
        print('messages for unhandled exceptions will go to STDERR')

    return logger
class CustomLogger:
    """File-like object which forwards what is written to a logging handler.

    It wraps a Logger class into a file like object, which provides a handy
    way to redirect STDOUT or STDERR to a logger. This class provides the
    necessary methods (write, flush and close) to build a file-like object
    and it can not be used directly as it does not provide a logging handler.
    Instead, you must instantiate one of subclass (CustomRotatingFileLogger and
    CustomUdpLogger).

    Arguments
        handler (obj): A logging handler to use.

    Methods:
        write(string): Write string to logger with newlines removed.
        flush(): Flush logger messages.
        close(): Close logger.

    Returns:
        A logger object.

    """

    def __init__(self, handler):
        """Attach the handler to the logger named 'stderr'."""
        self.handler = handler
        self.handler.setFormatter(logging.Formatter(
            f'%(asctime)s {PROGRAM_NAME}[%(process)d] '
            '%(threadName)s %(message)s'
        ))
        self.logger = logging.getLogger('stderr')
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(self.handler)

    def write(self, string):
        """Log the string at critical level without trailing whitespace."""
        text = string.rstrip()
        if not text:  # Don't log empty lines
            return
        self.logger.critical(text)

    def flush(self):
        """Flush logger's data."""
        # In case multiple handlers are attached to the logger make sure they
        # are flushed.
        for attached in self.logger.handlers:
            attached.flush()

    def close(self):
        """Close the handlers of the logger."""
        # In case multiple handlers are attached to the logger make sure they
        # are all closed.
        for attached in self.logger.handlers:
            attached.close()
class CustomRotatingFileLogger(CustomLogger):
    """CustomLogger which writes to a rotating log file.

    The rotation of log file is enabled by default.

    Usage:
        >>> import sys
        >>> sys.stderr = CustomRotatingFileLogger(filepath='/foo.log')

    Arguments:
        filepath (str)   : file path of the log file
        maxbytes (int)   : log file is rotated when it grows bigger than maxbytes
        backupcount (int): maximum rotated log file to keep.

    Notice:
        If maxbytes is zero, rollover never occurs and we may fill up the disk.
        An external program (logrotate) must not rotate the file in this case,
        as the file descriptor for the log file will be changed without us
        knowing this.

    Returns:
        A logger object.

    """

    def __init__(self, filepath, *, maxbytes=10485, backupcount=8):
        """Build the rotating file handler and hand it over to the parent."""
        rotating_handler = logging.handlers.RotatingFileHandler(
            filepath,
            maxBytes=maxbytes,
            backupCount=backupcount,
        )
        super().__init__(handler=rotating_handler)


class CustomUdpLogger(CustomLogger):
    """CustomLogger which sends messages to a syslog server.

    Usage:
        >>> import sys
        >>> sys.stderr = CustomUdpLogger(server='127.0.0.1')

    Arguments
        server (str): UDP server name or IP address.
        port (int): Port number.

    Returns:
        A logger object.

    """

    def __init__(self, server='127.0.0.1', port=514):
        """Build the syslog handler and hand it over to the parent."""
        address = (server, port)
        super().__init__(handler=logging.handlers.SysLogHandler(address))


class CustomJsonFormatter(jsonlogger.JsonFormatter):
    """Customize the Json Formatter."""

    def process_log_record(self, log_record):
        """Add version and program keys and rename threadName key.

        The value of threadName is moved under the key service_name, which
        is set to None when the record has no threadName.
        """
        log_record.update({
            "version": __version__,
            "program": PROGRAM_NAME,
            "service_name": log_record.pop('threadName', None),
        })

        return log_record
class JournalFormatter(logging.Formatter):
    """Format logs in a way that journald can interpret severity levels."""

    # Logging level -> number which is put in front of the message.
    SEVERITY_MAP = {
        logging.DEBUG: 7,     # DEBUG
        logging.INFO: 6,      # INFO
        logging.WARNING: 4,   # WARNING
        logging.ERROR: 3,     # ERR
        logging.CRITICAL: 2,  # CRIT
    }

    def format(self, record):
        """Return the formatted record prefixed with '<severity>'.

        Levels which aren't listed in SEVERITY_MAP get severity 6.
        """
        severity = self.SEVERITY_MAP.get(record.levelno, 6)

        # Adding journald's special PRIORITY field
        return "<{s}>{m}".format(s=severity, m=super().format(record))


class ServiceCheckDiedError(Exception):
    """Raised when a thread that runs service check dies.

    Arguments:
        name (str): the name of the thread
        trace (str): A trace, preferably the output of traceback.format_exc()
    """

    def __init__(self, name, trace):
        """Remember which thread died and why."""
        super().__init__()
        self.name = name
        self.trace = trace

    def __str__(self):
        """Report the name of the thread together with the trace."""
        return f"thread for service {self.name} died due to : {self.trace}"
def _terminate_custom_command(proc, log, grace=2):
    """Stop the process group of a timed out command and collect the child."""
    if proc.poll() is None:  # if process is still alive
        try:
            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
        except ProcessLookupError:
            # The command exited between the check and the kill.
            pass
        except PermissionError as exc:
            log.error("failed to terminate custom bird command: %s", exc)
        try:
            # Collect the exit status, otherwise the child stays around as
            # a zombie. Bounded, as the command may ignore SIGTERM.
            proc.wait(timeout=grace)
        except subprocess.TimeoutExpired:
            log.error("custom bird command is still running after SIGTERM")

    # Nobody is going to read the output anymore, release the pipes.
    for stream in (proc.stdout, proc.stderr):
        if stream is not None:
            stream.close()


def run_custom_bird_reconfigure(operation):
    """Reconfigure BIRD daemon by running a custom command.

    It adds one argument to the command, either "up" or "down".
    If command times out then we kill it. In order to avoid leaving any orphan
    processes, that may have been started by the command, we start a new
    session when we invoke the command and then we kill process group of that
    session.

    Arguments:
        operation (obj): Either a AddOperation or DeleteOperation object.

    Returns:
        None, the outcome is only reported in the log.

    """
    log = logging.getLogger(PROGRAM_NAME)
    status = 'up' if isinstance(operation, AddOperation) else 'down'
    cmd = shlex.split(operation.bird_reconfigure_cmd + " " + status)
    log.info("reconfiguring BIRD by running custom command %s", ' '.join(cmd))
    try:
        proc = subprocess.Popen(cmd,
                                start_new_session=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        _, errs = proc.communicate(
            timeout=operation.bird_reconfigure_timeout
        )
    except OSError as exc:
        log.error("reconfiguring BIRD failed with: %s", exc)
    except subprocess.TimeoutExpired:
        log.error("reconfiguring bird timed out")
        # communicate() doesn't kill the command when the timeout expires,
        # we have to stop it and clean up ourselves.
        _terminate_custom_command(proc, log)
    else:
        if proc.returncode != 0:
            log.error("reconfiguring BIRD failed with return code: %s and "
                      "stderr: %s", proc.returncode, errs)
        else:
            log.info("custom command successfully reconfigured Bird")
class MainExporter(Thread):
    """Dump Prometheus metrics of the program to a text file periodically.

    Arguments:
        registry (obj): The Prometheus registry where metrics are registered
        and which is written to the text file.
        services (list): The names of the configured service checks.
        config (obj): A configparser object which holds our configuration.

    Methods:
        run(): Run method of the thread.

    """

    def __init__(self, registry, services, config):
        """Register the metrics and configure the thread as a daemon."""
        super().__init__()
        self.daemon = True
        self.name = 'PrometheusExporter'
        self.registry = registry
        self.uptime = Gauge(
            name='uptime',
            namespace=f"{METRIC_PREFIX}",
            documentation='Uptime of the process in seconds',
            registry=registry
        )
        self.state = Gauge(
            name='state',
            namespace=f"{METRIC_PREFIX}",
            documentation='The current state of the process: 0 = down, 1 = up',
            registry=registry
        )
        self.info = Info(
            name='version',
            documentation='Version of the software',
            namespace=f"{METRIC_PREFIX}",
            registry=registry
        )
        self.metric_services = Gauge(
            name='service',
            namespace=f"{METRIC_PREFIX}",
            labelnames=['service_name', 'ip_prefix'],
            documentation='The configured service checks',
            registry=registry
        )
        self.services = services
        self.config = config
        self.startup_time = time.time()

    def run(self):
        """Update the metrics and write them to the text file, forever.

        The file is anycast_healthchecker.prom under the directory set in
        prometheus_collector_textfile_dir and it is written once per
        prometheus_exporter_interval seconds. A failure to write the file is
        logged and the loop carries on.
        """
        textfile = os.path.join(
            self.config.get(
                'daemon',
                'prometheus_collector_textfile_dir'
            ),
            "anycast_healthchecker.prom",
        )
        log = logging.getLogger(PROGRAM_NAME)
        interval = self.config.getint('daemon', 'prometheus_exporter_interval')
        start_offset = time.time() % interval
        # Go in a loop until we are told to stop
        while True:
            self.uptime.set(int(time.time() - self.startup_time))
            self.state.set(1)
            self.info.info({'version': __version__})
            for service in self.services:
                self.metric_services.labels(
                    service,
                    getattr(self.config, SERVICE_OPTIONS_TYPE['ip_prefix'])(
                        service, 'ip_prefix')
                ).set(1)

            try:
                write_to_textfile(path=textfile, registry=self.registry)
            except OSError as err:
                log.critical("failed to write metrics to %s:%s", textfile, err)
            else:
                # Only claim success when the file was really written.
                log.debug("dumped Prometheus metrics to %s", textfile)
            # calculate sleep time, we wake up at the same offset within
            # every interval regardless of how long the dump took.
            sleep = start_offset - time.time() % interval
            if sleep < 0:
                sleep += interval
            time.sleep(sleep)