├── tests ├── __init__.py ├── conftest.py ├── fixtures │ ├── __init__.py │ └── kube.py ├── test_config.py ├── test_kube.py ├── test_snapshots.py ├── test_deltas.py └── test_volume_from_pvc.py ├── k8s_snapshots ├── __init__.py ├── logging.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── digitalocean.py │ ├── aws.py │ └── google.py ├── serialize.py ├── context.py ├── events.py ├── config.py ├── errors.py ├── __main__.py ├── asyncutils.py ├── kube.py ├── rule.py ├── logconf.py ├── snapshot.py └── core.py ├── .github └── FUNDING.yml ├── .gitignore ├── examples ├── snapshotrule-volumeclaim.yml ├── snapshotrule-aws.yml ├── snapshotrule-google.yml └── backup-kops-etcd.yml ├── manifests ├── third-party-resource.yml ├── custom-resource-definition.yml └── rbac.yaml ├── docs ├── digitalocean.md ├── google-cloud.md └── aws.md ├── setup.py ├── Dockerfile ├── pyproject.toml ├── DEVELOPMENT.md ├── .circleci └── config.yml ├── LICENSE ├── CHANGES └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /k8s_snapshots/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | issuehunt: miracle2k/k8s-snapshots 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | build/ 3 | dist/ 4 | .cache/ 5 | .mypy_cache/ 6 | .vscode 7 | -------------------------------------------------------------------------------- /examples/snapshotrule-volumeclaim.yml: -------------------------------------------------------------------------------- 1 | apiVersion: "k8s-snapshots.elsdoerfer.com/v1" 2 | kind: SnapshotRule 3 | metadata: 4 | name: mysql 5 | spec: 6 | deltas: P1D P30D 7 | persistentVolumeClaim: my-mysql-disk 8 | -------------------------------------------------------------------------------- /manifests/third-party-resource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: ThirdPartyResource 3 | metadata: 4 | name: snapshot-rule.k8s-snapshots.elsdoerfer.com 5 | description: "Defines snapshot management rules for a disk." 6 | versions: 7 | - name: v1 8 | -------------------------------------------------------------------------------- /docs/digitalocean.md: -------------------------------------------------------------------------------- 1 | ### Configure access permissions on DigitalOcean 2 | 3 | To create volume snapshots on DigitalOcean, you have to provide the 4 | `DIGITALOCEAN_ACCESS_TOKEN` env var. 5 | 6 | Note that, DO limits the number of snapshots for a single volume to 25. 
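If you want to double-check the token outside the cluster first, here is a minimal sketch using the `python-digitalocean` library (already a dependency of this project); listing volumes is the same call the DigitalOcean backend itself makes:

```python
import os
import digitalocean

# Assumes the DIGITALOCEAN_ACCESS_TOKEN variable described above is exported;
# the DigitalOcean backend relies on this environment variable being set.
token = os.environ["DIGITALOCEAN_ACCESS_TOKEN"]

manager = digitalocean.Manager(token=token)
for volume in manager.get_all_volumes():
    print(volume.name, volume.id)
```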
7 | -------------------------------------------------------------------------------- /examples/snapshotrule-aws.yml: -------------------------------------------------------------------------------- 1 | apiVersion: "k8s-snapshots.elsdoerfer.com/v1" 2 | kind: SnapshotRule 3 | metadata: 4 | name: mysql 5 | spec: 6 | deltas: P1D P30D 7 | backend: aws 8 | disk: 9 | region: eu-west-1 10 | volumeId: vol-0aa6f44aad0daf9f2 11 | -------------------------------------------------------------------------------- /examples/snapshotrule-google.yml: -------------------------------------------------------------------------------- 1 | apiVersion: "k8s-snapshots.elsdoerfer.com/v1" 2 | kind: SnapshotRule 3 | metadata: 4 | name: mysql 5 | spec: 6 | deltas: P1D P30D 7 | backend: google 8 | disk: 9 | name: my-mysql-disk 10 | zone: europe-west1-c 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='k8s-snapshots', 5 | packages=find_packages(exclude=['tests']), 6 | entry_points={ 7 | 'console_scripts': [ 8 | 'k8s-snapshots=k8s_snapshots.__main__:main' 9 | ] 10 | } 11 | ) 12 | -------------------------------------------------------------------------------- /k8s_snapshots/logging.py: -------------------------------------------------------------------------------- 1 | import attr 2 | 3 | 4 | class Loggable: 5 | def __structlog__(self): 6 | if attr.has(self.__class__): 7 | return attr.asdict(self) 8 | 9 | if hasattr(self, 'to_dict') and callable(self.to_dict): 10 | return self.to_dict() 11 | 12 | return self 13 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from k8s_snapshots.logconf import configure_logging 4 | 5 | 6 | @pytest.fixture(scope='session', autouse=True) 7 | def configured_logging(): 8 | configure_logging( 9 | level_name='DEBUG', 10 | for_humans=True, 11 | ) 12 | 13 | from .fixtures import * # noqa 14 | from .fixtures.kube import * # noqa 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-alpine 2 | 3 | ADD . 
/app 4 | WORKDIR /app 5 | RUN apk add --no-cache --virtual .build_deps gcc musl-dev libffi-dev 6 | RUN pip3 install poetry 7 | RUN poetry config virtualenvs.create false 8 | RUN poetry install --no-dev 9 | RUN apk del .build_deps gcc musl-dev libffi-dev 10 | 11 | ENV TZ UTC 12 | 13 | CMD ["python", "-m", "k8s_snapshots"] 14 | -------------------------------------------------------------------------------- /manifests/custom-resource-definition.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1beta1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | name: snapshotrules.k8s-snapshots.elsdoerfer.com 5 | spec: 6 | group: k8s-snapshots.elsdoerfer.com 7 | version: v1 8 | scope: Namespaced 9 | names: 10 | plural: snapshotrules 11 | singular: snapshotrule 12 | kind: SnapshotRule 13 | shortNames: 14 | - sr 15 | -------------------------------------------------------------------------------- /examples/backup-kops-etcd.yml: -------------------------------------------------------------------------------- 1 | apiVersion: "k8s-snapshots.elsdoerfer.com/v1" 2 | kind: SnapshotRule 3 | metadata: 4 | name: etcd-main 5 | namespace: kube-system 6 | spec: 7 | deltas: P1D P30D 8 | backend: aws 9 | disk: 10 | region: eu-west-2 11 | volumeId: vol-0c9f96dd263e10067 12 | 13 | --- 14 | apiVersion: "k8s-snapshots.elsdoerfer.com/v1" 15 | kind: SnapshotRule 16 | metadata: 17 | name: etcd-events 18 | namespace: kube-system 19 | spec: 20 | deltas: P1D P30D 21 | backend: aws 22 | disk: 23 | region: eu-west-2 24 | volumeId: vol-070121e34012404fd 25 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import pytest 4 | 5 | from k8s_snapshots import errors 6 | from k8s_snapshots.context import Context 7 | from tests.fixtures.kube import make_resource, KUBE_CONFIG 8 | 9 | 10 | @pytest.fixture 11 | def fx_context(request): 12 | request.getfixturevalue('fx_mock_context_kube_config') 13 | request.getfixturevalue('fx_mock_context_kube_client') 14 | ctx = Context({ 15 | 'deltas_annotation_key': 'test.k8s-snapshots.example/deltas' 16 | }) 17 | return ctx 18 | 19 | 20 | @pytest.fixture 21 | def fx_deltas(request): 22 | return 'PT10S PT40S' 23 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import contextlib 3 | import datetime 4 | 5 | 6 | 7 | @contextlib.contextmanager 8 | def set_env(**environ): 9 | """ 10 | Temporarily set the process environment variables. 11 | 12 | >>> with set_env(PLUGINS_DIR=u'test/plugins'): 13 | ... 
"PLUGINS_DIR" in os.environ 14 | True 15 | 16 | >>> "PLUGINS_DIR" in os.environ 17 | False 18 | 19 | :type environ: dict[str, unicode] 20 | :param environ: Environment variables to set 21 | """ 22 | old_environ = dict(os.environ) 23 | os.environ.update(environ) 24 | try: 25 | yield 26 | finally: 27 | os.environ.clear() 28 | os.environ.update(old_environ) 29 | 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "k8s-snapshots" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Michael Elsdorfer "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.9" 9 | google-api-python-client = "^1.6.2" 10 | requests = "^2.27.1" 11 | pykube = "^0.14.0" 12 | tarsnapper = "^0.4.0" 13 | aiochannel = "^1.0.1" 14 | structlog = "^21.5.0" 15 | attrs = "^17.3.0" 16 | pendulum = "^0.8.0" 17 | confcollect = "^0.2.3" 18 | isodate = "^0.6.1" 19 | python-dateutil = "^2.6.0" 20 | aiohttp = "^3.5.4" 21 | aiostream = "^0.4.4" 22 | boto3 = "^1.21.10" 23 | yarl = "^1.1.1" 24 | python-digitalocean = "^1.15.0" 25 | 26 | [tool.poetry.dev-dependencies] 27 | 28 | [build-system] 29 | requires = ["poetry-core>=1.0.0"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | Development 2 | =========== 3 | 4 | For local development, you can still connect to an existing Google 5 | Cloud Project and Kubernetes cluster using the config options 6 | available. If you are lucky, your local workstation is already setup 7 | the way you need it. If we can find credentials for Google Cloud 8 | or Kubernetes, they will be used automatically. 9 | 10 | However, depending on the backend, you need to provide some options that 11 | otherwise would be read from the instance metadata: 12 | 13 | 14 | For AWS: 15 | 16 | $ AWS_REGION=eu-west-1 python -m k8s_snapshots 17 | 18 | 19 | For Google Cloud: 20 | 21 | $ GCLOUD_PROJECT=revolving-randy python -m k8s_snapshots 22 | 23 | 24 | ## Releasing a new version 25 | 26 | - Update CHANGES. 27 | - Tag with a v-prefix, which will cause a tag on Docker hub. 28 | 29 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: circleci/python:3.6.1 6 | 7 | working_directory: ~/k8s-snapshots 8 | 9 | steps: 10 | - checkout 11 | 12 | - restore_cache: 13 | keys: 14 | - v1-dependencies-{{ checksum "requirements.txt" }} 15 | - v1-dependencies- 16 | 17 | - run: 18 | name: install dependencies 19 | command: | 20 | python3 -m venv venv 21 | . venv/bin/activate 22 | pip install -r requirements.txt 23 | 24 | - save_cache: 25 | paths: 26 | - ./venv 27 | key: v1-dependencies-{{ checksum "requirements.txt" }} 28 | 29 | # run tests! 30 | - run: 31 | name: run tests 32 | command: | 33 | . 
venv/bin/activate 34 | pip install pytest 35 | py.test tests 36 | -------------------------------------------------------------------------------- /k8s_snapshots/backends/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | import pykube.objects 3 | from ..errors import ConfigurationError 4 | 5 | 6 | BACKENDS = ['google', 'aws', 'digitalocean'] 7 | 8 | 9 | def get_backends(): 10 | for name in BACKENDS: 11 | try: 12 | backend = import_module('k8s_snapshots.backends.%s' % name) 13 | except ImportError: 14 | continue 15 | yield name, backend 16 | 17 | 18 | def get_backend(name: str): 19 | try: 20 | return import_module('k8s_snapshots.backends.%s' % name) 21 | except ImportError as e: 22 | raise ConfigurationError(f'No such backed: "{name}"', error=e) 23 | 24 | 25 | def find_backend_for_volume(volume: pykube.objects.PersistentVolume): 26 | """ 27 | See if we have a provider that supports this volume. 28 | """ 29 | for name, backend in get_backends(): 30 | if backend.supports_volume(volume): 31 | return name, backend 32 | 33 | return None, None 34 | 35 | -------------------------------------------------------------------------------- /manifests/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: k8s-snapshots 6 | name: k8s-snapshots 7 | namespace: kube-system 8 | --- 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: ClusterRole 11 | metadata: 12 | labels: 13 | app: k8s-snapshots 14 | name: k8s-snapshots 15 | namespace: kube-system 16 | rules: 17 | - apiGroups: 18 | - "k8s-snapshots.elsdoerfer.com" 19 | resources: 20 | - snapshotrules 21 | verbs: 22 | - get 23 | - list 24 | - watch 25 | - apiGroups: 26 | - "" 27 | resources: 28 | - namespaces 29 | - pods 30 | - persistentvolumeclaims 31 | - persistentvolumes 32 | verbs: 33 | - get 34 | - list 35 | - watch 36 | --- 37 | apiVersion: rbac.authorization.k8s.io/v1 38 | kind: ClusterRoleBinding 39 | metadata: 40 | labels: 41 | app: k8s-snapshots 42 | name: k8s-snapshots 43 | namespace: kube-system 44 | roleRef: 45 | apiGroup: rbac.authorization.k8s.io 46 | kind: ClusterRole 47 | name: k8s-snapshots 48 | subjects: 49 | - kind: ServiceAccount 50 | name: k8s-snapshots 51 | namespace: kube-system 52 | 53 | -------------------------------------------------------------------------------- /tests/test_kube.py: -------------------------------------------------------------------------------- 1 | import pykube 2 | 3 | from k8s_snapshots import kube 4 | from tests.fixtures.kube import mock_kube 5 | 6 | 7 | def test_mock_kube(fx_context): 8 | n_resources = 5 9 | volume_names = [f'test-volume-{i}' for i in range(0, n_resources)] 10 | 11 | def _volume(name, namespace='default'): 12 | return pykube.objects.PersistentVolume( 13 | fx_context.kube_client(), 14 | { 15 | 'apiVersion': 'v1', 16 | 'kind': 'PersistentVolume', 17 | 'metadata': { 18 | 'name': name, 19 | }, 20 | } 21 | ) 22 | 23 | resources = [_volume(volume_name) for volume_name in volume_names] 24 | 25 | with mock_kube(resources) as _kube: 26 | for expected_resource, volume_name in zip(resources, volume_names): 27 | assert expected_resource.name == volume_name, \ 28 | 'Resources was not ceated properly' 29 | kube_resource = kube.get_resource_or_none_sync( 30 | fx_context.kube_client(), 31 | pykube.objects.PersistentVolume, 32 | name=volume_name 33 | ) 34 | assert kube_resource == expected_resource 35 | 
36 | assert len(kube_resource.name) 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Michael Elsdörfer 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials 14 | provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 26 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /k8s_snapshots/serialize.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import json 3 | from typing import ( 4 | TypeVar, 5 | Mapping, 6 | Sequence, 7 | Optional, Callable) 8 | 9 | import isodate 10 | import pendulum 11 | from structlog.processors import _json_fallback_handler 12 | 13 | Serializable = TypeVar( 14 | 'Serializable', 15 | int, 16 | float, 17 | str, 18 | bool, 19 | Mapping, 20 | Sequence, 21 | ) 22 | 23 | _DEFAULT_FALLBACK_PROCESSOR = _json_fallback_handler 24 | 25 | 26 | def dumps(*args, **kwargs): 27 | kwargs['default'] = Processor() 28 | return json.dumps(*args, **kwargs) 29 | 30 | 31 | class Processor: 32 | def __init__(self, fallback_processor=_DEFAULT_FALLBACK_PROCESSOR): 33 | self.fallback_processor = fallback_processor 34 | 35 | def __call__(self, obj): 36 | return process(obj, fallback_processor=self.fallback_processor) 37 | 38 | 39 | def process( 40 | obj, 41 | fallback_processor: Optional[ 42 | Callable[..., Serializable] 43 | ]=_DEFAULT_FALLBACK_PROCESSOR, 44 | ) -> Serializable: 45 | if isinstance(obj, timedelta): 46 | return isodate.duration_isoformat(obj) 47 | 48 | if isinstance(obj, pendulum.Pendulum): 49 | return obj.isoformat() 50 | 51 | if fallback_processor is not None: 52 | return fallback_processor(obj) 53 | 54 | raise TypeError( 55 | f'Cannot process object of type {type(obj)}, no fallback_processor ' 56 | f'provided' 57 | ) 58 | -------------------------------------------------------------------------------- /k8s_snapshots/context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pykube 3 | from importlib import import_module 4 | import structlog 5 | from .backends import get_backend 6 | 7 | 8 | _logger = structlog.get_logger() 9 | 10 | 
11 | class Context: 12 | def __init__(self, config=None): 13 | self.config = config 14 | self._kube_config = None 15 | self._backend = None 16 | 17 | @property 18 | def kube_config(self): 19 | if self._kube_config is None: 20 | self._kube_config = self.load_kube_config() 21 | 22 | return self._kube_config 23 | 24 | def get_backend(self): 25 | return get_backend(self.config.get('cloud_provider')) 26 | 27 | def load_kube_config(self): 28 | cfg = None 29 | 30 | kube_config_file = self.config.get('kube_config_file') 31 | 32 | if kube_config_file: 33 | _logger.info('kube-config.from-file', file=kube_config_file) 34 | cfg = pykube.KubeConfig.from_file(kube_config_file) 35 | 36 | if not cfg: 37 | # See where we can get it from. 38 | default_file = os.path.expanduser('~/.kube/config') 39 | if os.path.exists(default_file): 40 | _logger.info( 41 | 'kube-config.from-file.default', 42 | file=default_file) 43 | cfg = pykube.KubeConfig.from_file(default_file) 44 | 45 | # Maybe we are running inside Kubernetes. 46 | if not cfg: 47 | _logger.info('kube-config.from-service-account') 48 | cfg = pykube.KubeConfig.from_service_account() 49 | 50 | return cfg 51 | 52 | def kube_client(self): 53 | return pykube.HTTPClient(self.kube_config) 54 | 55 | 56 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | 2.0 (2017-08-26) 2 | ------------------ 3 | 4 | Adds support for cloud backends. 5 | 6 | IMPORTANT BREAKING CHANGES 7 | 8 | - The VOLUMES options has been replaced by a custom SnapshotRule 9 | resource. If you need to setup manual snapshot rules for disks not 10 | based on a PersistentVolume resource, see the readme for instructions. 11 | 12 | Other changes 13 | 14 | - Support an AWS backend. 15 | - Improve GCE disk detection; now based on actual data data, not on a 16 | provisioner label. 17 | - GLCOUD_PROJECT environment variable no longer required, is read from 18 | instance metadata. 19 | 20 | 21 | 1.0.1 (2017-08-16) 22 | ------------------ 23 | 24 | - Fix manual volume support via VOLUMES. 25 | - Fix race condition that caused PersistentVolume resources not be be 26 | watched. 27 | 28 | 29 | 1.0 (2017-08-10) 30 | ---------------- 31 | 32 | Vastly improved rewrite. Joar Wandborg contributed most of the changes 33 | in this release. 34 | 35 | 36 | IMPORTANT BREAKING CHANGES 37 | 38 | - *k8s-snapshots* now labels the snapshots it creates, and only looks 39 | at those snapshots that have this label when making decisions about 40 | when to create and delete snapshots. 41 | 42 | Thus, when you upgrade, all existing snapshots will be invisible to 43 | *k8s-snapshots*, and it will begin with a blank slate. 44 | 45 | We recommend that within the Google Cloud UI, you add the label to 46 | your existing snapshots you want the tool to consider. The label 47 | expected is: 48 | 49 | created-by=k8s-snapshots 50 | 51 | - The format used to define deltas has changed. It now uses ISO 8601 52 | durations, so the annotation string will look like this: 53 | 54 | backup.kubernetes.io/deltas: PT1H P30D P180D 55 | 56 | This is 1 hour, 30 days and 180 days. 
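  As a quick, illustrative check (not part of the tool itself), the same
  strings parse with the isodate library that k8s-snapshots uses internally:

      >>> import isodate
      >>> [isodate.parse_duration(d) for d in "PT1H P30D P180D".split()]
      [datetime.timedelta(seconds=3600), datetime.timedelta(days=30), datetime.timedelta(days=180)]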
57 | -------------------------------------------------------------------------------- /tests/test_snapshots.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from pendulum import Pendulum 3 | from unittest import TestCase 4 | 5 | from k8s_snapshots.backends.abstract import Snapshot 6 | from k8s_snapshots.context import Context 7 | from k8s_snapshots.rule import Rule 8 | from k8s_snapshots.snapshot import snapshots_for_rule_are_outdated 9 | 10 | 11 | class TestSnapshotsAreUpToDate(TestCase): 12 | TEST_DISK = 'test-disk' 13 | 14 | def setUp(self): 15 | self.mock_context = Context({}) 16 | self.rule = Rule( 17 | name='test_rule', 18 | deltas=[timedelta(hours=1), timedelta(days=30)], 19 | backend='test_backend', 20 | disk=self.TEST_DISK) 21 | 22 | def test_snapshot_is_required_without_existing_snapshots(self): 23 | assert snapshots_for_rule_are_outdated(self.rule, []) 24 | 25 | def test_snapshot_not_required_with_recent_snapshots(self): 26 | assert not snapshots_for_rule_are_outdated(self.rule, [ 27 | Snapshot( 28 | created_at=Pendulum.now('utc') - timedelta(minutes=59), 29 | name='snapshot-1', 30 | disk=self.TEST_DISK) 31 | ]) 32 | 33 | def test_snapshot_required_with_outdated_snapshot(self): 34 | assert snapshots_for_rule_are_outdated(self.rule, [ 35 | Snapshot( 36 | created_at=Pendulum.now('utc') - timedelta(hours=1, minutes=1), 37 | name='snapshot-1', 38 | disk=self.TEST_DISK) 39 | ]) 40 | 41 | def test_snapshot_required_with_snapshot_for_different_disk(self): 42 | assert snapshots_for_rule_are_outdated(self.rule, [ 43 | Snapshot( 44 | created_at=Pendulum.now('utc') - timedelta(minutes=5), 45 | name='snapshot-1', 46 | disk='some-other-disk') 47 | ]) 48 | -------------------------------------------------------------------------------- /docs/google-cloud.md: -------------------------------------------------------------------------------- 1 | ### Configure access permissions to Google Cloud 2 | 3 | If there are no default credentials to Kubernetes and the Cloud 4 | snapshot API, or the default credentials do not have the required 5 | access scope, you may need to configure these. 6 | 7 | 8 | 9 | 10 | 14 | 15 | 16 | 17 | 23 | 24 | 25 | 26 | 34 | 35 | 36 | 37 | 40 | 41 | 42 | 43 | 47 | 48 |
CLOUD_PROVIDER 11 | Set to 'google' to use the Google Cloud backend exclusively. 12 | If not set, it can be detected from the volume spec (gcePersistentDisk). 13 | </td> 14 | </tr> 15 | 16 | <tr> 17 | <td> 
GCLOUD_PROJECT 18 | Name of the Google Cloud project. This is required to use the Google 19 | Cloud API, but if it's not given, we try to read the value from 20 | the [instance metadata service](https://cloud.google.com/compute/docs/storing-retrieving-metadata) 21 | which will usually work. 22 |
GCLOUD_CREDENTIALS_FILE 27 | Filename of the JSON gcloud credentials file used to authenticate. 28 | You'll want to mount it into the container. 29 | By default set to 30 | ~/.config/gcloud/application_default_credentials.json. 31 | PyKube doesn't use env vars to locate the config, but 32 | GOOGLE_APPLICATION_CREDENTIALS takes precedence. 33 | </td> 34 | </tr> 35 | 36 | <tr> 37 | <td> 
GOOGLE_APPLICATION_CREDENTIALS 38 | The contents of the JSON keyfile that is used to authenticate. 39 |
KUBE_CONFIG_FILE 44 | Authentification with the Kubernetes API. By default, the 45 | pod service account is used. 46 |
49 | 50 | When using a service account with a custom role to access the Google Cloud API, the following permissions are required: 51 | ``` 52 | compute.disks.createSnapshot 53 | compute.snapshots.create 54 | compute.snapshots.delete 55 | compute.snapshots.get 56 | compute.snapshots.list 57 | compute.snapshots.setLabels 58 | compute.zoneOperations.get 59 | ``` -------------------------------------------------------------------------------- /k8s_snapshots/events.py: -------------------------------------------------------------------------------- 1 | """ 2 | Here is a collection of logging ``event```values that are expected to be kept 3 | more stable. 4 | 5 | These events are provided as a reference for external logging metric tools. 6 | """ 7 | import enum 8 | 9 | 10 | class EventEnum(enum.Enum): 11 | """ Base class for Event Enums """ 12 | pass 13 | 14 | 15 | @enum.unique 16 | class Annotation(EventEnum): 17 | """ 18 | Events related to 'deltas' annotations. 19 | """ 20 | FOUND = 'annotation.found' 21 | NOT_FOUND = 'annotation.not-found' 22 | ERROR = 'annotation.error' 23 | INVALID = 'annotation.invalid' 24 | 25 | 26 | @enum.unique 27 | class VolumeEvent(EventEnum): 28 | """ 29 | Events related to Kubernetes PersistentVolume and PersistentVolumeClaim 30 | resource events. 31 | """ 32 | RECEIVED = 'volume-event.received' 33 | 34 | 35 | @enum.unique 36 | class Volume(EventEnum): 37 | """ 38 | Events related to Kubernetes PersistentVolumes 39 | """ 40 | UNSUPPORTED = 'volume.unsupported' 41 | NOT_FOUND = 'volume.not-found' 42 | 43 | 44 | @enum.unique 45 | class Snapshot(EventEnum): 46 | """ 47 | Events related to snapshots. 48 | """ 49 | SCHEDULED = 'snapshot.scheduled' 50 | START = 'snapshot.start' 51 | ERROR = 'snapshot.error' 52 | CREATED = 'snapshot.created' 53 | EXPIRED = 'snapshot.expired' 54 | 55 | 56 | @enum.unique 57 | class Rule(EventEnum): 58 | """ 59 | Events related to snapshot Rule()s. 60 | """ 61 | PENDING = 'rule.pending' 62 | ADDED_FROM_CONFIG = 'rule.from-config' 63 | ADDED = 'rule.added' 64 | UPDATED = 'rule.updated' 65 | REMOVED = 'rule.removed' 66 | HEARTBEAT = 'rule.heartbeat' 67 | 68 | 69 | @enum.unique 70 | class Expiration(EventEnum): 71 | """ 72 | Events related to snapshot expiration. 73 | """ 74 | STARTED = 'expire.started' 75 | KEPT = 'expire.kept' 76 | DELETE = 'expire.delete' 77 | COMPLETE = 'expire.complete' 78 | 79 | 80 | @enum.unique 81 | class Ping(EventEnum): 82 | """ 83 | Events related to sending pings. 84 | """ 85 | SENT = 'ping.sent' 86 | -------------------------------------------------------------------------------- /docs/aws.md: -------------------------------------------------------------------------------- 1 | ### Configure access permissions on AWS 2 | 3 | To be able to create snapshots, on AWS our pod will need the following permissions: 4 | 5 | ```json 6 | { 7 | "Version": "2012-10-17", 8 | "Statement": [ 9 | { 10 | "Effect": "Allow", 11 | "Action": [ 12 | "ec2:DescribeAvailabilityZones", 13 | "ec2:CreateTags", 14 | "ec2:DescribeTags", 15 | "ec2:DescribeVolumeAttribute", 16 | "ec2:DescribeVolumeStatus", 17 | "ec2:DescribeVolumes", 18 | "ec2:CreateSnapshot", 19 | "ec2:DeleteSnapshot", 20 | "ec2:DescribeSnapshots" 21 | ], 22 | "Resource": "*" 23 | } 24 | ] 25 | } 26 | ``` 27 | 28 | If there are no default credentials injected into your nodes, or the default 29 | credentials do not have the required access scope, you may need to 30 | configure these environment variables: 31 | 32 | 33 | 34 | 35 | 38 | 39 | 40 | 41 | 44 | 45 | 46 | 47 | 50 | 51 |
AWS_ACCESS_KEY_ID 36 | AWS IAM Access Key ID that is used to authenticate. 37 |
AWS_SECRET_ACCESS_KEY 42 | AWS IAM Secret Access Key that is used to authenticate. 43 |
AWS_REGION 48 | The region is usually detected via the meta data service. You can override the value. 49 |
52 | 53 | 54 | ### A tip for kops users 55 | 56 | On older versions of kops, master nodes did have the permissions required. A solution there 57 | is to just run `k8s-snapshots` on a master node. 58 | 59 | To run on a Master, we need to: 60 | * [Overcome a Taint](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/) 61 | * [Specify that we require a Master](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) 62 | 63 | To do this, add the following to the above manifest for the k8s-snapshots 64 | Deployment: 65 | 66 | ``` 67 | spec: 68 | ... 69 | template: 70 | ... 71 | spec: 72 | ... 73 | tolerations: 74 | - key: "node-role.kubernetes.io/master" 75 | operator: "Equal" 76 | value: "" 77 | effect: "NoSchedule" 78 | nodeSelector: 79 | kubernetes.io/role: master 80 | ``` -------------------------------------------------------------------------------- /k8s_snapshots/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | import confcollect 4 | import structlog 5 | from k8s_snapshots.errors import ConfigurationError 6 | 7 | 8 | _logger = structlog.get_logger() 9 | 10 | 11 | DEFAULT_CONFIG = { 12 | #: Set to True to make logs more machine-readable 13 | 'json_log': False, 14 | #: If zero, prints one line of JSON per message, if set to a positive 15 | #: non-zero integer to get indented JSON output 16 | 'structlog_json_indent': 0, 17 | #: Anything [^a-z0-9-] will be replaced by '-', the timezone will always be 18 | #: UTC. 19 | 'snapshot_datetime_format': '%d%m%y-%H%M%S', 20 | 'log_level': 'INFO', 21 | 'kube_config_file': '', 22 | 'use_claim_name': False, 23 | 'ping_url': '', 24 | #: The key used when annotating PVs and PVCs with deltas 25 | 'deltas_annotation_key': 'backup.kubernetes.io/deltas', 26 | #: This label will be set on all snapshots created by k8s-snapshots 27 | 'snapshot_author_label': 'k8s-snapshots', 28 | 'snapshot_author_label_key': 'created-by', 29 | #: Number of seconds between Rule.HEARTBEAT events, ``0`` to disable. 30 | 'schedule_heartbeat_interval_seconds': 600, 31 | #: Turns debug mode on, not recommended in production 32 | 'debug': False, 33 | 34 | 'gcloud_project': '', 35 | 'gcloud_credentials_file': os.path.join( 36 | os.path.expanduser('~'), 37 | ".config/gcloud/application_default_credentials.json" 38 | ), 39 | 'google_application_credentials': '', 40 | 41 | 'aws_region': '' 42 | } 43 | 44 | 45 | def validate_config(config: Dict) -> bool: 46 | return True 47 | 48 | 49 | def from_environ_basic() -> Dict: 50 | config = DEFAULT_CONFIG.copy() 51 | config.update(confcollect.from_environ(by_defaults=DEFAULT_CONFIG)) 52 | # Backwards compatability 53 | if config.get('gcloud_json_keyfile_name') and not config.get('gcloud_credentials_file'): 54 | config['gcloud_credentials_file'] = config.get('gcloud_json_keyfile_name') 55 | if config.get('gcloud_json_keyfile_string') and not config.get('google_application_credentials'): 56 | config['google_application_credentials'] = config.get('gcloud_json_keyfile_string') 57 | 58 | return config 59 | 60 | 61 | def from_environ() -> Dict: 62 | config = from_environ_basic() 63 | 64 | if not validate_config(config): 65 | raise ConfigurationError( 66 | 'Invalid configuration. 
See log for more details', 67 | config=config 68 | ) 69 | 70 | return config 71 | -------------------------------------------------------------------------------- /tests/test_deltas.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import isodate 4 | import pytest 5 | 6 | import pendulum 7 | 8 | from k8s_snapshots import errors 9 | from k8s_snapshots.rule import parse_deltas 10 | 11 | 12 | @pytest.mark.parametrize( 13 | [ 14 | 'deltas', 15 | 'expected_timedeltas', 16 | ], 17 | [ 18 | pytest.param( 19 | 'PT1M P1M', 20 | [ 21 | isodate.Duration(minutes=1), 22 | isodate.Duration(months=1), 23 | ] 24 | ), 25 | pytest.param( 26 | 'P7D P1D', 27 | [ 28 | isodate.Duration(days=7), 29 | isodate.Duration(days=1), 30 | ] 31 | ), 32 | pytest.param( 33 | 'PT1M PT7.5H P1M P5W P1Y', 34 | [ 35 | isodate.Duration(minutes=1), 36 | isodate.Duration(hours=7.5), 37 | isodate.Duration(months=1), 38 | isodate.Duration(weeks=5), 39 | isodate.Duration(years=1), 40 | ], 41 | ), 42 | pytest.param( 43 | 'PT1D PT1D', 44 | [], 45 | marks=pytest.mark.xfail( 46 | reason='T may only be used before time-based values such as ' 47 | 'minute, hour, second', 48 | raises=errors.DeltasParseError, 49 | strict=True, 50 | ) 51 | ), 52 | pytest.param( 53 | 'PT1M', 54 | [], 55 | marks=pytest.mark.xfail( 56 | raises=errors.DeltasParseError, 57 | reason='Two deltas are required', 58 | strict=True, 59 | ) 60 | ), 61 | pytest.param( 62 | 'P1S P2S', 63 | [], 64 | marks=pytest.mark.xfail( 65 | raises=errors.DeltasParseError, 66 | reason='PT is required', 67 | strict=True 68 | ) 69 | ), 70 | pytest.param( 71 | 'pt2m', 72 | [], 73 | marks=pytest.mark.xfail( 74 | raises=errors.DeltasParseError, 75 | reason='ISO 8601 does not allow lowercase characters', 76 | strict=True 77 | ) 78 | ), 79 | pytest.param( 80 | None, 81 | [], 82 | marks=pytest.mark.xfail( 83 | raises=errors.DeltasParseError, 84 | reason='deltas is None', 85 | strict=True, 86 | ) 87 | ) 88 | ] 89 | ) 90 | def test_parse_deltas(deltas, expected_timedeltas): 91 | parsed_deltas = parse_deltas(deltas) 92 | assert parsed_deltas == expected_timedeltas 93 | -------------------------------------------------------------------------------- /k8s_snapshots/errors.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from typing import Dict, List, Iterable 3 | 4 | 5 | class StructuredError(Exception): 6 | def __init__(self, message=None, **data): 7 | self.message = message 8 | self.data = data 9 | 10 | def __str__(self): 11 | return f'{self.__class__.__qualname__}: {self.message} {self.data!r}' 12 | 13 | def __repr__(self): 14 | return f'<{self.__class__.__name__}: {self.message} ' \ 15 | f'data={self.data!r}>' 16 | 17 | def __structlog__(self): 18 | return self._serializable_exc() 19 | 20 | def _exc_chain(self) -> Iterable[Exception]: 21 | chain = [] # reverse chronological order 22 | exc = self 23 | 24 | while exc is not None: 25 | chain.append(exc) 26 | exc = exc.__cause__ 27 | 28 | return reversed(chain) 29 | 30 | def _serializable_exc(self) -> List[Dict]: 31 | def serialize_exc(exc: Exception) -> Dict: 32 | if isinstance(exc, StructuredError): 33 | return exc.to_dict() 34 | else: 35 | exc_type = exc.__class__ 36 | exc_tb = exc.__traceback__ 37 | return { 38 | 'type': exc_type.__qualname__, 39 | 'message': str(exc), 40 | 'readable': traceback.format_exception( 41 | exc_type, 42 | exc, 43 | exc_tb, 44 | chain=False 45 | ) 46 | } 47 | 48 | return 
[serialize_exc(exc) for exc in self._exc_chain()] 49 | 50 | def to_dict(self) -> Dict: 51 | return { 52 | 'type': self.__class__.__qualname__, 53 | 'message': self.message, 54 | 'data': self.data, 55 | 'readable': traceback.format_exception( 56 | self.__class__, 57 | self, 58 | self.__traceback__, 59 | chain=False 60 | ) 61 | } 62 | 63 | 64 | class ConfigurationError(StructuredError): 65 | """ Raised for invalid configuration """ 66 | pass 67 | 68 | 69 | class DeltasParseError(StructuredError): 70 | """ 71 | Raised for invalid delta strings 72 | 73 | - In configuration. 74 | - In PV or PVC annotations. 75 | """ 76 | pass 77 | 78 | 79 | class RuleDependsOn(StructuredError): 80 | pass 81 | 82 | 83 | class VolumeNotFound(StructuredError): 84 | pass 85 | 86 | 87 | class UnsupportedVolume(StructuredError): 88 | """ Raised for PersistentVolumes we can't snapshot """ 89 | pass 90 | 91 | 92 | class SnapshotCreateError(StructuredError): 93 | pass 94 | 95 | 96 | class AnnotationError(StructuredError): 97 | pass 98 | 99 | 100 | class AnnotationNotFound(AnnotationError): 101 | pass 102 | -------------------------------------------------------------------------------- /k8s_snapshots/__main__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | import signal 4 | import sys 5 | 6 | import confcollect 7 | import structlog 8 | 9 | import k8s_snapshots.config 10 | from k8s_snapshots.logconf import configure_from_config 11 | from k8s_snapshots.core import daemon 12 | 13 | 14 | def main(): 15 | # Read config initially just to setup logging 16 | configure_from_config(k8s_snapshots.config.from_environ_basic()) 17 | 18 | # Now with logging setup, read and validate the config. 19 | config = k8s_snapshots.config.from_environ() 20 | 21 | if config['debug']: 22 | sys.excepthook = debug_excepthook 23 | 24 | # Late import to keep module-level get_logger after configure_from_config 25 | _logger = structlog.get_logger(__name__) 26 | 27 | _logger.bind( 28 | gcloud_project=config['gcloud_project'], 29 | deltas_annotation_key=config['deltas_annotation_key'], 30 | ) 31 | 32 | loop = asyncio.get_event_loop() 33 | 34 | main_task = asyncio.ensure_future(daemon(config)) 35 | 36 | _log = _logger.new(loop=loop, main_task=main_task) 37 | 38 | def handle_signal(name, timeout=10): 39 | _log.info('Received signal', signal_name=name) 40 | 41 | if main_task.cancelled(): 42 | _log.info('main task already cancelled, forcing a quit') 43 | return 44 | 45 | _log.info( 46 | 'Cancelling main task', 47 | task_cancel=main_task.cancel() 48 | ) 49 | 50 | for sig_name in ['SIGINT', 'SIGTERM']: 51 | loop.add_signal_handler( 52 | getattr(signal, sig_name), 53 | functools.partial(handle_signal, sig_name)) 54 | 55 | loop.add_signal_handler(signal.SIGUSR1, print_tasks) 56 | 57 | try: 58 | loop.run_until_complete(main_task) 59 | except asyncio.CancelledError: 60 | _log.exception('main task cancelled') 61 | except Exception as exc: 62 | _log.exception('Unhandled exception in main task') 63 | raise 64 | finally: 65 | loop.run_until_complete(shutdown(loop=loop)) 66 | 67 | 68 | def debug_excepthook(exc_type, exc, exc_tb): 69 | import pdb 70 | loop = asyncio.get_event_loop() 71 | loop.stop() 72 | pdb.post_mortem(exc_tb) 73 | sys.__excepthook__(exc_type, exc, exc_tb) 74 | 75 | 76 | _shutdown = False 77 | 78 | 79 | async def shutdown(*, loop=None): 80 | _logger = structlog.get_logger() 81 | global _shutdown 82 | if _shutdown: 83 | _logger.warning('Already shutting down') 84 | return 85 
| 86 | _shutdown = True 87 | 88 | _logger.debug( 89 | 'shutting down', 90 | ) 91 | 92 | print_tasks() 93 | 94 | _logger.info('Shutdown complete') 95 | 96 | 97 | def print_tasks(): 98 | tasks = list(asyncio.all_tasks()) 99 | structlog.get_logger().debug('print tasks', tasks=tasks) 100 | 101 | 102 | if __name__ == '__main__': 103 | sys.exit(main() or 0) 104 | -------------------------------------------------------------------------------- /tests/test_volume_from_pvc.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import pytest 4 | import pykube 5 | 6 | from k8s_snapshots import errors 7 | from k8s_snapshots.core import volume_from_pvc 8 | from tests.fixtures import make_resource 9 | from tests.fixtures.kube import mock_kube 10 | 11 | PV_RESOURCE = make_resource( 12 | pykube.objects.PersistentVolume, 13 | 'test-pv', 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize( 18 | [ 19 | 'resource', # resource to get volume from 20 | 'resources', # resources in mocked kube 21 | 'expected_volume_index', # index in 'resources' for the expected volume 22 | ], 23 | [ 24 | pytest.param( 25 | make_resource( 26 | pykube.objects.PersistentVolumeClaim, 27 | 'test-pvc', 28 | spec={ 29 | 'volumeName': 'correct-pv' 30 | } 31 | ), 32 | [ 33 | make_resource( 34 | pykube.objects.PersistentVolume, 35 | 'incorrect-pv', 36 | ), 37 | make_resource( 38 | pykube.objects.PersistentVolume, 39 | 'correct-pv', 40 | ), 41 | ], 42 | 1, 43 | id='valid_from_volume_claim' 44 | ), 45 | pytest.param( 46 | make_resource( 47 | pykube.objects.PersistentVolumeClaim, 48 | 'test-pvc', 49 | spec={ 50 | 'volumeName': 'nonexistent-pv' 51 | } 52 | ), 53 | [ 54 | make_resource( 55 | pykube.objects.PersistentVolume, 56 | 'existing-but-different-pv' 57 | ) 58 | ], 59 | None, 60 | id='no_volume_for_claim', 61 | marks=pytest.mark.xfail( 62 | reason='Volume referred by claim\'s .spec.volumeName does not ' 63 | 'exist', 64 | raises=errors.VolumeNotFound, 65 | strict=True, 66 | ) 67 | ), 68 | pytest.param( 69 | make_resource( 70 | pykube.objects.PersistentVolumeClaim, 71 | 'claim-without-spec-volumename', 72 | ), 73 | [], 74 | None, 75 | id='claim_without_spec_volumeName', 76 | marks=pytest.mark.xfail( 77 | reason='Invalid claim spec, missing .spec.volumeName', 78 | raises=errors.VolumeNotFound, 79 | strict=True, 80 | ) 81 | ) 82 | ] 83 | ) 84 | def test_volume_from_resource( 85 | fx_context, 86 | resource, 87 | resources, 88 | expected_volume_index, 89 | ): 90 | loop = asyncio.get_event_loop() 91 | 92 | with mock_kube(resources): 93 | result = loop.run_until_complete( 94 | volume_from_pvc( 95 | ctx=fx_context, 96 | resource=resource, 97 | ) 98 | ) 99 | 100 | if expected_volume_index is not None: 101 | assert result == resources[expected_volume_index] 102 | -------------------------------------------------------------------------------- /k8s_snapshots/backends/abstract.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import pendulum 3 | from typing import Dict, List, NamedTuple, Any 4 | from ..context import Context 5 | import pykube.objects 6 | 7 | 8 | @enum.unique 9 | class SnapshotStatus(enum.Enum): 10 | PENDING = 'snapshot.pending' 11 | COMPLETE = 'snapshot.complete' 12 | 13 | 14 | # It's up to a backend to decide how a disk should be identified. 15 | # However, it does need to be something that is hashable, ideally 16 | # a tuple. 
17 | DiskIdentifier = Any 18 | 19 | 20 | class Snapshot(NamedTuple): 21 | """ 22 | Identifies an existing snapshot. 23 | """ 24 | name: str 25 | created_at: pendulum.Pendulum 26 | # A disk id that is known to Kubernetes. 27 | disk: DiskIdentifier 28 | 29 | 30 | # Snapshot creation is a multi-step process. This is an arbitrary value that 31 | # a Cloud provider backend can return to refer to the snapshot within the 32 | # cloud as it's being created. This is distinct from :class:`Snapshot`, which 33 | # represents a completed snapshot. 34 | NewSnapshotIdentifier = Any 35 | 36 | 37 | def get_disk_identifier(volume: pykube.objects.PersistentVolume) -> DiskIdentifier: 38 | """Return a DiskIdentifier from a PersistentVolume.""" 39 | raise NotImplementedError() 40 | 41 | 42 | def supports_volume(volume: pykube.objects.PersistentVolume): 43 | """Return either the given persistent volume is supported by 44 | the backend.""" 45 | raise NotImplementedError() 46 | 47 | 48 | def validate_disk_identifier(disk_id: Dict) -> DiskIdentifier: 49 | """Should take the user-specified dictionary, and convert it to 50 | it's own, local `DiskIdentifier`. If the disk_id is not valid, 51 | it should raise a `ValueError` with a suitable error message. 52 | """ 53 | raise NotImplementedError() 54 | 55 | 56 | def load_snapshots(ctx: Context, label_filters: Dict[str, str]) -> List[Snapshot]: 57 | """ 58 | Return the existing snapshots. Important!! This function must filter 59 | the list of returned snapshots by ``label_filters``. This is because 60 | usually cloud providers make filtering part of their API. 61 | """ 62 | raise NotImplementedError() 63 | 64 | 65 | def create_snapshot( 66 | ctx: Context, 67 | disk: DiskIdentifier, 68 | snapshot_name: str, 69 | snapshot_description: str 70 | ) -> NewSnapshotIdentifier: 71 | """ 72 | Create a snapshot for the given disk. 73 | 74 | This operation is expected to be asynchronous, so the value you return 75 | will identify the snapshot for the next call. 76 | """ 77 | raise NotImplementedError() 78 | 79 | 80 | def get_snapshot_status( 81 | ctx: Context, 82 | snapshot_identifier: NewSnapshotIdentifier 83 | ) -> SnapshotStatus: 84 | """ 85 | Should return the current status of the snapshot. 86 | """ 87 | raise NotImplementedError() 88 | 89 | 90 | def set_snapshot_labels( 91 | ctx: Context, 92 | snapshot_identifier: NewSnapshotIdentifier, 93 | labels: Dict 94 | ): 95 | """ 96 | Set labels on the snapshot. 97 | """ 98 | raise NotImplementedError() 99 | 100 | 101 | def delete_snapshot( 102 | ctx: Context, 103 | snapshot: Snapshot 104 | ): 105 | """ 106 | Delete the snapshot given. 
107 | """ 108 | raise NotImplementedError() 109 | -------------------------------------------------------------------------------- /k8s_snapshots/backends/digitalocean.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, NamedTuple 2 | import digitalocean 3 | from digitalocean.baseapi import NotFoundError 4 | from .abstract import ( 5 | DiskIdentifier, Snapshot, NewSnapshotIdentifier, SnapshotStatus) 6 | import pendulum 7 | import pykube.objects 8 | from ..context import Context 9 | import structlog 10 | 11 | 12 | _logger = structlog.get_logger(__name__) 13 | 14 | 15 | class DODiskIdentifier(NamedTuple): 16 | volume_id: str 17 | 18 | 19 | class InvalidVolumeNameError(ValueError): 20 | def __init__(self, volume_name): 21 | super().__init__("DigitalOcean has no volume named %s.", volume_name) 22 | 23 | 24 | def get_disk_identifier( 25 | volume: pykube.objects.PersistentVolume 26 | ) -> DODiskIdentifier: 27 | volume_id = volume.obj['spec']['csi']['volumeHandle'] 28 | 29 | return DODiskIdentifier(volume_id=volume_id) 30 | 31 | 32 | def supports_volume(volume: pykube.objects.PersistentVolume): 33 | csi = volume.obj['spec'].get('csi') 34 | return csi is not None and csi.get('driver') == 'dobs.csi.digitalocean.com' 35 | 36 | 37 | def validate_disk_identifier(disk_id: Dict) -> DiskIdentifier: 38 | try: 39 | do_volumes = digitalocean.Manager().get_all_volumes() 40 | volume_name = disk_id['volumeName'] 41 | do_volume = next((volume for volume in do_volumes 42 | if volume.name == volume_name), 43 | None) 44 | 45 | if do_volume is None: 46 | raise InvalidVolumeNameError(volume_name) 47 | 48 | return DODiskIdentifier(volume_id=do_volume.id) 49 | except InvalidVolumeNameError as err: 50 | raise err 51 | except: 52 | raise ValueError(disk_id) 53 | 54 | 55 | def load_snapshots( 56 | ctx: Context, label_filters: Dict[str, str] 57 | ) -> List[Snapshot]: 58 | snapshots = digitalocean.Manager().get_volume_snapshots() 59 | 60 | tag_filters = set(k+':'+v for k, v in label_filters.items()) 61 | filtered = [snapshot 62 | for snapshot in snapshots 63 | if tag_filters.intersection(snapshot.tags)] 64 | 65 | _logger.debug('digitalocean.load_snaphots', label_filters=label_filters, 66 | tag_filters=tag_filters, snapshots_count=len(snapshots), 67 | filtered=filtered) 68 | 69 | return list(map(lambda snapshot: Snapshot( 70 | name=snapshot.id, 71 | created_at=pendulum.parse(snapshot.created_at), 72 | disk=DODiskIdentifier(volume_id=snapshot.resource_id), 73 | ), filtered)) 74 | 75 | 76 | def create_snapshot( 77 | ctx: Context, 78 | disk: DODiskIdentifier, 79 | snapshot_name: str, 80 | snapshot_description: str 81 | ) -> NewSnapshotIdentifier: 82 | volume = digitalocean.Volume(id=disk.volume_id) 83 | 84 | snapshot = volume.snapshot(snapshot_name) 85 | 86 | return snapshot['snapshot']['id'] 87 | 88 | 89 | def get_snapshot_status( 90 | ctx: Context, 91 | snapshot_identifier: NewSnapshotIdentifier 92 | ) -> SnapshotStatus: 93 | # DO provides no way to know if a snapshost has finished 94 | return SnapshotStatus.COMPLETE 95 | 96 | 97 | def set_snapshot_labels( 98 | ctx: Context, 99 | snapshot_identifier: NewSnapshotIdentifier, 100 | labels: Dict 101 | ): 102 | for label, value in labels.items(): 103 | tag_name = label + ":" + value 104 | tag = digitalocean.Tag(name=tag_name) 105 | 106 | # Create the tag if it does not exist yet. 
107 | _create_missing_tag(tag) 108 | 109 | tag.add_snapshots(snapshot_identifier) 110 | 111 | 112 | def _create_missing_tag(tag: digitalocean.Tag): 113 | # If the tag does not exist, load() raise NotFoundError so we create it. 114 | try: 115 | tag.load() 116 | return 117 | except NotFoundError: 118 | tag.create() 119 | 120 | 121 | def delete_snapshot( 122 | ctx: Context, 123 | snapshot: Snapshot 124 | ): 125 | do_snapshot = digitalocean.Manager().get_snapshot(snapshot.name) 126 | do_snapshot.destroy() 127 | -------------------------------------------------------------------------------- /k8s_snapshots/asyncutils.py: -------------------------------------------------------------------------------- 1 | # Consider: https://github.com/vxgmichel/aiostream 2 | 3 | import asyncio 4 | 5 | import structlog 6 | from aiochannel import Channel 7 | 8 | _logger = structlog.get_logger() 9 | 10 | 11 | async def run_in_executor(func): 12 | return await asyncio.get_event_loop().run_in_executor(None, func) 13 | 14 | 15 | async def combine(**generators): 16 | """Given a bunch of async generators, merges the events from 17 | all of them. Each should have a name, i.e. `foo=gen, bar=gen`. 18 | """ 19 | combined = Channel() 20 | 21 | async def listen_and_forward(name, generator): 22 | async for value in generator: 23 | await combined.put({name: value}) 24 | 25 | tasks = [] 26 | for name, generator in generators.items(): 27 | task = asyncio.ensure_future(listen_and_forward(name, generator)) 28 | 29 | # When task one or fails, close channel so that later our 30 | # iterator stops reading. 31 | def cb(task): 32 | if task.exception(): 33 | combined.close() 34 | task.add_done_callback(cb) 35 | tasks.append(task) 36 | 37 | # This one will stop when either all generators are exhaused, 38 | # or any one of the fails. 39 | async for item in combined: 40 | yield item 41 | 42 | # TODO: gather() can hang, and the task cancellation doesn't 43 | # really work. Happens if one of the generators has an error. 44 | # It seem that is because once we attach a done callback to 45 | # the task, gather() doesn't handle the exception anymore?? 46 | # Any tasks that are still running at this point, cancel them. 47 | for task in tasks: 48 | task.cancel() 49 | # Will consume any task exceptions 50 | await asyncio.gather(*tasks) 51 | 52 | 53 | async def combine_latest(defaults=None, **generators): 54 | """Like "combine", but always includes the latest value from 55 | every generator. 56 | """ 57 | current = defaults.copy() if defaults else {} 58 | async for value in combine(**generators): 59 | current.update(value) 60 | yield current 61 | 62 | 63 | async def debounce(stream, delay): 64 | debounced = Channel() 65 | loop = asyncio.get_event_loop() 66 | 67 | async def iterator(): 68 | scheduled_call = None 69 | async for item in stream: 70 | if scheduled_call: 71 | scheduled_call.cancel() 72 | scheduled_call = loop.call_later( 73 | delay, 74 | lambda: asyncio.ensure_future(debounced.put(item)) 75 | ) 76 | 77 | # Read the incoming iterator in a task. If the task fails, close the 78 | # channel so the iterator below will stop reading. 79 | task = asyncio.ensure_future(iterator()) 80 | def cb(task): 81 | if task.exception(): 82 | debounced.close() 83 | task.add_done_callback(cb) 84 | 85 | async for item in debounced: 86 | yield item 87 | 88 | task.cancel() 89 | await asyncio.gather(task) 90 | 91 | 92 | class StreamReader: 93 | """Allows iterating over the same iterable multiple times, at the same 94 | time. 
That is, while the source iterable is only running multiple times, 95 | you can consume it with more than one iterator. 96 | 97 | We begin reading from the source when the first iterator starts, and we 98 | stop once the later iterator leaves. 99 | """ 100 | 101 | def __init__(self, iterable): 102 | self._task = None 103 | self.iterable = iterable 104 | self.channels = [] 105 | 106 | async def _iterate_task(self): 107 | async for item in self.iterable: 108 | for channel in self.channels: 109 | await channel.put(item) 110 | 111 | def _ensure_running(self): 112 | if self._task: 113 | return 114 | 115 | self._task = asyncio.ensure_future(self._iterate_task()) 116 | def cb(task): 117 | if task.exception(): 118 | for channel in self.channels: 119 | channel.close() 120 | # Can we fail the channels here, propagate the 121 | # exception to the readers? 122 | raise task.exception() 123 | self._task.add_done_callback(cb) 124 | 125 | def _end(self): 126 | self._task.cancel() 127 | 128 | def iter(self): 129 | # Return a new channel that will receive all the events 130 | channel = Channel() 131 | self.channels.append(channel) 132 | self._ensure_running() 133 | return channel 134 | -------------------------------------------------------------------------------- /k8s_snapshots/kube.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import threading 3 | from typing import (Optional, Iterable, AsyncGenerator, TypeVar, Type, 4 | NamedTuple, Callable) 5 | 6 | import pykube 7 | import structlog 8 | from aiochannel import Channel 9 | 10 | from k8s_snapshots.context import Context 11 | 12 | _logger = structlog.get_logger(__name__) 13 | 14 | Resource = TypeVar( 15 | 'Resource', 16 | bound=pykube.objects.APIObject, 17 | ) 18 | 19 | ClientFactory = Callable[[], pykube.HTTPClient] 20 | 21 | # Copy of a locally-defined namedtuple in 22 | # pykube.query.WatchQuery.object_stream() 23 | _WatchEvent = NamedTuple('_WatchEvent', [ 24 | ('type', str), 25 | ('object', Resource), 26 | ]) 27 | 28 | 29 | class SnapshotRule(pykube.objects.APIObject): 30 | version = "k8s-snapshots.elsdoerfer.com/v1" 31 | endpoint = "snapshotrules" 32 | kind = "SnapshotRule" 33 | 34 | 35 | class Kubernetes: 36 | """ 37 | Allows for easier mocking of Kubernetes resources. 38 | """ 39 | 40 | def __init__(self, client_factory: Optional[ClientFactory] = None): 41 | """ 42 | 43 | Parameters 44 | ---------- 45 | client_factory 46 | Used in threaded operations to create a local 47 | :any:`pykube.HTTPClient` instance. 
48 | """ 49 | # Used for threaded operations 50 | self.client_factory = client_factory 51 | 52 | def get_or_none(self, 53 | resource_type: Type[Resource], 54 | name: str, 55 | namespace: Optional[str] = None) -> Optional[Resource]: 56 | """ 57 | Sync wrapper for :any:`pykube.query.Query().get_or_none` 58 | """ 59 | resource_query = resource_type.objects(self.client_factory()) 60 | if namespace is not None: 61 | resource_query = resource_query.filter(namespace=namespace) 62 | 63 | return resource_query.get_or_none(name=name) 64 | 65 | def watch( 66 | self, 67 | resource_type: Type[Resource], 68 | ) -> Iterable[_WatchEvent]: 69 | """ 70 | Sync wrapper for :any:`pykube.query.Query().watch().object_stream()` 71 | """ 72 | return resource_type.objects(self.client_factory())\ 73 | .filter(namespace=pykube.all).watch().object_stream() 74 | 75 | 76 | def get_resource_or_none_sync( 77 | client_factory: ClientFactory, 78 | resource_type: Type[Resource], 79 | name: str, 80 | namespace: Optional[str] = None) -> Optional[Resource]: 81 | return Kubernetes(client_factory).get_or_none( 82 | resource_type, 83 | name, 84 | namespace, 85 | ) 86 | 87 | 88 | async def get_resource_or_none(client_factory: ClientFactory, 89 | resource_type: Type[Resource], 90 | name: str, 91 | namespace: Optional[str] = None, 92 | *, 93 | loop=None) -> Optional[Resource]: 94 | loop = loop or asyncio.get_event_loop() 95 | 96 | def _get(): 97 | return get_resource_or_none_sync( 98 | client_factory=client_factory, 99 | resource_type=resource_type, 100 | name=name, 101 | namespace=namespace, 102 | ) 103 | 104 | return await loop.run_in_executor( 105 | None, 106 | _get, 107 | ) 108 | 109 | 110 | def watch_resources_sync( 111 | client_factory: ClientFactory, 112 | resource_type: pykube.objects.APIObject, 113 | ) -> Iterable: 114 | return Kubernetes(client_factory).watch(resource_type=resource_type) 115 | 116 | 117 | async def watch_resources(ctx: Context, 118 | resource_type: Resource, 119 | *, 120 | delay: int, 121 | allow_missing: bool = False, 122 | loop=None) -> AsyncGenerator[_WatchEvent, None]: 123 | """ Asynchronously watch Kubernetes resources """ 124 | async_gen = _watch_resources_thread_wrapper( 125 | ctx.kube_client, resource_type, allow_missing=allow_missing, loop=loop) 126 | 127 | # Workaround a race condition in pykube: 128 | # https: // github.com / kelproject / pykube / issues / 138 129 | await asyncio.sleep(delay) 130 | 131 | async for item in async_gen: 132 | yield item 133 | 134 | 135 | async def _watch_resources_thread_wrapper( 136 | client_factory: Callable[[], pykube.HTTPClient], 137 | resource_type: Type[Resource], 138 | allow_missing: bool = False, 139 | *, 140 | loop=None) -> AsyncGenerator[_WatchEvent, None]: 141 | """ Async wrapper for pykube.watch().object_stream() """ 142 | loop = loop or asyncio.get_event_loop() 143 | _log = _logger.bind(resource_type_name=resource_type.__name__, ) 144 | channel = Channel() 145 | 146 | def worker(): 147 | try: 148 | _log.debug('watch-resources.worker.start') 149 | while True: 150 | sync_iterator = watch_resources_sync( 151 | client_factory=client_factory, resource_type=resource_type) 152 | _log.debug('watch-resources.worker.watch-opened') 153 | for event in sync_iterator: 154 | # only put_nowait seems to cause SIGSEGV 155 | loop.call_soon_threadsafe(channel.put_nowait, event) 156 | _log.debug('watch-resources.worker.watch-closed') 157 | except pykube.exceptions.HTTPError as e: 158 | # TODO: It's possible that the user creates the resource 159 | # while we are already 
running. We should pick this up 160 |             # automatically, i.e. watch ThirdPartyResource, or just 161 |             # check every couple of seconds. 162 |             if e.code == 404 and allow_missing: 163 |                 _log.info('watch-resources.worker.skipped') 164 |             else: 165 |                 _log.exception('watch-resources.worker.error') 166 |         except Exception: 167 |             _log.exception('watch-resources.worker.error') 168 |         finally: 169 |             _log.debug('watch-resources.worker.finalized') 170 |             channel.close() 171 | 172 |     thread = threading.Thread( 173 |         target=worker, 174 |         daemon=True, 175 |     ) 176 |     thread.start() 177 | 178 |     async for channel_event in channel: 179 |         yield channel_event 180 | 181 |     _log.debug('watch-resources.done') 182 | -------------------------------------------------------------------------------- /k8s_snapshots/rule.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from typing import Dict, Any, List, Union, Iterable 3 | 4 | import attr 5 | import isodate 6 | import pykube 7 | import structlog 8 | 9 | from k8s_snapshots import kube 10 | from k8s_snapshots.context import Context 11 | from k8s_snapshots.errors import ( 12 |     UnsupportedVolume, 13 |     AnnotationNotFound, 14 |     AnnotationError, 15 |     DeltasParseError 16 | ) 17 | from k8s_snapshots.kube import SnapshotRule 18 | from k8s_snapshots.logging import Loggable 19 | from k8s_snapshots.backends import find_backend_for_volume, get_backend 20 | from k8s_snapshots.backends.abstract import DiskIdentifier 21 | 22 | _logger = structlog.get_logger(__name__) 23 | 24 | 25 | @attr.s(slots=True) 26 | class Rule(Loggable): 27 |     """ 28 |     A rule describes how and when to make backups. 29 |     """ 30 |     name = attr.ib() 31 |     deltas = attr.ib() 32 |     backend = attr.ib() 33 |     disk = attr.ib() 34 | 35 |     #: For Kubernetes resources: The selfLink of the source 36 |     source = attr.ib(default=None) 37 | 38 |     def to_dict(self) -> Dict[str, Any]: 39 |         return attr.asdict(self) 40 | 41 | 42 | def get_backend_for_rule(ctx: Context, rule: Rule): 43 |     return get_backend(rule.backend) 44 | 45 | 46 | def rule_name_from_k8s_source( 47 |         source: Union[ 48 |             pykube.objects.PersistentVolumeClaim, 49 |             pykube.objects.PersistentVolume, 50 |             SnapshotRule 51 |         ], 52 |         name: str = '' 53 | ) -> str: 54 |     """Generates a name for a rule based on a Kubernetes resource; 55 |     the name is derived from the resource's kind, namespace and name. 56 |     """ 57 | 58 |     short_kind = { 59 |         'PersistentVolume': 'pv', 60 |         'PersistentVolumeClaim': 'pvc', 61 |         'SnapshotRule': 'rule' 62 |     }.pop(source.kind) 63 | 64 |     source_namespace = source.namespace 65 | 66 |     # PV's have a namespace set to an empty string '' 67 |     if source_namespace == 'default' or not source_namespace: 68 |         namespace = '' 69 |     else: 70 |         namespace = f'{source.namespace}-' 71 | 72 |     if not name: 73 |         name = source.name 74 |     rule_name = f'{namespace}{short_kind}-{name}' 75 | 76 |     _logger.debug( 77 |         'rule-name-from-k8s', 78 |         key_hints=[ 79 |             'source_namespace', 80 |             'source.kind', 81 |             'source.metadata.namespace', 82 |             'source.metadata.name', 83 |             'rule_name', 84 |         ], 85 |         source_namespace=source_namespace, 86 |         source=source.obj, 87 |         rule_name=rule_name, 88 |     ) 89 |     return rule_name 90 | 91 | 92 | def parse_deltas( 93 |         delta_string: str 94 | ) -> List[Union[timedelta, isodate.Duration]]: 95 |     """Parse the given string into a list of ``timedelta`` instances.
96 |     """ 97 |     if delta_string is None: 98 |         raise DeltasParseError( 99 |             'Delta string is None', 100 |         ) 101 | 102 |     deltas = [] 103 |     for item in delta_string.split(' '): 104 |         item = item.strip() 105 |         if not item: 106 |             continue 107 |         try: 108 |             deltas.append(isodate.parse_duration(item)) 109 |         except ValueError as exc: 110 |             raise DeltasParseError( 111 |                 f'Could not parse duration: {item!r}', 112 |                 error=exc, 113 |                 item=item, 114 |                 deltas=deltas, 115 |                 delta_string=delta_string, 116 |             ) from exc 117 | 118 |     if deltas and len(deltas) < 2: 119 |         raise DeltasParseError( 120 |             'At least two deltas are required', 121 |             deltas=deltas, 122 |             delta_string=delta_string, 123 |         ) 124 | 125 |     return deltas 126 | 127 | 128 | def serialize_deltas(deltas: Iterable[timedelta]) -> str: 129 |     delta_strs = [ 130 |         isodate.duration_isoformat(delta) 131 |         for delta in deltas 132 |     ] 133 |     return ' '.join(delta_strs) 134 | 135 | 136 | async def rule_from_pv( 137 |         ctx: Context, 138 |         volume: pykube.objects.PersistentVolume, 139 |         deltas: List[timedelta], 140 |         *, 141 |         source: Union[ 142 |             pykube.objects.PersistentVolumeClaim, 143 |             pykube.objects.PersistentVolume, 144 |             SnapshotRule 145 |         ] 146 | ) -> Rule: 147 |     """Given a persistent volume object, create a backup rule 148 |     object. Can return None if this volume is not configured for 149 |     backups, or is not suitable. 150 | 151 |     The configuration for the rule will either come from the volume, 152 |     or its claim, if one is associated. 153 |     """ 154 |     _log = _logger.new(volume=volume.obj) 155 | 156 |     # Do we have a backend that supports this disk? 157 |     backend_name, backend_module = find_backend_for_volume(volume) 158 |     if not backend_module: 159 |         raise UnsupportedVolume( 160 |             'Unsupported volume', 161 |             volume=volume 162 |         ) 163 | 164 |     # Let the backend parse and validate this volume.
165 | disk = backend_module.get_disk_identifier(volume) 166 | _log.debug('Volume supported by backend', 167 | volume=volume, backend=backend_module, disk=disk) 168 | 169 | # If configured, use the name from the claim 170 | claim_name = "" 171 | if ctx.config.get('use_claim_name'): 172 | claim_ref = volume.obj['spec'].get('claimRef') 173 | if claim_ref: 174 | claim_name = claim_ref.get('name') 175 | 176 | return Rule( 177 | name=rule_name_from_k8s_source(source, claim_name), 178 | backend=backend_name, 179 | source=source, 180 | deltas=deltas, 181 | disk=disk 182 | ) 183 | 184 | 185 | def get_deltas(annotations: Dict, deltas_annotation_key: str) -> List[timedelta]: 186 | """ 187 | Helper annotation-deltas-getter 188 | 189 | Parameters 190 | ---------- 191 | annotations 192 | 193 | Returns 194 | ------- 195 | 196 | """ 197 | try: 198 | deltas_str = annotations[deltas_annotation_key] 199 | except KeyError as exc: 200 | raise AnnotationNotFound( 201 | 'No such annotation key', 202 | key=deltas_annotation_key 203 | ) from exc 204 | 205 | if not deltas_str: 206 | raise AnnotationError('Invalid delta string', deltas_str=deltas_str) 207 | 208 | try: 209 | deltas = parse_deltas(deltas_str) 210 | except DeltasParseError as exc: 211 | raise AnnotationError( 212 | 'Invalid delta string', 213 | deltas_str=deltas_str 214 | ) from exc 215 | 216 | if deltas is None or not deltas: 217 | raise AnnotationError( 218 | 'parse_deltas returned invalid deltas', 219 | deltas_str=deltas_str, 220 | deltas=deltas, 221 | ) 222 | 223 | return deltas 224 | -------------------------------------------------------------------------------- /k8s_snapshots/backends/aws.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, NamedTuple 2 | import pykube.objects 3 | import requests 4 | import pendulum 5 | import boto3 6 | from urllib.parse import urlparse 7 | from ..context import Context 8 | from k8s_snapshots.snapshot import Snapshot 9 | from .abstract import NewSnapshotIdentifier, SnapshotStatus 10 | from ..errors import SnapshotCreateError 11 | 12 | 13 | def validate_config(config): 14 | """Ensure the config of this backend is correct. 15 | 16 | manual volumes are validated by the backend 17 | - for aws, google cloud, need different data, say, region or zone. 18 | """ 19 | pass 20 | 21 | 22 | def supports_volume(volume: pykube.objects.PersistentVolume): 23 | """Returns `True` if the AWS backend can handle the given volume. 24 | 25 | Currently, this responds to volumes using the in-tree "awsElasticBlockStore" driver, 26 | as well as CSI volumes using `ebs.csi.aws.com`. 27 | """ 28 | if 'csi' in volume.obj['spec']: 29 | if volume.obj['spec'].get('csi')['driver'] == 'ebs.csi.aws.com': 30 | return True 31 | return bool(volume.obj['spec'].get('awsElasticBlockStore')) 32 | 33 | 34 | class AWSDiskIdentifier(NamedTuple): 35 | """An AWS volume id (e.g. `vol-07c6ffacaac8cf641`) + region (e.g. `eu-west-1`). 36 | """ 37 | region: str 38 | volume_id: str 39 | 40 | 41 | def get_current_region(ctx): 42 | """Get the current region from the metadata service. 
43 | """ 44 | if not ctx.config['aws_region']: 45 | response = requests.get( 46 | 'http://169.254.169.254/latest/meta-data/placement/availability-zone', 47 | timeout=5) 48 | response.raise_for_status() 49 | ctx.config['aws_region'] = response.text[:-1] 50 | 51 | return ctx.config['aws_region'] 52 | 53 | 54 | def get_disk_identifier(volume: pykube.objects.PersistentVolume) -> AWSDiskIdentifier: 55 | """Parses the AWS volume id, and the region the volume is in, from the given `PersistentVolume`, 56 | and returns them as a `AWSDiskIdentifier` tuple. 57 | 58 | This information is not encoded in a standard way and differs between Kubernetes versions and 59 | storage backends. 60 | """ 61 | 62 | csi = volume.obj['spec'].get('csi') 63 | if csi and csi['driver'] == 'ebs.csi.aws.com': 64 | volume_url = csi['volumeHandle'] 65 | else: 66 | volume_url = volume.obj['spec'].get('awsElasticBlockStore')['volumeID'] 67 | 68 | # A url such as `aws://eu-west-1a/vol-00292b2da3d4ed1e4`. The region is included. 69 | if volume_url.startswith('aws://'): 70 | parts = urlparse(volume_url) 71 | zone = parts.netloc 72 | volume_id = parts.path[1:] 73 | 74 | return AWSDiskIdentifier(region=zone[:-1], volume_id=volume_id) 75 | 76 | # We then assume the volume id is given directly, e.g. `vol-00292b2da3d4ed1e4`. 77 | volume_id = volume_url 78 | 79 | # We still need the region. Sometimes there is a label: 80 | region = volume.obj.get('metadata').get('labels', {}).get('failure-domain.beta.kubernetes.io/region') 81 | if region: 82 | return AWSDiskIdentifier(region=region, volume_id=volume_id) 83 | 84 | # Or we would expect there to be a nodeAffinity selector 85 | nodeSelectorTerms = volume.obj['spec']['nodeAffinity']['required']['nodeSelectorTerms'] 86 | for term in nodeSelectorTerms: 87 | matchExpressions = term.get('matchExpressions') 88 | if matchExpressions: 89 | for expression in matchExpressions: 90 | if expression.get('key') in ("failure-domain.beta.kubernetes.io/region",): 91 | region = expression.get('values')[0] 92 | if expression.get('key') in ('topology.ebs.csi.aws.com/zone',): 93 | region = expression.get('values')[0][:-1] 94 | 95 | return AWSDiskIdentifier(region=region, volume_id=volume_id) 96 | 97 | 98 | def parse_timestamp(date) -> pendulum.Pendulum: 99 | return pendulum.instance(date) 100 | 101 | 102 | def validate_disk_identifier(disk_id: Dict): 103 | try: 104 | return AWSDiskIdentifier( 105 | region=disk_id['region'], 106 | volume_id=disk_id['volumeId'] 107 | ) 108 | except: 109 | raise ValueError(disk_id) 110 | 111 | # AWS can filter by volume-id, which means we wouldn't have to match in Python. 112 | # In any case, it might be easier to let the backend handle the matching. Then 113 | # it relies less on the DiskIdentifier object always matching. 
114 | #filters={'volume-id': volume.id} 115 | def load_snapshots(ctx: Context, label_filters: Dict[str, str]) -> List[Snapshot]: 116 |     connection = get_connection(ctx, region=get_current_region(ctx)) 117 | 118 |     snapshots = connection.describe_snapshots( 119 |         OwnerIds=['self'], 120 |         Filters=[{'Name': f'tag:{k}', 'Values': [v]} for k, v in label_filters.items()] 121 |     ) 122 | 123 |     return list(map(lambda snapshot: Snapshot( 124 |         name=snapshot['SnapshotId'], 125 |         created_at=parse_timestamp(snapshot['StartTime']), 126 |         disk=AWSDiskIdentifier( 127 |             volume_id=snapshot['VolumeId'], 128 |             region=ctx.config['aws_region'] 129 |         ) 130 |     ), snapshots['Snapshots'])) 131 | 132 | 133 | def create_snapshot( 134 |         ctx: Context, 135 |         disk: AWSDiskIdentifier, 136 |         snapshot_name: str, 137 |         snapshot_description: str 138 | ) -> NewSnapshotIdentifier: 139 | 140 |     connection = get_connection(ctx, disk.region) 141 | 142 |     # TODO: Seems like the API doesn't actually allow us to set a snapshot 143 |     # name, although it's possible in the UI. 144 |     snapshot = connection.create_snapshot( 145 |         VolumeId=disk.volume_id, 146 |         Description=snapshot_name 147 |     ) 148 | 149 |     return { 150 |         'id': snapshot['SnapshotId'], 151 |         'region': disk.region 152 |     } 153 | 154 | 155 | def get_snapshot_status( 156 |         ctx: Context, 157 |         snapshot_identifier: NewSnapshotIdentifier 158 | ) -> SnapshotStatus: 159 |     connection = get_connection(ctx, snapshot_identifier['region']) 160 | 161 |     snapshots = connection.describe_snapshots( 162 |         SnapshotIds=[snapshot_identifier['id']] 163 |     ) 164 |     snapshot = snapshots['Snapshots'][0] 165 | 166 |     # Can be pending | completed | error 167 |     if snapshot['State'] == 'pending': 168 |         return SnapshotStatus.PENDING 169 |     elif snapshot['State'] == 'completed': 170 |         return SnapshotStatus.COMPLETE 171 |     elif snapshot['State'] == 'error': 172 |         raise SnapshotCreateError(snapshot['State']) 173 |     else: 174 |         raise NotImplementedError() 175 | 176 | 177 | def set_snapshot_labels( 178 |         ctx: Context, 179 |         snapshot_identifier: NewSnapshotIdentifier, 180 |         labels: Dict 181 | ): 182 |     connection = get_connection(ctx, snapshot_identifier['region']) 183 |     connection.create_tags( 184 |         Resources=[snapshot_identifier['id']], 185 |         Tags=[{'Key': k, 'Value': v} for k, v in labels.items()] 186 |     ) 187 | 188 | 189 | def delete_snapshot( 190 |         ctx: Context, 191 |         snapshot: Snapshot 192 | ): 193 |     connection = get_connection(ctx, snapshot.disk.region) 194 |     connection.delete_snapshot(SnapshotId=snapshot.name) 195 | 196 | 197 | def get_connection(ctx: Context, region): 198 |     connection = boto3.client('ec2', region_name=region) 199 |     return connection 200 | -------------------------------------------------------------------------------- /k8s_snapshots/logconf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | from collections import OrderedDict 4 | from typing import Optional, List, Any, Dict 5 | 6 | import structlog 7 | import sys 8 | 9 | from k8s_snapshots import serialize 10 | 11 | 12 | class ProcessStructuredErrors: 13 |     def __init__(self): 14 |         pass 15 | 16 |     def __call__(self, logger, method_name, event_dict): 17 |         exc_info = event_dict.pop('exc_info', None) 18 | 19 |         if exc_info is None: 20 |             return event_dict 21 | 22 |         exc_type, exc, exc_tb = structlog.processors._figure_out_exc_info( 23 |             exc_info) 24 | 25 |         __structlog__ = getattr(exc, '__structlog__', None) 26 | 27 |         if not callable(__structlog__): 28 |             event_dict['exc_info']
= exc_info 29 | return event_dict 30 | 31 | structured_error = __structlog__() 32 | event_dict['structured_error'] = structured_error 33 | 34 | return event_dict 35 | 36 | 37 | def add_message(logger, method_name, event_dict): 38 | """ 39 | Creates a ``message`` value based on the ``hint`` and ``key_hint`` keys. 40 | 41 | ``key_hint`` : ``Optional[str]`` 42 | a '.'-separated path of dictionary keys. 43 | 44 | ``hint`` : ``Optional[str]`` 45 | will be formatted using ``.format(**event_dict)``. 46 | """ 47 | def from_hint(ed): 48 | hint = event_dict.pop('hint', None) 49 | if hint is None: 50 | return 51 | 52 | try: 53 | return hint.format(**event_dict) 54 | except Exception as exc: 55 | return f'! error formatting message: {exc!r}' 56 | 57 | def path_value(dict_: Dict[str, Any], key_path: str) -> Optional[Any]: 58 | value = dict_ 59 | 60 | for key in key_path.split('.'): 61 | if value is None: 62 | return 63 | 64 | __structlog__ = getattr(value, '__structlog__', None) 65 | if __structlog__ is not None: 66 | value = __structlog__() 67 | 68 | value = value.get(key) 69 | 70 | return value 71 | 72 | def from_key_hint(ed) -> Optional[str]: 73 | key_hint = ed.pop('key_hint', None) 74 | if key_hint is None: 75 | return 76 | 77 | value = path_value(ed, key_hint) 78 | 79 | return format_kv(key_hint, value) 80 | 81 | def from_key_hints(ed) -> List[str]: 82 | key_hints = ed.pop('key_hints', None) 83 | if key_hints is None: 84 | return [] 85 | 86 | return [ 87 | format_kv(key_hint, path_value(ed, key_hint)) 88 | for key_hint in key_hints 89 | ] 90 | 91 | def format_kv(key: str, value: Any) -> str: 92 | return f'{key}={serialize.process(value)}' 93 | 94 | hints = [ 95 | from_hint(event_dict), 96 | from_key_hint(event_dict) 97 | ] 98 | hints += from_key_hints(event_dict) 99 | 100 | if all(hint is None for hint in hints): 101 | if event_dict.get('message') is None: 102 | event_dict['message'] = event_dict.get('event') 103 | return event_dict 104 | 105 | prefix = event_dict['event'] 106 | hint = ', '.join(hint for hint in hints if hint is not None) 107 | 108 | message = event_dict.get('message') 109 | if message is not None: 110 | message = f'{prefix}: {message}, {hint}' 111 | else: 112 | message = f'{prefix}: {hint}' 113 | 114 | event_dict['message'] = message 115 | return event_dict 116 | 117 | 118 | def configure_from_config(config): 119 | configure_logging( 120 | level_name=config['log_level'], 121 | for_humans=not config['json_log'], 122 | json_indent=config['structlog_json_indent'] or None, 123 | ) 124 | 125 | 126 | def configure_logging( 127 | level_name: str='INFO', 128 | for_humans: bool=False, 129 | json_indent: Optional[int]=None, 130 | ): 131 | configure_structlog( 132 | for_humans=for_humans, 133 | json_indent=json_indent, 134 | level_name=level_name, 135 | ) 136 | 137 | 138 | def configure_structlog( 139 | for_humans: bool=False, 140 | json_indent: Optional[int]=None, 141 | level_name: str='INFO' 142 | ): 143 | key_order = ['message', 'event', 'level'] 144 | timestamper = structlog.processors.TimeStamper(fmt='ISO') 145 | 146 | processors = [ 147 | event_enum_to_str, 148 | ProcessStructuredErrors(), 149 | structlog.stdlib.add_logger_name, 150 | structlog.stdlib.add_log_level, 151 | rename_level_to_severity, 152 | timestamper, 153 | structlog.processors.StackInfoRenderer(), 154 | structlog.processors.format_exc_info, 155 | add_func_name, 156 | add_message, 157 | order_keys(key_order), 158 | structlog.stdlib.ProcessorFormatter.wrap_for_formatter, 159 | ] 160 | 161 | if for_humans: 162 | renderer 
= structlog.dev.ConsoleRenderer() # <=== 163 | else: 164 | # Make it so that 0 ⇒ None 165 | indent = json_indent or None 166 | renderer = structlog.processors.JSONRenderer( 167 | indent=indent, 168 | serializer=serialize.dumps 169 | ) 170 | 171 | foreign_pre_chain = [ 172 | # Add the log level and a timestamp to the event_dict if the log entry 173 | # is not from structlog. 174 | structlog.processors.StackInfoRenderer(), 175 | structlog.processors.format_exc_info, 176 | structlog.stdlib.add_log_level, 177 | structlog.stdlib.add_logger_name, 178 | foreign_event_to_message, 179 | rename_level_to_severity, 180 | timestamper, 181 | ] 182 | 183 | if level_name == 'DEBUG': 184 | root_logger_level = 'DEBUG' 185 | else: 186 | root_logger_level = 'ERROR' 187 | 188 | logging_config = { 189 | 'version': 1, 190 | 'disable_existing_loggers': False, 191 | 'formatters': { 192 | 'structlog': { 193 | '()': structlog.stdlib.ProcessorFormatter, 194 | 'processor': renderer, 195 | 'foreign_pre_chain': foreign_pre_chain, 196 | }, 197 | }, 198 | 'handlers': { 199 | 'default': { 200 | 'level': level_name, 201 | 'class': 'logging.StreamHandler', 202 | 'stream': sys.stdout, 203 | 'formatter': 'structlog', 204 | }, 205 | }, 206 | 'loggers': { 207 | '': { 208 | 'handlers': ['default'], 209 | 'level': root_logger_level, 210 | 'propagate': True, 211 | }, 212 | 'k8s_snapshots': { 213 | 'level': 'DEBUG', 214 | } 215 | } 216 | } 217 | 218 | logging.config.dictConfig(logging_config) 219 | 220 | structlog.configure( 221 | processors=processors, 222 | context_class=OrderedDict, 223 | logger_factory=structlog.stdlib.LoggerFactory(), 224 | wrapper_class=structlog.stdlib.BoundLogger, 225 | cache_logger_on_first_use=True, 226 | ) 227 | 228 | 229 | def foreign_event_to_message(logger, method_name, event_dict): 230 | event = event_dict.get('event') 231 | 232 | if event is not None and 'message' not in event_dict: 233 | event_dict['message'] = event 234 | event_dict['event'] = 'foreign' 235 | 236 | return event_dict 237 | 238 | 239 | def rename_level_to_severity(logger, method_name, event_dict): 240 | level = event_dict.pop('level', None) 241 | 242 | event_dict['severity'] = level.upper() 243 | 244 | return event_dict 245 | 246 | 247 | def add_func_name(logger, method_rame, event_dict): 248 | record = event_dict.get('_record') 249 | if record is None: 250 | return event_dict 251 | 252 | event_dict['function'] = record.funcName 253 | 254 | return event_dict 255 | 256 | 257 | def order_keys(order): 258 | """ 259 | Order keys for JSON readability when not using json_log=True 260 | """ 261 | def processor(logger, method_name, event_dict): 262 | if not isinstance(event_dict, OrderedDict): 263 | return event_dict 264 | 265 | for key in reversed(order): 266 | if key in event_dict: 267 | event_dict.move_to_end(key, last=False) 268 | 269 | return event_dict 270 | return processor 271 | 272 | 273 | def event_enum_to_str(logger, method_name, event_dict): 274 | from k8s_snapshots import events 275 | event = event_dict.get('event') 276 | if event is None: 277 | return event_dict 278 | 279 | if isinstance(event, events.EventEnum): 280 | event_dict['event'] = event.value 281 | 282 | return event_dict 283 | -------------------------------------------------------------------------------- /tests/fixtures/kube.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | from typing import Dict, Iterable, Optional, Tuple, List, Any, Type, Hashable, \ 3 | NamedTuple, Generator, Callable 4 | from 
unittest import mock 5 | from unittest.mock import MagicMock, Mock 6 | 7 | import structlog 8 | import pykube 9 | import pytest 10 | from _pytest.fixtures import FixtureRequest 11 | 12 | from k8s_snapshots import kube, errors 13 | from k8s_snapshots.context import Context 14 | 15 | _logger = structlog.get_logger(__name__) 16 | 17 | KUBE_SAFETY_CHECK_CONFIG_KEY = 'test-fixture-safety-check' 18 | 19 | KUBE_CONFIG = pykube.KubeConfig({ 20 | 'apiVersion': 'v1', 21 | 'kind': 'Config', 22 | 'clusters': [ 23 | { 24 | 'name': 'test-fixture-cluster', 25 | 'certificate-authority-data': 'From fixture fx_kube_config', 26 | 'server': 'http://test-fixture-server', 27 | }, 28 | ], 29 | 'contexts': [ 30 | { 31 | 'name': 'test-fixture-context', 32 | 'context': { 33 | 'cluster': 'test-fixture-cluster', 34 | 'user': 'test-fixture-user', 35 | }, 36 | }, 37 | ], 38 | 'current-context': 'test-fixture-context', 39 | KUBE_SAFETY_CHECK_CONFIG_KEY: 'I am present', 40 | }) 41 | 42 | LABEL_ZONE_VALUE = 'test-zone' 43 | 44 | LABEL_ZONE_KEY = 'failure-domain.beta.kubernetes.io/zone' 45 | LABEL_ZONE = {LABEL_ZONE_KEY: LABEL_ZONE_VALUE} 46 | 47 | DELTAS_ANNOTATION = 'PT1M PT2M' 48 | 49 | DEFAULT = object() 50 | 51 | 52 | @pytest.fixture(scope='session', autouse=True) 53 | def fx_mock_context_kube_config(): 54 | with mock.patch( 55 | 'k8s_snapshots.context.Context.load_kube_config', 56 | return_value=KUBE_CONFIG) as _mock: 57 | assert Context().load_kube_config() == KUBE_CONFIG 58 | yield _mock 59 | 60 | 61 | @pytest.fixture(scope='session', autouse=True) 62 | def fx_mock_context_kube_client(): 63 | def _fake_client(self: Context): 64 | return MagicMock( 65 | spec=pykube.HTTPClient, 66 | config=self.load_kube_config() 67 | ) 68 | with mock.patch( 69 | 'k8s_snapshots.context.Context.kube_client', 70 | _fake_client, 71 | ) as _mock: 72 | yield _mock 73 | 74 | 75 | @pytest.fixture 76 | def fx_kube_config(request: FixtureRequest) -> pykube.KubeConfig: 77 | """ 78 | Minimal fake pykube.HTTPClient config fixture. 
79 | """ 80 | return KUBE_CONFIG 81 | 82 | 83 | class MockKubernetes(kube.Kubernetes): 84 | def __init__(self, *args, **kwargs): 85 | super(MockKubernetes, self).__init__(*args, **kwargs) 86 | 87 | def get_or_none( 88 | self, 89 | resource_type: Type[kube.Resource], 90 | name: str, 91 | namespace: Optional[str]=None, 92 | ) -> Optional[kube.Resource]: 93 | return self.resource_map.get( 94 | self.make_key( 95 | resource_type, 96 | name, 97 | namespace 98 | ) 99 | ) 100 | 101 | def watch( 102 | self, 103 | resource_type: Type[kube.Resource], 104 | ): 105 | raise NotImplementedError 106 | 107 | # Mock-specific methods 108 | 109 | ResourceKey = NamedTuple( 110 | 'ResourceKey', 111 | [ 112 | ('namespace', str), 113 | ('resource_type', Type[kube.Resource]), 114 | ('name', str) 115 | ] 116 | ) 117 | 118 | resource_map: Dict[ResourceKey, kube.Resource] = {} 119 | 120 | # def filter_resources( 121 | # self, 122 | # namespace: Optional[str]=None, 123 | # resource_type: Optional[Type[kube.Resource]]=None, 124 | # name: Optional[str]=None 125 | # ) -> Generator[kube.Resource, None, None]: 126 | # tests: List[Callable[[self.ResourceKey, kube.Resource], bool]] 127 | # tests = [] 128 | # if namespace is not None: 129 | # tests.append(lambda k, v: k.namespace == namespace) 130 | # 131 | # if resource_type is not None: 132 | # tests.append(lambda k, v: k.resource_type == resource_type) 133 | # 134 | # if name is not None: 135 | # tests.append(lambda k, v: k.name == name) 136 | # 137 | # for key, resource in self.resource_map.items(): 138 | # if all(test(key, resource) for test in tests): 139 | # yield resource 140 | 141 | @classmethod 142 | def resource_key(cls, resource: kube.Resource) -> Hashable: 143 | return cls.make_key(type(resource), resource.name, resource.namespace) 144 | 145 | @classmethod 146 | def make_key( 147 | cls, 148 | resource_type: Type[kube.Resource], 149 | name: str, 150 | namespace: Any=DEFAULT, 151 | ) -> ResourceKey: 152 | if namespace is DEFAULT: 153 | namespace = 'default' 154 | return cls.ResourceKey(namespace, resource_type, name) 155 | 156 | @classmethod 157 | def add_resource(cls, resource, overwrite=False): 158 | key = cls.make_key(type(resource), resource.name, resource.namespace) 159 | if not overwrite and key in cls.resource_map: 160 | raise AssertionError( 161 | f'An object with the key {key!r} already exists in the ' 162 | f'resource map') 163 | _logger.debug('MockKubernetes.add_resource', resource=resource) 164 | cls.resource_map[key] = resource 165 | 166 | @classmethod 167 | @contextlib.contextmanager 168 | def patch(cls, resources: Iterable[kube.Resource]): 169 | try: 170 | _logger.debug( 171 | 'MockKubernetes.patch', 172 | message='Patching Kubernetes', 173 | resources=resources 174 | ) 175 | for resource in resources: 176 | cls.add_resource(resource) 177 | 178 | patch_kubernetes = mock.patch( 179 | 'k8s_snapshots.kube.Kubernetes', 180 | cls 181 | ) 182 | with patch_kubernetes: 183 | yield 184 | finally: 185 | _logger.debug( 186 | 'MockKubernetes.patch', 187 | message='Cleaning up after patch' 188 | ) 189 | cls.resource_map.clear() 190 | 191 | 192 | @contextlib.contextmanager 193 | def mock_kube(resources: Iterable[kube.Resource]): 194 | """ 195 | Mock the resources available through the `k8s_snapshots.kube.Kubernetes` 196 | abstraction. 
197 | 198 | Parameters 199 | ---------- 200 | resources 201 | 202 | Returns 203 | ------- 204 | The `k8s_snapshots.kube.Kubernetes` mock 205 | 206 | """ 207 | with MockKubernetes.patch(resources): 208 | yield 209 | 210 | 211 | def make_resource( 212 | resource_type: Type[kube.Resource], 213 | name, 214 | namespace=DEFAULT, 215 | labels=DEFAULT, 216 | annotations=DEFAULT, 217 | spec=DEFAULT, 218 | ) -> kube.Resource: 219 | """ 220 | Create a Kubernetes Resource. 221 | """ 222 | 223 | if namespace is DEFAULT: 224 | namespace = 'default' 225 | 226 | if annotations is DEFAULT: 227 | annotations = {} 228 | 229 | api = MagicMock( 230 | spec=pykube.HTTPClient, 231 | config=Mock() 232 | ) 233 | 234 | if spec is DEFAULT: 235 | spec = {} 236 | 237 | obj = { 238 | 'metadata': { 239 | 'name': name, 240 | 'annotations': annotations, 241 | 'selfLink': f'test/{namespace}/{resource_type.endpoint}/{name}' 242 | }, 243 | 'spec': spec, 244 | } 245 | 246 | if labels is not DEFAULT: 247 | obj['metadata']['labels'] = labels 248 | if namespace is not DEFAULT: 249 | obj['metadata']['namespace'] = namespace 250 | 251 | return resource_type(api, obj) 252 | 253 | 254 | def make_volume_and_claim( 255 | ctx, 256 | volume_name='test-pv', 257 | claim_name='test-pvc', 258 | volume_annotations=DEFAULT, 259 | claim_annotations=DEFAULT, 260 | claim_namespace=DEFAULT, 261 | volume_zone_label=DEFAULT, 262 | ) -> Tuple[ 263 | pykube.objects.PersistentVolume, 264 | pykube.objects.PersistentVolumeClaim 265 | ]: 266 | """ 267 | Creates 268 | 269 | """ 270 | if volume_zone_label is DEFAULT: 271 | volume_zone_label = {LABEL_ZONE_KEY: LABEL_ZONE_VALUE} 272 | 273 | pv = make_resource( 274 | pykube.objects.PersistentVolume, 275 | volume_name, 276 | annotations=volume_annotations, 277 | labels=volume_zone_label, 278 | spec={ 279 | 'claimRef': { 280 | 'name': claim_name, 281 | 'namespace': claim_namespace, 282 | }, 283 | 'gcePersistentDisk': { 284 | 'pdName': 'test-pd' 285 | } 286 | } 287 | ) 288 | 289 | pvc = make_resource( 290 | pykube.objects.PersistentVolumeClaim, 291 | claim_name, 292 | annotations=claim_annotations, 293 | namespace=claim_namespace, 294 | spec={ 295 | 'volumeName': volume_name, 296 | } 297 | ) 298 | 299 | return pv, pvc 300 | 301 | 302 | @pytest.fixture 303 | def fx_volume_zone_label(request): 304 | return {LABEL_ZONE_KEY: LABEL_ZONE_VALUE} 305 | 306 | 307 | @pytest.fixture 308 | def fx_annotation_deltas(request): 309 | deltas = request.getfixturevalue('fx_deltas') 310 | context = request.getfixturevalue('fx_context') 311 | return { 312 | context.config['deltas_annotation_key']: deltas 313 | } 314 | 315 | 316 | def spec_gce_persistent_disk(pd_name): 317 | return { 318 | 'gcePersistentDisk': { 319 | 'pdName': pd_name 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Interval-based Volume Snapshots and Expiry on Kubernetes 2 | ======================================================== 3 | 4 | **What you do:** Create a custom `SnapshotRule` resource which defines your desired snapshot intervals. 5 | **What I do:** Create snapshots of your volumes, and expire old ones using a Grandfather-father-son backup scheme. 6 | 7 | **Supported Environments**: 8 | 9 | - Google Compute Engine disks. 10 | - AWS EBS disks. 11 | - Digital Ocean. 12 | 13 | Want to help adding support for other backends? It's pretty straightforward. 
14 | Have a look at the [API that backends need to implement](https://github.com/miracle2k/k8s-snapshots/blob/master/k8s_snapshots/backends/abstract.py). 15 | 16 | 17 | Quickstart 18 | ---------- 19 | 20 | A persistent volume claim: 21 | 22 | ``` 23 | cat < 178 | spec: 179 | serviceAccountName: k8s-snapshots 180 | containers: 181 | - name: k8s-snapshots 182 | image: elsdoerfer/k8s-snapshots:v2.0 183 | 184 | ``` 185 | 186 | Further Configuration Options 187 | ----------------------------- 188 | 189 | 190 | ### Pinging a third party service 191 | 192 | 193 | 194 | 195 | 200 | 201 |
PING_URL 196 | We'll send a GET request to this url whenever a backup completes. 197 | This is useful for integrating with monitoring services like 198 | Cronitor or Dead Man's Snitch. 199 |
202 | 203 | 204 | ### Make snapshot names more readable 205 | 206 | If your persistent volumes are auto-provisioned by Kubernetes, then 207 | you'll end up with snapshot names such as 208 | ``pv-pvc-01f74065-8fe9-11e6-abdd-42010af00148``. If you want that 209 | prettier, set the enviroment variable ``USE_CLAIM_NAME=true``. Instead 210 | of the auto-generated name of the persistent volume, *k8s-snapshots* 211 | will instead use the name that you give to your 212 | ``PersistentVolumeClaim``. 213 | 214 | 215 | ### SnapshotRule resources 216 | 217 | It's possible to ask *k8s-snapshots* to create snapshots of volumes 218 | for which no `PersistentVolume` object exists within the Kubernetes 219 | cluster. For example, you might have a volume at your Cloud provider 220 | that you use within Kubernetes by referencing it directly. 221 | 222 | To do this, we use a custom Kubernetes resource, `SnapshotRule`. 223 | 224 | First, you need to create this custom resource. 225 | 226 | On Kubernetes 1.7 and higher: 227 | 228 | ``` 229 | cat < 310 | 311 | LOG_LEVEL 312 | **Default: INFO**. Possible values: DEBUG, INFO, WARNING, ERROR 313 | 314 | 315 | JSON_LOG 316 | **Default: False**. Output the log messages as JSON objects for 317 | easier processing. 318 | 319 | 320 | TZ 321 | **Default: UTC**. Used to change the timezone. ie. TZ=America/Montreal 322 | 323 | 324 | 325 | 326 | FAQ 327 | ---- 328 | 329 | **What if I manually create snapshots for the same volumes that 330 | *k8s-snapshots* manages?** 331 | 332 | Starting with v0.3, when *k8s-snapshots* decides when to create the 333 | next snapshot, and which snapshots it deletes, it no longer considers 334 | snapshots that are not correctly labeled by it. 335 | -------------------------------------------------------------------------------- /k8s_snapshots/snapshot.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | from datetime import timedelta 4 | from typing import Dict, Tuple, List, Iterable, Callable, Union, \ 5 | Awaitable, Any, Set 6 | 7 | import aiohttp 8 | import pendulum 9 | import re 10 | import structlog 11 | from tarsnapper.expire import expire 12 | 13 | from k8s_snapshots import events, errors, serialize 14 | from k8s_snapshots.asyncutils import run_in_executor, combine_latest, debounce 15 | from k8s_snapshots.context import Context 16 | from k8s_snapshots.rule import Rule, get_backend_for_rule 17 | from .backends.abstract import Snapshot, NewSnapshotIdentifier, SnapshotStatus 18 | 19 | 20 | _logger = structlog.get_logger(__name__) 21 | 22 | 23 | async def expire_snapshots(ctx, rule: Rule): 24 | """ 25 | Expire existing snapshots for the rule. 
26 | """ 27 | _log = _logger.new( 28 | rule=rule, 29 | ) 30 | 31 | _log.debug(events.Expiration.STARTED) 32 | 33 | backend = get_backend_for_rule(ctx, rule) 34 | 35 | snapshots_objects = filter_snapshots_by_rule( 36 | await load_snapshots(ctx, {backend}), rule) 37 | snapshots_with_date = {s: s.created_at for s in snapshots_objects} 38 | 39 | to_keep = expire(snapshots_with_date, rule.deltas) 40 | expired_snapshots: List[str] = [] 41 | kept_snapshots = [] 42 | 43 | for snapshot, snapshot_time_created in snapshots_with_date.items(): 44 | _log_inner = _log.new( 45 | snapshot_name=snapshot.name, 46 | snapshot_time_created=snapshot_time_created, 47 | key_hints=[ 48 | 'snapshot_name', 49 | 'snapshot_time_created', 50 | ] 51 | ) 52 | 53 | if snapshot in to_keep: 54 | _log_inner.debug(events.Expiration.KEPT) 55 | kept_snapshots.append(snapshot.name) 56 | continue 57 | 58 | if snapshot not in to_keep: 59 | _log_inner.info(events.Expiration.DELETE) 60 | 61 | # TODO: Deleting a snapshot is usually an async process too, 62 | # and to be completely accurate, we should wait for it to complete. 63 | backend = get_backend_for_rule(ctx, rule) 64 | await run_in_executor( 65 | lambda: backend.delete_snapshot(ctx, snapshot) 66 | ) 67 | expired_snapshots.append(snapshot.name) 68 | 69 | _log.info( 70 | events.Expiration.COMPLETE, 71 | snapshots={ 72 | 'expired': expired_snapshots, 73 | 'kept': kept_snapshots, 74 | } 75 | ) 76 | 77 | 78 | async def make_backup(ctx, rule): 79 | """Execute a single backup job. 80 | 81 | 1. Create the snapshot 82 | 2. Wait until the snapshot is finished. 83 | 3. Expire old snapshots 84 | """ 85 | 86 | backend = get_backend_for_rule(ctx, rule) 87 | snapshot_name = new_snapshot_name(ctx, rule) 88 | 89 | _log = _logger.new( 90 | snapshot_name=snapshot_name, 91 | rule=rule 92 | ) 93 | 94 | time_start = pendulum.now() 95 | 96 | try: 97 | snapshot_identifier = await create_snapshot( 98 | ctx, 99 | rule, 100 | snapshot_name, 101 | snapshot_description=serialize.dumps(rule), 102 | ) 103 | 104 | _log.debug( 105 | 'snapshot.operation-started', 106 | key_hints=[ 107 | 'snapshot_name' 108 | ], 109 | snapshot_identifier=snapshot_identifier 110 | ) 111 | 112 | await poll_for_status( 113 | lambda: get_snapshot_status(ctx, backend, snapshot_identifier), 114 | retry_for=(SnapshotStatus.PENDING,) 115 | ) 116 | 117 | # TODO: If there is some kind of coding error, we should crash I think. 
118 | except Exception as exc: 119 | _log.exception( 120 | events.Snapshot.ERROR, 121 | key_hints=['snapshot_name', 'rule.name'] 122 | ) 123 | raise errors.SnapshotCreateError( 124 | 'Error creating snapshot' 125 | ) from exc 126 | 127 | await set_snapshot_labels( 128 | ctx, 129 | backend, 130 | snapshot_identifier, 131 | snapshot_labels(ctx), 132 | ) 133 | time_taken = pendulum.now() - time_start 134 | 135 | _log.info( 136 | events.Snapshot.CREATED, 137 | snapshot_identifier=snapshot_identifier, 138 | time_taken=time_taken, 139 | time_taken_seconds=time_taken.total_seconds(), 140 | key_hints=[ 141 | 'snapshot_name', 142 | 'rule.name', 143 | 'time_taken_seconds' 144 | ], 145 | ) 146 | 147 | ping_url = ctx.config.get('ping_url') 148 | if ping_url: 149 | async with aiohttp.ClientSession() as session: 150 | response = await session.request('GET', ping_url) 151 | _log.info( 152 | events.Ping.SENT, 153 | status=response.status, 154 | url=ping_url, 155 | ) 156 | 157 | await expire_snapshots(ctx, rule) 158 | 159 | 160 | async def create_snapshot( 161 | ctx: Context, 162 | rule: Rule, 163 | snapshot_name: str, 164 | snapshot_description: str 165 | ) -> NewSnapshotIdentifier: 166 | _log = _logger.new( 167 | disk=rule.disk, 168 | rule=rule, 169 | snapshot_name=snapshot_name, 170 | snapshot_description=snapshot_description 171 | ) 172 | 173 | _log.info( 174 | events.Snapshot.START, 175 | key_hints=['snapshot_name', 'rule.name'] 176 | ) 177 | 178 | backend = get_backend_for_rule(ctx, rule) 179 | return await run_in_executor( 180 | lambda: backend.create_snapshot( 181 | ctx, 182 | rule.disk, 183 | snapshot_name, 184 | snapshot_description 185 | ) 186 | ) 187 | 188 | 189 | async def poll_for_status( 190 | refresh_func: Callable[..., Union[Dict, Awaitable[Dict]]], 191 | retry_for: Tuple[SnapshotStatus], 192 | sleep_time: int=1, 193 | ): 194 | """ 195 | Call refresh_func until the return value is not one of the values 196 | in ``retry_for``. 197 | 198 | Parameters 199 | ---------- 200 | refresh_func 201 | Callable that returns either 202 | 203 | - The new version of the resource. 204 | - An awaitable for the new version of the resource. 205 | retry_for 206 | A list of statuses to retry for. 207 | sleep_time 208 | The time, in seconds, to sleep for between calls. 
209 | 210 | Returns 211 | ------- 212 | 213 | """ 214 | _log = _logger.new() 215 | refresh_count = 0 216 | time_start = pendulum.now() 217 | 218 | while True: 219 | await asyncio.sleep(sleep_time) # Sleep first 220 | 221 | result = refresh_func() 222 | if inspect.isawaitable(result): 223 | result = await result 224 | 225 | _log.debug( 226 | 'poll-for-status.refreshed', 227 | key_hints=[ 228 | 'result' 229 | ], 230 | refresh_count=refresh_count, 231 | result=result 232 | ) 233 | 234 | if not result in retry_for: 235 | break 236 | 237 | refresh_count += 1 238 | 239 | time_taken = pendulum.now() - time_start 240 | 241 | _log.debug( 242 | 'poll-for-status.done', 243 | key_hints=[ 244 | 'refresh_count', 245 | 'time_taken', 246 | ], 247 | refresh_count=refresh_count, 248 | time_start=time_start, 249 | time_taken=time_taken 250 | ) 251 | 252 | return result 253 | 254 | 255 | def snapshot_author_label(ctx: Context) -> Tuple[str, str]: 256 | return ( 257 | ctx.config['snapshot_author_label_key'], 258 | ctx.config['snapshot_author_label'] 259 | ) 260 | 261 | 262 | def snapshot_labels(ctx: Context) -> Dict: 263 | return dict([snapshot_author_label(ctx)]) 264 | 265 | 266 | async def set_snapshot_labels( 267 | ctx: Context, 268 | backend: Any, 269 | snapshot_identifier: NewSnapshotIdentifier, 270 | labels: Dict 271 | ): 272 | _log = _logger.new( 273 | snapshot_identifier=snapshot_identifier, 274 | labels=labels, 275 | ) 276 | 277 | _log.debug( 278 | 'snapshot.set-labels', 279 | key_hints=['body.labels'] 280 | ) 281 | return await run_in_executor( 282 | lambda: backend.set_snapshot_labels(ctx, snapshot_identifier, labels) 283 | ) 284 | 285 | 286 | def new_snapshot_name(ctx: Context, rule: Rule) -> str: 287 | """ 288 | Get a new snapshot name for rule. 289 | Returns rule name and pendulum.now('utc') formatted according to settings. 290 | """ 291 | 292 | time_str = re.sub( 293 | r'[^-a-z0-9]', '-', 294 | pendulum.now('utc').format(ctx.config['snapshot_datetime_format']), 295 | flags=re.IGNORECASE) 296 | 297 | # Won't be truncated 298 | suffix = f'-{time_str}' 299 | 300 | # Will be truncated 301 | name_truncated = rule.name[:63 - len(suffix)] 302 | 303 | return f'{name_truncated}{suffix}' 304 | 305 | 306 | async def get_snapshot_status( 307 | ctx: Context, 308 | backend: Any, 309 | snapshot_identifier: NewSnapshotIdentifier 310 | ): 311 | return await run_in_executor( 312 | lambda: backend.get_snapshot_status(ctx, snapshot_identifier) 313 | ) 314 | 315 | 316 | async def get_snapshots(ctx: Context, rulesgen, reload_trigger): 317 | """Query the existing snapshots from the cloud provider backend(s). 318 | 319 | "rules" are all the disk rules we know about, and through it, we know 320 | the set of backends that are in play, and that need to verified. 321 | 322 | If the channel "reload_trigger" contains any value, we 323 | refresh the list of snapshots. This will then cause the 324 | next backup to be scheduled. 325 | """ 326 | 327 | combined = combine_latest( 328 | rules=debounce(rulesgen, 4), 329 | reload=reload_trigger 330 | ) 331 | 332 | async for item in combined: 333 | # Figure out a set of backends that are in use with the rules 334 | backends = set() 335 | for rule in item['rules']: 336 | backends.add(get_backend_for_rule(ctx, rule)) 337 | 338 | # Load and yield the snapshots for the set of backends. 
339 | yield await load_snapshots(ctx, backends) 340 | 341 | 342 | async def load_snapshots(ctx: Context, backends: Set[Any]) -> List[Snapshot]: 343 | snapshot_label_filters = dict([snapshot_author_label(ctx)]) 344 | 345 | tasks = map(lambda backend: run_in_executor( 346 | lambda: backend.load_snapshots(ctx, snapshot_label_filters) 347 | ), backends) 348 | 349 | snapshot_results = await asyncio.gather(*tasks) 350 | return [snapshot for result in snapshot_results for snapshot in result] 351 | 352 | 353 | def determine_next_snapshot(snapshots, rules): 354 | """ 355 | Given a list of snapshots, and a list of rules, determine the next snapshot 356 | to be made. 357 | 358 | Returns a 2-tuple (rule, target_datetime) 359 | """ 360 | next_rule = None 361 | next_timestamp = None 362 | next_snapshot_times = None 363 | 364 | for rule in rules: 365 | _log = _logger.new(rule=rule) 366 | snapshot_times = get_snapshot_times_for_rule(snapshots, rule) 367 | 368 | # There are no snapshots for this rule; create the first one. 369 | if not snapshot_times: 370 | next_rule = rule 371 | next_timestamp = pendulum.now('utc') + timedelta(seconds=10) 372 | next_snapshot_times = snapshot_times 373 | break 374 | 375 | target = snapshot_times[0] + rule.deltas[0] 376 | if not next_timestamp or target < next_timestamp: 377 | next_rule = rule 378 | next_timestamp = target 379 | next_snapshot_times = snapshot_times 380 | 381 | if next_rule is not None and next_timestamp is not None: 382 | _logger.info( 383 | events.Snapshot.SCHEDULED, 384 | key_hints=['rule.name', 'target'], 385 | target=next_timestamp, 386 | rule=next_rule, 387 | times=list(map(lambda t: str(t), next_snapshot_times)) 388 | ) 389 | 390 | return next_rule, next_timestamp 391 | 392 | 393 | def get_snapshot_times_for_rule(snapshots: List[Snapshot], rule: Rule): 394 | # Find all the snapshots that match this rule 395 | # This returns a object 396 | snapshots_for_rule = filter_snapshots_by_rule(snapshots, rule) 397 | # Rewrite the list to snapshot 398 | snapshot_times = [item.created_at for item in snapshots_for_rule] 399 | # Sort by timestamp 400 | snapshot_times = sorted(snapshot_times, reverse=True) 401 | return list(snapshot_times) 402 | 403 | 404 | def filter_snapshots_by_rule(snapshots: List[Snapshot], rule: Rule) -> Iterable[Snapshot]: 405 | def match_disk(snapshot: Snapshot): 406 | return snapshot.disk == rule.disk 407 | return filter(match_disk, snapshots) 408 | 409 | 410 | async def is_snapshot_required(ctx: Context, rule: Rule): 411 | backend = get_backend_for_rule(ctx, rule) 412 | all_snapshots = await load_snapshots(ctx, {backend}) 413 | return snapshots_for_rule_are_outdated(rule, all_snapshots) 414 | 415 | 416 | def snapshots_for_rule_are_outdated(rule: Rule, existing_snapshots: List[Snapshot]): 417 | snapshot_times = get_snapshot_times_for_rule(existing_snapshots, rule) 418 | 419 | if not snapshot_times: 420 | return True 421 | 422 | next_snapshot_time = snapshot_times[0] + rule.deltas[0] 423 | return next_snapshot_time < pendulum.now('utc') 424 | -------------------------------------------------------------------------------- /k8s_snapshots/backends/google.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pendulum 3 | import re 4 | import requests 5 | from typing import List, Dict, NamedTuple 6 | from googleapiclient import discovery 7 | from oauth2client.service_account import ServiceAccountCredentials 8 | from oauth2client.client import GoogleCredentials 9 | import pykube.objects 10 
| import structlog 11 | from k8s_snapshots.context import Context 12 | from .abstract import Snapshot, SnapshotStatus, DiskIdentifier, NewSnapshotIdentifier 13 | from ..errors import SnapshotCreateError, UnsupportedVolume 14 | 15 | 16 | _logger = structlog.get_logger(__name__) 17 | 18 | 19 | #: The regex that a snapshot name has to match. 20 | #: Regex provided by the createSnapshot error response. 21 | GOOGLE_SNAPSHOT_NAME_REGEX = r'^(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)$' 22 | 23 | # Google Label keys and values must conform to the following restrictions: 24 | # - Keys and values cannot be longer than 63 characters each. 25 | # - Keys and values can only contain lowercase letters, numeric characters, 26 | # underscores, and dashes. International characters are allowed. 27 | # - Label keys must start with a lowercase letter and international characters 28 | # are allowed. 29 | # - Label keys cannot be empty. 30 | # See https://cloud.google.com/compute/docs/labeling-resources for more 31 | 32 | #: The regex that a label key and value has to match, additionally it has to be 33 | #: lowercase, this is checked with str().islower() 34 | GOOGLE_LABEL_REGEX = r'^(?:[-\w]{0,63})$' 35 | 36 | 37 | def get_project_id(ctx: Context): 38 | if not ctx.config['gcloud_project']: 39 | response = requests.get( 40 | 'http://metadata.google.internal/computeMetadata/v1/project/project-id', 41 | headers={ 42 | 'Metadata-Flavor': 'Google' 43 | }) 44 | response.raise_for_status() 45 | ctx.config['gcloud_project'] = response.text 46 | 47 | return ctx.config['gcloud_project'] 48 | 49 | 50 | # TODO: This is currently not called. When should we do so? Once the Google 51 | # Cloud backend is loaded for the first time? 52 | def validate_config(config): 53 | """Ensure the config of this backend is correct. 54 | """ 55 | 56 | is_valid = True 57 | 58 | test_datetime = pendulum.now('utc').format( 59 | config['snapshot_datetime_format']) 60 | test_snapshot_name = f'dummy-snapshot-{test_datetime}' 61 | 62 | if not re.match(GOOGLE_SNAPSHOT_NAME_REGEX, test_snapshot_name): 63 | _logger.error( 64 | 'config.error', 65 | key='snapshot_datetime_format', 66 | message='Snapshot datetime format returns invalid string. ' 67 | 'Note that uppercase characters are forbidden.', 68 | test_snapshot_name=test_snapshot_name, 69 | regex=GOOGLE_SNAPSHOT_NAME_REGEX 70 | ) 71 | is_valid = False 72 | 73 | # Configuration keys that are either a Google 74 | glabel_key_keys = {'snapshot_author_label'} 75 | glabel_value_keys = {'snapshot_author_label_key'} 76 | 77 | for key in glabel_key_keys | glabel_value_keys: 78 | value = config[key] # type: str 79 | re_match = re.match(GOOGLE_LABEL_REGEX, value) 80 | is_glabel_key = key in glabel_key_keys 81 | is_glabel_valid = ( 82 | re_match and value.islower() and 83 | value[0].isalpha() or not is_glabel_key 84 | ) 85 | 86 | if not is_glabel_valid: 87 | _logger.error( 88 | 'config.error', 89 | message=f'Configuration value is not a valid ' 90 | f'Google Label {"Key" if is_glabel_key else "Value"}. 
' 91 | f'See ' 92 | f'https://cloud.google.com/compute/docs/labeling-resources ' 93 | f'for more', 94 | key_hints=['value', 'regex'], 95 | key=key, 96 | is_lower=value.islower(), 97 | value=config[key], 98 | regex=GOOGLE_LABEL_REGEX, 99 | ) 100 | is_valid = False 101 | 102 | return is_valid 103 | 104 | 105 | class GoogleDiskIdentifier(NamedTuple): 106 | name: str 107 | regional: bool 108 | zone: str = None 109 | region: str = None 110 | 111 | 112 | def get_disk_identifier(volume: pykube.objects.PersistentVolume) -> GoogleDiskIdentifier: 113 | gce_disk = volume.obj['spec']['gcePersistentDisk']['pdName'] 114 | 115 | # How can we know the zone? In theory, the storage class can 116 | # specify a zone; but if not specified there, K8s can choose a 117 | # random zone within the master region. So we really can't trust 118 | # that value anyway. 119 | # There is a label that gives a failure region, but labels aren't 120 | # really a trustworthy source for this. 121 | # Apparently, this is a thing in the Kubernetes source too, see: 122 | # getDiskByNameUnknownZone in pkg/cloudprovider/providers/gce/gce.go, 123 | # e.g. https://github.com/jsafrane/kubernetes/blob/2e26019629b5974b9a311a9f07b7eac8c1396875/pkg/cloudprovider/providers/gce/gce.go#L2455 124 | gce_disk_zone = volume.labels.get( 125 | 'failure-domain.beta.kubernetes.io/zone' 126 | ) 127 | 128 | if not gce_disk_zone: 129 | raise UnsupportedVolume('cannot find the zone of the disk') 130 | 131 | gce_disk_region = volume.labels.get( 132 | 'failure-domain.beta.kubernetes.io/region' 133 | ) 134 | 135 | if not gce_disk_region: 136 | raise UnsupportedVolume('cannot find the region of the disk') 137 | 138 | if "__" in gce_disk_zone: 139 | # seems like Google likes to put __ in between zones in the label 140 | # failure-domain.beta.kubernetes.io/zone when the pv is regional 141 | return GoogleDiskIdentifier(name=gce_disk, region=gce_disk_region, regional=True) 142 | else: 143 | return GoogleDiskIdentifier(name=gce_disk, zone=gce_disk_zone, regional=False) 144 | 145 | 146 | def supports_volume(volume: pykube.objects.PersistentVolume): 147 | return bool(volume.obj['spec'].get('gcePersistentDisk')) 148 | 149 | 150 | def parse_timestamp(date_str: str) -> pendulum.Pendulum: 151 | return pendulum.parse(date_str).in_timezone('utc') 152 | 153 | 154 | def validate_disk_identifier(disk_id: Dict) -> DiskIdentifier: 155 | """Should take the user-specified dictionary, and convert it to 156 | it's own, local `DiskIdentifier`. If the disk_id is not valid, 157 | it should raise a `ValueError` with a suitable error message. 158 | """ 159 | 160 | try: 161 | return GoogleDiskIdentifier( 162 | zone=disk_id['zone'], 163 | name=disk_id['name'] 164 | ) 165 | except: 166 | raise ValueError(disk_id) 167 | 168 | 169 | def snapshot_list_filter_expr(label_filters: Dict[str, str]) -> str: 170 | key = list(label_filters.keys())[0] 171 | value = label_filters[key] 172 | return f'labels.{key} eq {value}' 173 | 174 | 175 | def load_snapshots(ctx, label_filters: Dict[str, str]) -> List[Snapshot]: 176 | """ 177 | Return the existing snapshots. 178 | """ 179 | snapshots = get_gcloud(ctx).snapshots() 180 | request = snapshots.list( 181 | project=get_project_id(ctx), 182 | filter=snapshot_list_filter_expr(label_filters), 183 | maxResults=500, 184 | ) 185 | 186 | loaded_snapshots = [] 187 | 188 | while request is not None: 189 | resp = request.execute() 190 | for item in resp.get('items', []): 191 | # We got to parse out the disk zone and name from the source disk. 
192 | # It's an url that ends with '/zones/{zone}/disks/{name}'/ 193 | sourceDiskList = item['sourceDisk'].split('/') 194 | 195 | disk = sourceDiskList[-1] 196 | 197 | if "regions" in sourceDiskList: 198 | region = sourceDiskList[8] 199 | loaded_snapshots.append(Snapshot( 200 | name=item['name'], 201 | created_at=parse_timestamp(item['creationTimestamp']), 202 | disk=GoogleDiskIdentifier(name=disk, region=region, regional=True) 203 | )) 204 | else: 205 | zone = sourceDiskList[8] 206 | loaded_snapshots.append(Snapshot( 207 | name=item['name'], 208 | created_at=parse_timestamp(item['creationTimestamp']), 209 | disk=GoogleDiskIdentifier(name=disk, zone=zone, regional=False) 210 | )) 211 | 212 | request = snapshots.list_next(request, resp) 213 | 214 | return loaded_snapshots 215 | 216 | 217 | def create_snapshot( 218 | ctx: Context, 219 | disk: GoogleDiskIdentifier, 220 | snapshot_name: str, 221 | snapshot_description: str 222 | ) -> NewSnapshotIdentifier: 223 | request_body = { 224 | 'name': snapshot_name, 225 | 'description': snapshot_description 226 | } 227 | 228 | gcloud = get_gcloud(ctx) 229 | 230 | # Returns a ZoneOperation: {kind: 'compute#operation', 231 | # operationType: 'createSnapshot', ...}. 232 | # Google's documentation is confusing regarding this, since there's two 233 | # tables of payload parameter descriptions on the page, one of them 234 | # describes the input parameters, but contains output-only parameters, 235 | # the correct table can be found at 236 | # https://cloud.google.com/compute/docs/reference/latest/disks/createSnapshot#response 237 | if disk.regional: 238 | operation = gcloud.regionDisks().createSnapshot( 239 | disk=disk.name, 240 | project=get_project_id(ctx), 241 | region=disk.region, 242 | body=request_body 243 | ).execute() 244 | return { 245 | 'snapshot_name': snapshot_name, 246 | 'region': disk.region, 247 | 'operation_name': operation['name'] 248 | } 249 | 250 | else: 251 | operation = gcloud.disks().createSnapshot( 252 | disk=disk.name, 253 | project=get_project_id(ctx), 254 | zone=disk.zone, 255 | body=request_body 256 | ).execute() 257 | return { 258 | 'snapshot_name': snapshot_name, 259 | 'zone': disk.zone, 260 | 'operation_name': operation['name'] 261 | } 262 | 263 | def get_snapshot_status( 264 | ctx: Context, 265 | snapshot_identifier: NewSnapshotIdentifier 266 | ) -> SnapshotStatus: 267 | """In Google Cloud, the createSnapshot operation returns a ZoneOperation 268 | object which goes from PENDING, to RUNNING, to DONE. 269 | The snapshot object itself can be CREATING, DELETING, FAILED, READY, 270 | or UPLOADING. 271 | 272 | We check both states to make sure the snapshot was created. 
273 | """ 274 | 275 | _log = _logger.new( 276 | snapshot_identifier=snapshot_identifier, 277 | ) 278 | 279 | gcloud = get_gcloud(ctx) 280 | 281 | # First, check the operation state 282 | 283 | if "region" in snapshot_identifier: 284 | operation = gcloud.regionOperations().get( 285 | project=get_project_id(ctx), 286 | region=snapshot_identifier['region'], 287 | operation=snapshot_identifier['operation_name'] 288 | ).execute() 289 | else: 290 | operation = gcloud.zoneOperations().get( 291 | project=get_project_id(ctx), 292 | zone=snapshot_identifier['zone'], 293 | operation=snapshot_identifier['operation_name'] 294 | ).execute() 295 | 296 | if not operation['status'] == 'DONE': 297 | _log.debug('google.status.operation_not_complete', 298 | status=operation['status']) 299 | return SnapshotStatus.PENDING 300 | 301 | # To be sure, check the state of the snapshot itself 302 | snapshot = gcloud.snapshots().get( 303 | snapshot=snapshot_identifier['snapshot_name'], 304 | project=get_project_id(ctx) 305 | ).execute() 306 | 307 | status = snapshot['status'] 308 | if status == 'FAILED': 309 | _log.debug('google.status.failed', 310 | status=status) 311 | raise SnapshotCreateError(status) 312 | elif status != 'READY': 313 | _log.debug('google.status.not_ready', 314 | status=status) 315 | return SnapshotStatus.PENDING 316 | 317 | return SnapshotStatus.COMPLETE 318 | 319 | 320 | def set_snapshot_labels( 321 | ctx: Context, 322 | snapshot_identifier: NewSnapshotIdentifier, 323 | labels: Dict 324 | ): 325 | gcloud = get_gcloud(ctx) 326 | 327 | snapshot = gcloud.snapshots().get( 328 | snapshot=snapshot_identifier['snapshot_name'], 329 | project=get_project_id(ctx) 330 | ).execute() 331 | 332 | body = { 333 | 'labels': labels, 334 | 'labelFingerprint': snapshot['labelFingerprint'], 335 | } 336 | return gcloud.snapshots().setLabels( 337 | resource=snapshot_identifier['snapshot_name'], 338 | project=get_project_id(ctx), 339 | body=body, 340 | ).execute() 341 | 342 | 343 | def delete_snapshot( 344 | ctx: Context, 345 | snapshot: Snapshot 346 | ): 347 | gcloud = get_gcloud(ctx) 348 | return gcloud.snapshots().delete( 349 | snapshot=snapshot.name, 350 | project=get_project_id(ctx) 351 | ).execute() 352 | 353 | 354 | def get_gcloud(ctx, version: str= 'v1'): 355 | """ 356 | Get a configured Google Compute API Client instance. 357 | 358 | Note that the Google API Client is not threadsafe. Cache the instance locally 359 | if you want to avoid OAuth overhead between calls. 
360 | 361 | Parameters 362 | ---------- 363 | version 364 | Compute API version 365 | """ 366 | SCOPES = 'https://www.googleapis.com/auth/compute' 367 | credentials = None 368 | 369 | if ctx.config.get('gcloud_credentials_file'): 370 | credentials = ServiceAccountCredentials.from_json_keyfile_name( 371 | ctx.config.get('gcloud_credentials_file'), 372 | scopes=SCOPES) 373 | 374 | if ctx.config.get('google_application_credentials'): 375 | keyfile = json.loads(ctx.config.get('google_application_credentials')) 376 | credentials = ServiceAccountCredentials.from_json_keyfile_dict( 377 | keyfile, scopes=SCOPES) 378 | 379 | if not credentials: 380 | credentials = GoogleCredentials.get_application_default() 381 | 382 | if not credentials: 383 | raise RuntimeError("Auth for Google Cloud was not configured") 384 | 385 | compute = discovery.build( 386 | 'compute', 387 | version, 388 | credentials=credentials, 389 | # https://github.com/google/google-api-python-client/issues/299#issuecomment-268915510 390 | cache_discovery=False 391 | ) 392 | return compute 393 | -------------------------------------------------------------------------------- /k8s_snapshots/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | TODO: prevent a backup loop: A failsafe mechanism to make sure we 4 | don't create more than x snapshots per disk; in case something 5 | is wrong with the code that loads the existing snapshots from GCloud. 6 | """ 7 | import asyncio 8 | from typing import List, AsyncIterable, Optional, Tuple, Dict 9 | 10 | import pendulum 11 | import pykube 12 | import structlog 13 | from aiochannel import Channel, ChannelEmpty 14 | from aiostream import stream 15 | 16 | from k8s_snapshots import events 17 | from k8s_snapshots.backends import get_backend 18 | from k8s_snapshots.asyncutils import combine_latest, StreamReader 19 | from k8s_snapshots.context import Context 20 | from k8s_snapshots.errors import ( 21 | AnnotationNotFound, 22 | AnnotationError, 23 | UnsupportedVolume, 24 | VolumeNotFound, 25 | ConfigurationError, 26 | DeltasParseError, 27 | RuleDependsOn) 28 | from k8s_snapshots.kube import ( 29 | watch_resources, 30 | get_resource_or_none, 31 | SnapshotRule, 32 | _WatchEvent) 33 | from k8s_snapshots.rule import ( 34 | rule_from_pv, Rule, parse_deltas, rule_name_from_k8s_source, get_deltas) 35 | from k8s_snapshots.snapshot import ( 36 | make_backup, 37 | get_snapshots, 38 | determine_next_snapshot, 39 | is_snapshot_required 40 | ) 41 | 42 | _logger = structlog.get_logger() 43 | 44 | 45 | async def volume_from_pvc( 46 | ctx: Context, 47 | resource: pykube.objects.PersistentVolumeClaim 48 | ) -> pykube.objects.PersistentVolume: 49 | """Given a `PersistentVolumeClaim`, return the `PersistentVolume` 50 | it is bound to.
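The lookup relies on the claim's spec.volumeName field, which Kubernetes fills in once the claim is bound; if the field is missing, or no volume by that name exists, VolumeNotFound is raised.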
51 | """ 52 | _log = _logger.new(resource=resource) 53 | 54 | pvc = resource 55 | 56 | try: 57 | volume_name = resource.obj['spec']['volumeName'] 58 | except KeyError as exc: 59 | raise VolumeNotFound( 60 | 'Could not get volume name from volume claim', 61 | volume_claim=pvc.obj 62 | ) from exc 63 | 64 | _log = _log.bind( 65 | volume_name=volume_name 66 | ) 67 | 68 | _log.debug( 69 | 'Looking for volume', 70 | key_hints=['volume_name'] 71 | ) 72 | 73 | volume = await get_resource_or_none( 74 | ctx.kube_client, 75 | pykube.objects.PersistentVolume, 76 | volume_name, 77 | ) 78 | if volume is None: 79 | raise VolumeNotFound( 80 | f'Could not find volume with name {volume_name!r}', 81 | volume_claim=pvc.obj, 82 | ) 83 | return volume 84 | 85 | 86 | async def rule_from_snapshotrule( 87 | ctx: Context, 88 | resource: SnapshotRule 89 | ) -> Optional[Rule]: 90 | """This tries to build a rule within a `SnapshotRule` resource - 91 | the resource that we custom designed for this purpose. 92 | 93 | This is invoked whenever Kubernetes tells us that such a resource 94 | was created, deleted, or updated. 95 | 96 | There are two separate ways a `SnapshotRule` can be used: 97 | 98 | - A `SnapshotRule` resource can refer to a specific Cloud disk 99 | id to be snapshotted, e.g. 'example-disk' on 'gcloud'. This 100 | skips Kubernetes entirely. 101 | 102 | - A `SnapshotRule` resource can refer to a `PersistentVolumeClaim`. 103 | The disk this claim is bound to is the one we will snapshot. 104 | Rather than defining the snapshot interval etc. as annotations 105 | of the claim, they are defined here, in a separate resource. 106 | """ 107 | _log = _logger.new(resource=resource, rule=resource.obj) 108 | 109 | spec = resource.obj.get('spec', {}) 110 | 111 | # Validate the deltas 112 | try: 113 | deltas_str = resource.obj.get('spec', {}).get('deltas') 114 | try: 115 | deltas = parse_deltas(deltas_str) 116 | except DeltasParseError as exc: 117 | raise AnnotationError( 118 | 'Invalid delta string', 119 | deltas_str=deltas_str 120 | ) from exc 121 | 122 | if deltas is None or not deltas: 123 | raise AnnotationError( 124 | 'parse_deltas returned invalid deltas', 125 | deltas_str=deltas_str, 126 | deltas=deltas, 127 | ) 128 | except AnnotationError: 129 | _log.exception( 130 | 'rule.invalid', 131 | key_hints=['rule.metadata.name'], 132 | ) 133 | return 134 | 135 | # Refers to a disk from a cloud provider 136 | if spec.get('disk'): 137 | # Validate the backend 138 | backend_name = spec.get('backend') 139 | try: 140 | backend = get_backend(backend_name) 141 | except ConfigurationError as e: 142 | _log.exception( 143 | 'rule.invalid', 144 | message=e.message, 145 | backend=backend_name 146 | ) 147 | return 148 | 149 | # Validate the disk identifier 150 | disk = resource.obj.get('spec', {}).get('disk') 151 | try: 152 | disk = backend.validate_disk_identifier(disk) 153 | except ValueError: 154 | _log.exception( 155 | 'rule.invalid', 156 | key_hints=['rule.metadata.name'], 157 | ) 158 | return 159 | 160 | rule = Rule( 161 | name=rule_name_from_k8s_source(resource), 162 | deltas=deltas, 163 | backend=backend_name, 164 | disk=disk 165 | ) 166 | return rule 167 | 168 | # Refers to a volume claim 169 | if spec.get('persistentVolumeClaim'): 170 | 171 | # Find the claim 172 | volume_claim = await get_resource_or_none( 173 | ctx.kube_client, 174 | pykube.objects.PersistentVolumeClaim, 175 | spec.get('persistentVolumeClaim'), 176 | namespace=resource.metadata['namespace'] 177 | ) 178 | 179 | if not volume_claim: 180 | _log.warning( 
181 | events.Rule.PENDING, 182 | reason='Volume claim does not exist', 183 | key_hints=['rule.metadata.name'], 184 | ) 185 | raise RuleDependsOn( 186 | 'The volume claim targeted by this SnapshotRule does not exist yet', 187 | kind='PersistentVolumeClaim', 188 | namespace=resource.metadata['namespace'], 189 | name=spec.get('persistentVolumeClaim') 190 | ) 191 | 192 | # Find the volume 193 | try: 194 | volume = await volume_from_pvc(ctx, volume_claim) 195 | except VolumeNotFound: 196 | _log.warning( 197 | events.Rule.PENDING, 198 | reason='Volume claim is not bound', 199 | key_hints=['rule.metadata.name'], 200 | ) 201 | raise RuleDependsOn( 202 | 'The volume claim targeted by this SnapshotRule is not bound yet', 203 | kind='PersistentVolumeClaim', 204 | namespace=resource.metadata['namespace'], 205 | name=spec.get('persistentVolumeClaim') 206 | ) 207 | 208 | return await rule_from_pv(ctx, volume, deltas, source=resource) 209 | 210 | 211 | async def rule_from_persistent_volume( 212 | ctx: Context, 213 | volume: pykube.objects.PersistentVolume 214 | ) -> Optional[Rule]: 215 | _log = _logger.new(resource=volume) 216 | 217 | volume_name = volume.name 218 | _log = _log.bind( 219 | volume_name=volume_name, 220 | volume=volume.obj, 221 | ) 222 | 223 | try: 224 | _log.debug('Checking volume for deltas') 225 | deltas = get_deltas(volume.annotations, 226 | ctx.config.get('deltas_annotation_key')) 227 | except AnnotationNotFound: 228 | _log.info( 229 | events.Annotation.NOT_FOUND, 230 | key_hints=['volume.metadata.name'] 231 | ) 232 | return 233 | except AnnotationError: 234 | _log.exception( 235 | events.Annotation.ERROR, 236 | key_hints=['volume.metadata.name'], 237 | ) 238 | return 239 | 240 | try: 241 | return await rule_from_pv(ctx, volume, deltas, source=volume) 242 | except UnsupportedVolume as exc: 243 | _log.info( 244 | events.Volume.UNSUPPORTED, 245 | key_hints=['volume.metadata.name'], 246 | exc_info=exc, 247 | ) 248 | 249 | 250 | async def rule_from_persistent_volume_claim( 251 | ctx: Context, 252 | volume_claim: pykube.objects.PersistentVolumeClaim 253 | ) -> Optional[Rule]: 254 | """ 255 | If a `PersistentVolumeClaim` is annotated, we create a rule 256 | based on those annotations, for the disk that the claim is bound to. 257 | 258 | If the claim is currently unbound, we return `None`. We do not 259 | have to worry about being notified of any future binding, since 260 | Kubernetes will update the `PersistentVolumeClaim` resource when 261 | that happens, so we will see that update.
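The deltas themselves are read from the claim's annotations, using the annotation key configured via deltas_annotation_key.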
262 | """ 263 | _log = _logger.new(resource=volume_claim, volume_claim=volume_claim.obj) 264 | 265 | try: 266 | _log.debug('Checking volume claim for deltas') 267 | deltas = get_deltas( 268 | volume_claim.annotations, ctx.config.get('deltas_annotation_key')) 269 | except AnnotationNotFound as exc: 270 | _log.exception( 271 | events.Annotation.NOT_FOUND, 272 | key_hints=['volume_claim.metadata.name'], 273 | ) 274 | return 275 | except AnnotationError: 276 | _log.exception( 277 | events.Annotation.ERROR, 278 | key_hints=['volume_claim.metadata.name'], 279 | ) 280 | return 281 | 282 | try: 283 | volume = await volume_from_pvc(ctx, volume_claim) 284 | except VolumeNotFound: 285 | _log.warning( 286 | events.Rule.PENDING, 287 | reason='Volume claim is not bound', 288 | key_hints=['volume_claim.metadata.name'], 289 | ) 290 | return 291 | 292 | return await rule_from_pv( 293 | ctx, 294 | volume, 295 | deltas=deltas, 296 | source=volume_claim 297 | ) 298 | 299 | 300 | async def rules_from_kubernetes(ctx) -> AsyncIterable[List[Rule]]: 301 | """This generator continuously runs, watching Kubernetes for 302 | certain resources, consuming changes, and determining which 303 | snapshot rules have been defined. 304 | 305 | Every value it returns is a list of `Rule` objects, a complete 306 | set of snapshot rules defined at this point in time. Every set 307 | of rule objects replaces the previous one. 308 | """ 309 | 310 | # These are rules that we are ready to "run". 311 | rules = {} 312 | 313 | # These are resources that we know we have to recheck, because 314 | # they will become rules pending a resource creation. For example: 315 | # A `SnapshotRule` resource points to volume claim. However, this 316 | # volume claim is not yet bound. Once Kubernetes creates the volume, 317 | # we will notify us about creating a `PersistentVolume` and updating 318 | # a `PersistentVolumeClaim`. It will not, however, send us an 319 | # update for the `SnapshotRule` - where the rule is actually 320 | # defined. We thus have to link the rule to the volume. 321 | pending_rules: Dict[Tuple, pykube.objects.APIObject] = {} 322 | 323 | _logger.debug('volume-events.watch') 324 | 325 | merged_stream = stream.merge( 326 | watch_resources(ctx, pykube.objects.PersistentVolume, delay=0), 327 | watch_resources(ctx, pykube.objects.PersistentVolumeClaim, delay=2), 328 | watch_resources(ctx, SnapshotRule, delay=3, allow_missing=True) 329 | ) 330 | 331 | iterable: AsyncIterable[_WatchEvent] = merged_stream.stream() 332 | async with iterable as merged_events: 333 | async for event in merged_events: 334 | 335 | _log = _logger.bind( 336 | event_type=event.type, 337 | event_object=event.object.obj, 338 | ) 339 | _log.info( 340 | events.VolumeEvent.RECEIVED, 341 | key_hints=[ 342 | 'event_type', 343 | 'event_object.metadata.name', 344 | ], 345 | ) 346 | 347 | # This is how we uniquely identify the rule. This is important 348 | # such that when an object is deleted, we delete the correct 349 | # rule. 350 | key_by = ( 351 | event.object.kind, 352 | event.object.namespace, 353 | event.object.name 354 | ) 355 | 356 | events_to_process = [ 357 | (event.type, key_by, event.object) 358 | ] 359 | 360 | # Is there some other object that was depending on *this* 361 | # object? 
362 | if key_by in pending_rules: 363 | depending_object_key, depending_object = pending_rules.pop(key_by) 364 | if event.type != 'DELETED': 365 | events_to_process.append(('MODIFIED', depending_object_key, depending_object)) 366 | 367 | for (event_type, rule_key, resource) in events_to_process: 368 | 369 | # TODO: there is probably a bug here, where for rule deletion 370 | # we should not have to first successfully build the rule; the key 371 | # is enough to delete it. Same with a modification that causes 372 | # the rule to break; we should remove it until fixed. 373 | try: 374 | if isinstance(resource, SnapshotRule): 375 | rule = await rule_from_snapshotrule(ctx, resource) 376 | elif isinstance(resource, pykube.objects.PersistentVolumeClaim): 377 | rule = await rule_from_persistent_volume_claim(ctx, resource) 378 | elif isinstance(resource, pykube.objects.PersistentVolume): 379 | rule = await rule_from_persistent_volume(ctx, resource) 380 | else: 381 | raise RuntimeError(f'{resource} is not supported.') 382 | 383 | except RuleDependsOn as exc: 384 | # We have to remember this so that when we get an 385 | # update for the dependency that we lack here, we 386 | # can process this resource once more. 387 | pending_rules[( 388 | exc.data['kind'], 389 | exc.data['namespace'], 390 | exc.data['name'], 391 | )] = (rule_key, resource) 392 | continue 393 | 394 | if not rule: 395 | continue 396 | 397 | _log = _log.bind( 398 | rule=rule 399 | ) 400 | 401 | if event_type == 'ADDED' or event_type == 'MODIFIED': 402 | if rule: 403 | if event_type == 'ADDED' or rule_key not in rules: 404 | _log.info( 405 | events.Rule.ADDED, 406 | key_hints=['rule.name'] 407 | ) 408 | else: 409 | _log.info( 410 | events.Rule.UPDATED, 411 | key_hints=['rule.name'] 412 | ) 413 | rules[rule_key] = rule 414 | else: 415 | if rule_key in rules: 416 | _log.info( 417 | events.Rule.REMOVED, 418 | key_hints=['volume_name'] 419 | ) 420 | rules.pop(rule_key) 421 | 422 | elif event_type == 'DELETED': 423 | if rule_key in rules: 424 | _log.info( 425 | events.Rule.REMOVED, 426 | key_hints=['volume_name'] 427 | ) 428 | rules.pop(rule_key) 429 | else: 430 | _log.warning('Unhandled event') 431 | 432 | # We usually have duplicate disks within `rules`, 433 | # which is indexed by resource kind. One reason is that we are 434 | # watching both PVCs and PVs, and a PVC/PV pair resolves 435 | # to the same disk. It is also possible that custom rules 436 | # the user defined contain duplicates. Let's make sure 437 | # we only have one rule for every disk. Note that which 438 | # one we pick is undefined. 439 | # 440 | # In the (internal) case of PV/PVC pairs it doesn't matter, 441 | # since our code is written such that the rule always references 442 | # the volume, and we always check the volume, then the claim, 443 | # for deltas. The behaviour for this case is well-defined. 444 | unique_rules = {rule.disk: rule for rule in rules.values()}.values() 445 | # TODO: Log in a different place, in a debounced way 446 | #_logger.info('sync-get-rules.yield', rule_count=len(unique_rules)) 447 | yield list(unique_rules) 448 | 449 | _logger.debug('sync-get-rules.done') 450 | 451 | 452 | async def get_rules(ctx): 453 | _log = _logger.new() 454 | 455 | async for rules in rules_from_kubernetes(ctx): 456 | _log.debug('get-rules.rules.updated', rules=rules) 457 | yield rules 458 | 459 | _log.debug('get-rules.done') 460 | 461 | 462 | async def watch_schedule(ctx, trigger, *, loop=None): 463 | """Continually yields the next backup to be created.
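Each yielded value is the (rule, target time) pair chosen by determine_next_snapshot; the target time can be empty when no backup is currently due to be scheduled.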
464 | 465 | It watches two input sources: the rules as defined by 466 | Kubernetes resources, and the existing snapshots, as returned 467 | from Google Cloud. If either of them changes, a new backup 468 | is scheduled. 469 | """ 470 | loop = loop or asyncio.get_event_loop() 471 | _log = _logger.new() 472 | 473 | 474 | rules_reader = StreamReader(get_rules(ctx)) 475 | snapgen = get_snapshots(ctx, rules_reader.iter(), trigger) 476 | 477 | _log.debug('watch_schedule.start') 478 | 479 | rules = None 480 | 481 | heartbeat_interval_seconds = ctx.config.get( 482 | 'schedule_heartbeat_interval_seconds' 483 | ) 484 | 485 | async def heartbeat(): 486 | _logger.info( 487 | events.Rule.HEARTBEAT, 488 | rules=rules, 489 | ) 490 | 491 | loop.call_later( 492 | heartbeat_interval_seconds, 493 | asyncio.ensure_future, 494 | heartbeat() 495 | ) 496 | 497 | if heartbeat_interval_seconds: 498 | asyncio.ensure_future(heartbeat()) 499 | 500 | combined = combine_latest( 501 | rules=rules_reader.iter(), 502 | snapshots=snapgen, 503 | defaults={'snapshots': None, 'rules': None} 504 | ) 505 | 506 | async for item in combined: 507 | rules = item.get('rules') 508 | snapshots = item.get('snapshots') 509 | 510 | # Never schedule before we have data from both rules and snapshots 511 | if rules is None or snapshots is None: 512 | _log.debug( 513 | 'watch_schedule.wait-for-both', 514 | ) 515 | continue 516 | 517 | yield determine_next_snapshot(snapshots, rules) 518 | 519 | 520 | async def scheduler(ctx, scheduling_chan, snapshot_reload_trigger): 521 | """The "when to make a backup" schedule depends on the backup delta 522 | rules as defined in Kubernetes volume resources, and the existing 523 | snapshots. 524 | 525 | This simply observes a stream of 'next planned backup' events and 526 | sends them to the channel given. Note that this scheduler 527 | doesn't plan multiple backups in advance; only a single 528 | next backup is ever scheduled. 529 | """ 530 | _log = _logger.new() 531 | _log.debug('scheduler.start') 532 | 533 | async for schedule in watch_schedule(ctx, snapshot_reload_trigger): 534 | _log.debug('scheduler.schedule', schedule=schedule) 535 | await scheduling_chan.put(schedule) 536 | 537 | 538 | async def backuper(ctx, scheduling_chan, snapshot_reload_trigger): 539 | """Will take tasks from the given queue, then execute the backup.
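Each task is a (rule, target time) pair; once the target time has passed and the snapshot is still required, make_backup is run and a reload of the snapshot list is triggered.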
540 | """ 541 | _log = _logger.new() 542 | _log.debug('backuper.start') 543 | 544 | current_target_time = current_target_rule = None 545 | while True: 546 | await asyncio.sleep(0.1) 547 | 548 | try: 549 | current_target_rule, current_target_time = scheduling_chan.get_nowait() 550 | 551 | # Log a message 552 | if not current_target_time: 553 | _log.debug('backuper.no-target') 554 | else: 555 | _log.debug( 556 | 'backuper.next-backup', 557 | key_hints=[ 558 | 'rule.name', 559 | 'target_time', 560 | ], 561 | rule=current_target_rule, 562 | target_time=current_target_time, 563 | diff=current_target_time.diff(), 564 | ) 565 | except ChannelEmpty: 566 | pass 567 | 568 | if not current_target_time: 569 | continue 570 | 571 | if pendulum.now('utc') > current_target_time: 572 | try: 573 | if await is_snapshot_required(ctx, current_target_rule): 574 | await make_backup(ctx, current_target_rule) 575 | await snapshot_reload_trigger.put(True) 576 | else: 577 | _log.info('backuper.scheduled_backup_no_longer_required', 578 | rule=current_target_rule, 579 | target_time=current_target_time) 580 | finally: 581 | current_target_time = current_target_rule = None 582 | 583 | 584 | async def daemon(config, *, loop=None): 585 | """Main app; it runs two tasks; one schedules backups, the other 586 | one executes the. 587 | """ 588 | loop = loop or asyncio.get_event_loop() 589 | 590 | ctx = Context(config) 591 | 592 | # Using this channel, we can trigger a refresh of the list of 593 | # disk snapshots in the Google Cloud. 594 | snapshot_reload_trigger = Channel() 595 | 596 | # The backup task consumes this channel for the next backup task. 597 | scheduling_chan = Channel() 598 | 599 | schedule_task = asyncio.ensure_future( 600 | scheduler(ctx, scheduling_chan, snapshot_reload_trigger)) 601 | backup_task = asyncio.ensure_future( 602 | backuper(ctx, scheduling_chan, snapshot_reload_trigger)) 603 | 604 | tasks = [schedule_task, backup_task] 605 | 606 | _logger.debug('Gathering tasks', tasks=tasks) 607 | 608 | try: 609 | await asyncio.gather(*tasks) 610 | except asyncio.CancelledError: 611 | _logger.exception( 612 | 'Received CancelledError', 613 | tasks=tasks 614 | ) 615 | 616 | for task in tasks: 617 | task.cancel() 618 | _logger.debug('daemon cancelled task', task=task) 619 | 620 | while True: 621 | finished, pending = await asyncio.wait( 622 | tasks, 623 | return_when=asyncio.FIRST_COMPLETED) 624 | 625 | _logger.debug( 626 | 'task completed', 627 | finished=finished, 628 | pending=pending) 629 | 630 | if not pending: 631 | _logger.debug('all tasks done') 632 | raise 633 | --------------------------------------------------------------------------------