├── debian
│   ├── compat
│   ├── backurne.manpages
│   ├── copyright
│   ├── backurne.install
│   ├── rules
│   ├── control
│   └── changelog
├── src
│   └── backurne
│       ├── __init__.py
│       ├── __main__.py
│       ├── pretty.py
│       ├── stats.py
│       ├── config.py
│       ├── api.py
│       ├── log.py
│       ├── backup.py
│       ├── proxmox.py
│       ├── disk.py
│       ├── ceph.py
│       ├── restore.py
│       └── backurne.py
├── .gitignore
├── graph_duration.png
├── graph_in_progress.png
├── graph
│   ├── telegraf
│   │   ├── backurne_inprogress
│   │   └── telegraf.conf
│   └── grafana-backurne.json
├── conf
│   ├── uwsgi.ini
│   └── backurne.conf
├── setup.py
├── sample-api.py
├── .pre-commit-config.yaml
├── bash
│   └── backurne
├── api.md
├── Changelog.md
├── man
│   └── backurne.1
├── cli.md
├── README.md
└── LICENSE
--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
1 | 11
2 |
--------------------------------------------------------------------------------
/src/backurne/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.swp
--------------------------------------------------------------------------------
/debian/backurne.manpages:
--------------------------------------------------------------------------------
1 | man/backurne.1
2 |
--------------------------------------------------------------------------------
/src/backurne/__main__.py:
--------------------------------------------------------------------------------
1 | from .backurne import main
2 |
3 | main()
4 |
--------------------------------------------------------------------------------
/graph_duration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackSlateur/backurne/HEAD/graph_duration.png
--------------------------------------------------------------------------------
/graph_in_progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackSlateur/backurne/HEAD/graph_in_progress.png
--------------------------------------------------------------------------------
/graph/telegraf/backurne_inprogress:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ps aux | grep /bin/sh | grep -ci '[i]mport-diff'
--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
1 | License: GPL-2
2 | Copyright: 2017-2019 Alexandre Bruyelles
--------------------------------------------------------------------------------
/debian/backurne.install:
--------------------------------------------------------------------------------
1 | conf/backurne.conf /etc/backurne/
2 | bash/backurne usr/share/bash-completion/completions
--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
1 | #!/usr/bin/make -f
2 |
3 | #export DH_VERBOSE=1
4 |
5 | %:
6 | 	dh $@ --with python3 --buildsystem=pybuild
--------------------------------------------------------------------------------
/graph/telegraf/telegraf.conf:
--------------------------------------------------------------------------------
1 | #
2 | [[inputs.exec]]
3 |   commands = [ "/usr/local/bin/backurne_inprogress" ]
4 |   name_override = "backurne_inprogress"
5 |   data_format = "value"
--------------------------------------------------------------------------------
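The telegraf probe above is a one-line process count. For illustration only, the same measurement can be taken with psutil (already a dependency of `disk.py`); this sketch is not part of the repository and simply counts processes whose command line mentions `import-diff`:

```python
#!/usr/bin/python3
# Illustrative sketch only: count processes whose command line mentions
# "import-diff", mirroring what graph/telegraf/backurne_inprogress greps for.
import psutil


def count_in_progress():
    count = 0
    for proc in psutil.process_iter(attrs=["cmdline"]):
        cmdline = proc.info["cmdline"] or []
        if any("import-diff" in arg for arg in cmdline):
            count += 1
    return count


if __name__ == "__main__":
    print(count_in_progress())
```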
"/usr/local/bin/backurne_inprogress" ] 4 | name_override = "backurne_inprogress" 5 | data_format = "value" 6 | -------------------------------------------------------------------------------- /conf/uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | vhost = backurne-api.fqdn 3 | plugins = python3, syslog 4 | module = backurne.api 5 | callable = app 6 | workers = 4 7 | logger = syslog:uwsgi 8 | uid = root 9 | gid = root 10 | socket = 127.0.0.1:7777 11 | -------------------------------------------------------------------------------- /src/backurne/pretty.py: -------------------------------------------------------------------------------- 1 | from prettytable import PrettyTable 2 | from termcolor import colored 3 | 4 | 5 | def bold(text): 6 | return colored(text, attrs=["bold"]) 7 | 8 | 9 | def Pt(header): 10 | header = [bold(i) for i in header] 11 | pt = PrettyTable(header) 12 | pt.align = "l" 13 | pt.padding_width = 2 14 | return pt 15 | -------------------------------------------------------------------------------- /src/backurne/stats.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | import humanize 4 | 5 | from .ceph import Ceph 6 | from .config import config 7 | 8 | 9 | def print_stats(): 10 | ceph = Ceph(None) 11 | 12 | result = {} 13 | 14 | with multiprocessing.Pool(config["backup_worker"]) as p: 15 | for sizes in p.imap_unordered(ceph.du, ceph.ls()): 16 | sizes = sizes["images"] 17 | for i in sizes: 18 | try: 19 | result[i["name"]] += i["used_size"] 20 | except KeyError: 21 | result[i["name"]] = i["used_size"] 22 | 23 | result = [(k, result[k]) for k in sorted(result, key=result.get)] 24 | for key, value in result: 25 | print(key, humanize.naturalsize(value)) 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="backurne", 8 | version="2.4.0", 9 | author="Alexandre Bruyelles", 10 | author_email="backurne@jack.fr.eu.org", 11 | description="Backup Ceph's RBD on Ceph, with Proxmox integration", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/JackSlateur/backurne", 15 | packages=setuptools.find_packages("src"), 16 | package_dir={"": "src"}, 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 20 | "Operating System :: POSIX :: Linux", 21 | ], 22 | entry_points={ 23 | "console_scripts": [ 24 | "backurne = backurne.backurne:main", 25 | ] 26 | }, 27 | python_requires=">=3.5", 28 | install_requires=[ 29 | "termcolor", 30 | "PTable", 31 | "requests", 32 | "proxmoxer", 33 | "sh", 34 | "python-dateutil", 35 | "filelock", 36 | "setproctitle", 37 | "progressbar", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /sample-api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import json 3 | 4 | from flask import Flask 5 | from flask import request 6 | 7 | app = Flask(__name__) 8 | 9 | 10 | def send_json(data, code=200): 11 | return json.dumps(data), 200, {"Content-Type": "application/json"} 12 | 13 | 14 | @app.route("/", methods=["POST"]) 15 | def profile(): 16 | 
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md") as fh:
4 |     long_description = fh.read()
5 |
6 | setuptools.setup(
7 |     name="backurne",
8 |     version="2.4.0",
9 |     author="Alexandre Bruyelles",
10 |     author_email="backurne@jack.fr.eu.org",
11 |     description="Backup Ceph's RBD on Ceph, with Proxmox integration",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/JackSlateur/backurne",
15 |     packages=setuptools.find_packages("src"),
16 |     package_dir={"": "src"},
17 |     classifiers=[
18 |         "Programming Language :: Python :: 3",
19 |         "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
20 |         "Operating System :: POSIX :: Linux",
21 |     ],
22 |     entry_points={
23 |         "console_scripts": [
24 |             "backurne = backurne.backurne:main",
25 |         ]
26 |     },
27 |     python_requires=">=3.7",
28 |     install_requires=[
29 |         "termcolor",
30 |         "PTable",
31 |         "requests",
32 |         "proxmoxer",
33 |         "sh",
34 |         "python-dateutil",
35 |         "filelock",
36 |         "setproctitle",
37 |         "progressbar",
38 |     ],
39 | )
--------------------------------------------------------------------------------
/sample-api.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | import json
3 |
4 | from flask import Flask
5 | from flask import request
6 |
7 | app = Flask(__name__)
8 |
9 |
10 | def send_json(data, code=200):
11 |     return json.dumps(data), code, {"Content-Type": "application/json"}
12 |
13 |
14 | @app.route("/", methods=["POST"])
15 | def profile():
16 |     # data is fed with something like:
17 |     # {'cluster': {
18 |     #     'fqdn': 'supercluster.fqdn.org', 'name': 'supercluster', 'type': 'proxmox'},
19 |     #  'vm': {'name': 'super-server', 'vmid': 115},
20 |     #  'disk': {'rbd': 'vm-115-disk-1', 'ceph': 'cephcluster'}
21 |     # }
22 |     data = request.get_json()
23 |
24 |     # Add your logic here
25 |     # As a sample, we only add profiles if the VM's name is 'super-server'
26 |     if data["vm"]["name"] == "super-server":
27 |         # A sample output, which is roughly the same as config's profiles
28 |         # Each profile will be added to the config's
29 |         # Thus, there is no replacement nor override
30 |         result = {
31 |             "profiles": {
32 |                 "daily": {
33 |                     "count": 365,
34 |                     "max_on_live": 10,
35 |                 },
36 |                 "hourly": {
37 |                     "count": 48,
38 |                     "max_on_live": 0,
39 |                     "priority": "high",
40 |                 },
41 |             }
42 |         }
43 |     else:
44 |         # An empty dict means "no additional profile"
45 |         result = {}
46 |
47 |     # Additionally, we can disable backups by setting 'backup' to False
48 |     # Any other value is meaningless
49 |     if data["vm"]["vmid"] == 1234:
50 |         result["backup"] = False
51 |
52 |     return send_json(result)
53 |
--------------------------------------------------------------------------------
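Backurne itself is the client of this endpoint: it POSTs a description of the cluster, VM and disk, and merges the returned profiles into the configured ones. A quick way to exercise the sample above — the URL is an assumption, and the payload values are taken from the comments in `profile()`:

```python
# Manual test for sample-api.py; the URL and payload values are assumptions
# matching the comments in profile() above.
import requests

payload = {
    "cluster": {"fqdn": "supercluster.fqdn.org", "name": "supercluster", "type": "proxmox"},
    "vm": {"name": "super-server", "vmid": 115},
    "disk": {"rbd": "vm-115-disk-1", "ceph": "cephcluster"},
}

resp = requests.post("http://localhost:5000/", json=payload)
print(resp.json())  # {'profiles': {'daily': {...}, 'hourly': {...}}}
```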
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | repos:
3 |   - repo: https://github.com/pre-commit/pre-commit-hooks
4 |     rev: v6.0.0
5 |     hooks:
6 |       - id: check-yaml
7 |       - id: end-of-file-fixer
8 |       - id: trailing-whitespace
9 |       - id: check-builtin-literals
10 |       - id: check-added-large-files
11 |       - id: check-ast
12 |       - id: check-toml
13 |       - id: check-case-conflict
14 |       - id: name-tests-test
15 |       - id: detect-private-key
16 |         exclude: |
17 |           (?x)^(
18 |               README.md|
19 |               .config.yml.default
20 |           )$
21 |       - id: pretty-format-json
22 |         args: ['--autofix']
23 |       - id: mixed-line-ending
24 |         args: ['--fix=lf']
25 |         description: Forces replacement of line endings with the UNIX 'lf' character.
26 |   - repo: https://github.com/astral-sh/ruff-pre-commit
27 |     # Ruff version.
28 |     rev: v0.12.10
29 |     hooks:
30 |       # Run the linter.
31 |       - id: ruff
32 |         args: [--fix]
33 |       # Run the formatter.
34 |       - id: ruff-format
35 |         args: ["--line-length", "88"]
36 |   - repo: https://github.com/compilerla/conventional-pre-commit
37 |     rev: v4.2.0
38 |     hooks:
39 |       - id: conventional-pre-commit
40 |         stages: [commit-msg]
41 |         args: []
42 |   - repo: https://github.com/codespell-project/codespell
43 |     rev: v2.4.1
44 |     hooks:
45 |       - id: codespell
46 |         additional_dependencies:
47 |           - tomli
48 |         args:
49 |           - "-L"
50 |           - "BU"
51 |   - repo: https://github.com/asottile/pyupgrade
52 |     rev: v3.20.0
53 |     hooks:
54 |       - id: pyupgrade
55 |   - repo: https://github.com/asottile/reorder-python-imports
56 |     rev: v3.15.0
57 |     hooks:
58 |       - id: reorder-python-imports
59 |         args:
60 |           - --py39-plus
61 |   - repo: https://github.com/google/yamlfmt
62 |     rev: v0.17.2
63 |     hooks:
64 |       - id: yamlfmt
65 |         args:
66 |           - -formatter
67 |           - include_document_start=true
--------------------------------------------------------------------------------
/bash/backurne:
--------------------------------------------------------------------------------
1 | _backurne(){
2 | 	local cur prev opts
3 | 	COMPREPLY=()
4 | 	cur="${COMP_WORDS[COMP_CWORD]}"
5 | 	prev="${COMP_WORDS[COMP_CWORD-1]}"
6 | 	subcmd="${COMP_WORDS[1]}"
7 |
8 | 	if [ "$prev" == "backurne" ]
9 | 	then
10 | 		opts="backup check check-snap list-mapped ls map unmap stats"
11 | 		COMPREPLY=($(compgen -W "$opts" -- ${cur}))
12 | 		return 0
13 | 	fi
14 |
15 | 	if [ "$subcmd" == "unmap" ]
16 | 	then
17 | 		if [ "$prev" == "unmap" ]
18 | 		then
19 | 			opts="$(backurne list-mapped --json | jq -r .[].parent_image)"
20 | 		else
21 | 			prev=$(echo $prev | tr -d '\\')
22 | 			opts="$(backurne list-mapped --json | jq -r ".[] | select(.parent_image | contains(\"$prev\")) | .parent_snap")"
23 | 		fi
24 | 		COMPREPLY=($(compgen -W "$opts" -- ${cur}))
25 | 		return 0
26 | 	fi
27 |
28 | 	if [ "$subcmd" == "ls" ]
29 | 	then
30 | 		if [ "$cur" == "" ]
31 | 		then
32 | 			opts="$(backurne ls --json | jq -r .[].uuid)"
33 | 			COMPREPLY=($(compgen -W "$opts" -- ${cur}))
34 | 		else
35 | 			opts="$(backurne ls --json | jq -r .[].uuid | grep $cur)"
36 | 			COMPREPLY=($opts $(compgen -W "$opts" -- ${cur}))
37 | 		fi
38 | 		return 0
39 | 	fi
40 |
41 | 	if [ "$subcmd" == "map" ]
42 | 	then
43 | 		if [ "$prev" == "map" ]
44 | 		then
45 | 			if [ "$cur" == "" ]
46 | 			then
47 | 				opts="$(backurne ls --json | jq -r .[].uuid)"
48 | 				COMPREPLY=($(compgen -W "$opts" -- ${cur}))
49 | 			else
50 | 				opts="$(backurne ls --json | jq -r .[].uuid | grep $cur)"
51 | 				COMPREPLY=($opts $(compgen -W "$opts" -- ${cur}))
52 | 			fi
53 | 		else
54 | 			prev=$(echo $prev | tr -d '\\')
55 | 			opts="$(backurne ls --json $prev | jq -r .[].uuid)"
56 | 			COMPREPLY=($(compgen -W "$opts" -- ${cur}))
57 | 		fi
58 | 		return 0
59 | 	fi
60 |
61 | 	if [ "$subcmd" == "backup" ]
62 | 	then
63 | 		if [ "$prev" == "--cluster" ]
64 | 		then
65 | 			opts=$(python3 - <
--------------------------------------------------------------------------------
/src/backurne/api.py:
--------------------------------------------------------------------------------
39 | @app.route("/backup/<host>/")
40 | def get(host):
41 |     restore = Restore()
42 |     data = restore.ls()
43 |
44 |     result = []
45 |     for i in data:
46 |         if i["ident"] == host:
47 |             result.append(
48 |                 {
49 |                     "ident": i["ident"],
50 |                     "disk": i["disk"],
51 |                     "uuid": i["uuid"],
52 |                 }
53 |             )
54 |
55 |     return send_json(result)
56 |
57 |
58 | @app.route("/backup/<rbd>/")
59 | def ls_snaps(rbd):
60 |     rbd = urllib.parse.unquote(rbd)
61 |     restore = Restore(rbd)
62 |     data = restore.ls()
63 |
64 |     result = []
65 |     for i in data:
66 |         result.append(
67 |             {
68 |                 "creation_date": str(i["creation"]),
69 |                 "uuid": i["uuid"],
70 |             }
71 |         )
72 |
73 |     return send_json(result)
74 |
75 |
76 | @app.route("/map/<rbd>/<snap>/")
77 | def map(rbd, snap):
78 |     restore = Restore(rbd, snap)
79 |     status = restore.mount()
80 |     if status is None:
send_json({"success": False, "path": None}, code=500) 82 | else: 83 | status = status.replace("/tmp/", "") 84 | return send_json({"success": True, "path": status}) 85 | 86 | 87 | @app.route("/unmap///") 88 | def unmap(rbd, snap): 89 | restore = Restore(rbd, snap) 90 | restore.umount() 91 | return send_json({"success": True}) 92 | 93 | 94 | @app.route("/mapped/") 95 | def mapped(): 96 | data = get_mapped(extended=False) 97 | result = [] 98 | for tree in data: 99 | result.append(prepare_tree_to_json(tree)) 100 | return send_json(result) 101 | 102 | 103 | auto_bp = Blueprint("auto_bp", __name__) 104 | # FIXME: use config or something 105 | AutoIndexBlueprint(auto_bp, browse_root="/tmp/") 106 | 107 | app.register_blueprint(auto_bp, url_prefix="/explore") 108 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: backurne 2 | Section: admin 3 | Priority: optional 4 | Maintainer: Alexandre Bruyelles 5 | Build-Depends: debhelper (>= 11), 6 | dh-python, 7 | python3-setuptools, 8 | python3, 9 | Standards-Version: 4.3.0 10 | Rules-Requires-Root: no 11 | Homepage: https://github.com/JackSlateur/backurne 12 | 13 | Package: backurne 14 | Architecture: any 15 | Depends: python3 (>= 3.7), 16 | python3-termcolor, 17 | python3-pkg-resources, 18 | python3-ptable | python3-prettytable, 19 | python3-requests, 20 | python3-proxmoxer, 21 | python3-sh, 22 | python3-dateutil, 23 | python3-filelock, 24 | python3-setproctitle, 25 | python3-progressbar, 26 | python3-psutil, 27 | python3-humanize, 28 | ceph-common (>= 12.2.0), 29 | rbd-nbd (>= 12.2.0), 30 | kpartx, 31 | xxhash, 32 | ${misc:Depends}, 33 | ${python3:Depends}, 34 | ${shlibs:Depends} 35 | Suggests: jq, 36 | vmfs-tools, 37 | vmfs6-tools 38 | Description: Backup Ceph's RBD on Ceph, with Proxmox integration 39 | backurne is a handy tool for backuping RBD's image on RBD. 40 | Yep ! What is better, for backuping a Ceph cluster, than another Ceph cluster ? 41 | . 42 | It does not do much by itself, though, but orchestrate and relies 43 | heavily on other tools. 44 | It has a fine integration with Proxmox, but is able to backup "plain" 45 | (or "raw RBD") cluster as well. 46 | . 47 | Supported features 48 | - Snapshot-based backup, with no agent strictly required on the VM. 49 | . 50 | - Backup inspection and restoration via command line interface as well as 51 | via REST API. 52 | . 53 | - Support multiple retention policy efficiently (both in term of storage 54 | and network bandwidth), dynamically configurable per host (proxmox-only) 55 | via REST API. 56 | . 57 | - Auto cleanup : deletion is never generated by a human, thus no human 58 | mistakes. 59 | . 60 | - Compression and encryption "on the wire" for enhanced efficiency 61 | and security. 62 | . 63 | - Peaceful integration with other snapshots (via Proxmox web interface 64 | or whatever). 65 | . 66 | - Multiple cluster support, with mixed type ("proxmox" and "plain"). 67 | . 68 | - A couple of backups can be stored on the live clusters, for faster recovery. 69 | . 70 | - Optional fsfreeze support (proxmox-only) via Qemu-quest-agent. 71 | . 72 | - Backup deactivation via Proxmox's web interface. 73 | . 74 | - VM tracking, for those who uses a single Proxmox cluster with 75 | multiple Ceph backend. 76 | . 
/debian/control:
--------------------------------------------------------------------------------
1 | Source: backurne
2 | Section: admin
3 | Priority: optional
4 | Maintainer: Alexandre Bruyelles <backurne@jack.fr.eu.org>
5 | Build-Depends: debhelper (>= 11),
6 |                dh-python,
7 |                python3-setuptools,
8 |                python3,
9 | Standards-Version: 4.3.0
10 | Rules-Requires-Root: no
11 | Homepage: https://github.com/JackSlateur/backurne
12 |
13 | Package: backurne
14 | Architecture: any
15 | Depends: python3 (>= 3.7),
16 |          python3-termcolor,
17 |          python3-pkg-resources,
18 |          python3-ptable | python3-prettytable,
19 |          python3-requests,
20 |          python3-proxmoxer,
21 |          python3-sh,
22 |          python3-dateutil,
23 |          python3-filelock,
24 |          python3-setproctitle,
25 |          python3-progressbar,
26 |          python3-psutil,
27 |          python3-humanize,
28 |          ceph-common (>= 12.2.0),
29 |          rbd-nbd (>= 12.2.0),
30 |          kpartx,
31 |          xxhash,
32 |          ${misc:Depends},
33 |          ${python3:Depends},
34 |          ${shlibs:Depends}
35 | Suggests: jq,
36 |           vmfs-tools,
37 |           vmfs6-tools
38 | Description: Backup Ceph's RBD on Ceph, with Proxmox integration
39 |  backurne is a handy tool for backing up RBD images on RBD.
40 |  Yep! What is better, for backing up a Ceph cluster, than another Ceph cluster?
41 |  .
42 |  It does not do much by itself, though, but orchestrates and relies
43 |  heavily on other tools.
44 |  It has a fine integration with Proxmox, but is able to backup "plain"
45 |  (or "raw RBD") clusters as well.
46 |  .
47 |  Supported features
48 |  - Snapshot-based backup, with no agent strictly required on the VM.
49 |  .
50 |  - Backup inspection and restoration via command line interface as well as
51 |  via REST API.
52 |  .
53 |  - Supports multiple retention policies efficiently (both in terms of storage
54 |  and network bandwidth), dynamically configurable per host (proxmox-only)
55 |  via REST API.
56 |  .
57 |  - Auto cleanup: deletion is never generated by a human, thus no human
58 |  mistakes.
59 |  .
60 |  - Compression and encryption "on the wire" for enhanced efficiency
61 |  and security.
62 |  .
63 |  - Peaceful integration with other snapshots (via Proxmox web interface
64 |  or whatever).
65 |  .
66 |  - Multiple cluster support, with mixed types ("proxmox" and "plain").
67 |  .
68 |  - A couple of backups can be stored on the live clusters, for faster recovery.
69 |  .
70 |  - Optional fsfreeze support (proxmox-only) via Qemu-guest-agent.
71 |  .
72 |  - Backup deactivation via Proxmox's web interface.
73 |  .
74 |  - VM tracking, for those who use a single Proxmox cluster with
75 |  multiple Ceph backends.
76 |  .
77 |  - Encryption and compression at rest are also seamlessly supported
78 |  via Bluestore OSDs (see https://ceph.com/community/new-luminous-bluestore/)
--------------------------------------------------------------------------------
/api.md:
--------------------------------------------------------------------------------
1 | # Rest API documentation
2 |
3 | #### Note
4 | No authentication or authorization is performed in any way. You should use a proxy, with basic auth and TLS.\
5 | Lastly, the API code **must** be run as root (well, it must have CAP_SYS_ADMIN), because it will handle block devices, mount filesystems etc.
6 |
7 | ## Listing backed up disks
8 | ```
9 | 12% [jack@jack:~]curl -s http://localhost:5000/backup/ | python -mjson.tool
10 | [
11 |     {
12 |         "disk": "vm-136-disk-1",
13 |         "ident": "test-backurne",
14 |         "uuid": "8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne"
15 |     }
16 | ]
17 | ```
18 |
19 | ## Listing snapshots for a disk
20 | ```
21 | 11% [jack@jack:~]curl -s "http://localhost:5000/backup/8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne/" | python -mjson.tool
22 | [
23 |     {
24 |         "creation_date": "2018-06-01 15:44:26.072348",
25 |         "uuid": "backup;daily;30;2018-06-01T15:44:26.072348"
26 |     },
27 |     {
28 |         "creation_date": "2018-06-01 15:44:26.499066",
29 |         "uuid": "backup;hourly;48;2018-06-01T15:44:26.499066"
30 |     }
31 | ]
32 | ```
33 |
34 | ## Map a snapshot
35 | ```
36 | 11% [jack@jack:~]curl -s "http://localhost:5000/map/8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne/backup;hourly;48;2018-06-01T15:44:26.499066/" | python -mjson.tool
37 | {
38 |     "path": "tmp4_6ipuaw",
39 |     "success": true
40 | }
41 | ```
42 | The files can then be explored via a webgui at http://localhost:5000/explore/tmp4_6ipuaw/
43 |
44 | ## Listing currently mounted snapshots
45 | ```
46 | 11% [jack@jack:~]curl -s "http://localhost:5000/mapped/" | python -mjson.tool
47 | [
48 |     {
49 |         "dev": "/dev/nbd0",
50 |         "fstype": null,
51 |         "mountpoint": null,
52 |         "image": "restore-1",
53 |         "parent_image": "8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1",
54 |         "parent_snap": "test-backurne/backup;hourly;48;2018-06-01T15:44:26.499066",
55 |         "mapped": null,
56 |         "size": null,
57 |         "children": [
58 |             {
59 |                 "dev": "/dev/nbd0",
60 |                 "fstype": null,
61 |                 "mountpoint": null,
62 |                 "image": null,
63 |                 "parent_image": null,
64 |                 "parent_snap": null,
65 |                 "mapped": null,
66 |                 "size": "20G",
67 |                 "children": [
68 |                     {
69 |                         "dev": "/dev/nbd0p1",
70 |                         "fstype": "xfs",
71 |                         "mountpoint": "/tmp/tmp4_6ipuaw",
72 |                         "image": null,
73 |                         "parent_image": null,
74 |                         "parent_snap": null,
75 |                         "mapped": null,
76 |                         "size": "20G",
77 |                         "children": []
78 |                     }
79 |                 ]
80 |             }
81 |         ]
82 |     }
83 | ]
84 | ```
85 |
86 | ## Cleaning things up
87 | ```
88 | 18% [jack@jack:~]curl -s "http://localhost:5000/unmap/8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne/backup;hourly;48;2018-06-01T15:44:26.499066/" | python -mjson.tool
89 | {
90 |     "success": true
91 | }
92 | ```
--------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
1 | backurne (2.4.0) UNRELEASED; urgency=medium
2 |
3 |   * Add support for RBD namespaces
4 |
5 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Thu, 28 Aug 2025 12:54:19 +0200
6 |
7 | backurne (2.3.3) UNRELEASED; urgency=medium
8 |
9 |   * Improve the API
10 |
11 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Wed, 16 Jul 2025 15:30:19 +0100
12 |
13 | backurne (2.3.2) UNRELEASED; urgency=medium
14 |
15 |   * Avoid deprecated use of `rbd nbd` command
16 |
17 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Thu, 22 May 2025 17:08:19 +0100
18 |
19 | backurne (2.3.1) UNRELEASED; urgency=medium
20 |
21 |   * Support python 3.12
22 |
23 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Sat, 13 Jul 2024 17:08:19 +0100
24 |
25 | backurne (2.3.0) UNRELEASED; urgency=medium
26 |
27 |   * Report time to influxdb
28 |   * Support Microsoft dynamic disks (LDM)
29 |   * Implement task priorities
30 |   * Reimplement the worker logic, with a per-cluster pool
31 |
32 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Sat, 30 Jul 2022 11:08:19 +0100
33 |
34 | backurne (2.2.1) UNRELEASED; urgency=medium
35 |
36 |   * gzip has been replaced by zstd.
37 |   * fix unmap when an LV is spread across multiple PVs inside the same vmdk
38 |   * per-backup progress is now shown in the proctitle
39 |   * add a warning if some snapshots could not be deleted in time
40 |
41 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Mon, 12 Apr 2021 10:01:19 +0100
42 |
43 | backurne (2.2.0) UNRELEASED; urgency=medium
44 |
45 |   * add a --cleanup option to the `backup` subcommand.
46 |   * fix vmfs6 support.
47 |   * add a --debug option for one-shot verbosity.
48 |   * rework the `map` subcommand with enhancements to the vmdk support (especially in conjunction with lvm).
49 |   * 'Plain' clusters can now be reached not only via SSH, but also via any user-defined way. Kubernetes is the main target here, yet it should work with anything.
50 |
51 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Sat, 24 Aug 2020 09:40:19 +0100
52 |
53 | backurne (2.1.0) UNRELEASED; urgency=medium
54 |
55 |   * Backing up only a subset of images in a `backurne backup` invocation is now possible, as well as forcing a backup (even if it is considered unneeded according to the profile).
56 |   * **Backurne** now reports the time elapsed to process each backup, either to a plain file or via syslog. See the `report_time` configuration entry.
57 |
58 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Sat, 04 Jul 2020 13:50:19 +0100
59 |
60 | backurne (2.0.0) UNRELEASED; urgency=medium
61 |
62 |   * The `list-mapped` subcommand has been reworked to support complex mappings. Command outputs (both cli & api) have been altered to support those changes.
63 |   * **Backurne** now supports LVM. See README.md for its specific configuration.
64 |   * **Backurne** now supports vmware. Also see README.md.
65 |
66 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Mon, 13 Jan 2020 13:40:19 +0100
67 |
68 | backurne (1.1.0) UNRELEASED; urgency=medium
69 |
70 |   * Add hook support
71 |
72 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Mon, 01 Dec 2019 13:40:19 +0100
73 |
74 | backurne (1.0.0) UNRELEASED; urgency=medium
75 |
76 |   * First release
77 |
78 |  -- Alexandre Bruyelles <backurne@jack.fr.eu.org>  Mon, 30 Sep 2019 23:40:19 +0100
--------------------------------------------------------------------------------
/Changelog.md:
--------------------------------------------------------------------------------
1 | PENDING
2 | ---
3 |
4 | **Notable changes**:
5 |
6 | Version 2.4.0
7 | ---
8 |
9 | **Notable changes**:
10 | * Add support for RBD namespaces
11 |
12 | Version 2.3.3
13 | ---
14 |
15 | **Notable changes**:
16 | * Improve the API
17 |
18 | Version 2.3.2
19 | ---
20 |
21 | **Notable changes**:
22 | * Avoid deprecated use of `rbd nbd` command
23 |
24 | Version 2.3.1
25 | ---
26 |
27 | **Notable changes**:
28 | * Support python 3.12
29 |
30 | Version 2.3.0
31 | ---
32 |
33 | **Notable changes**:
34 | * Report time to influxdb
35 | * Support Microsoft dynamic disks (LDM)
36 | * Implement task priorities
37 | * Reimplement the worker logic, with a per-cluster pool
38 |
39 | Thanks to Cyllene (https://www.groupe-cyllene.com/) for sponsoring this work!
40 |
41 | Version 2.2.1
42 | ---
43 |
44 | **Notable changes**:
45 | * gzip has been replaced by zstd.
46 | * fix unmap when an LV is spread across multiple PVs inside the same vmdk
47 | * per-backup progress is now shown in the proctitle
48 | * add a warning if some snapshots could not be deleted in time
49 |
50 | Thanks to Cyllene (https://www.groupe-cyllene.com/) for sponsoring this work!
51 |
52 |
53 |
54 | Version 2.2.0
55 | ---
56 |
57 | **Notable changes**:
58 | * add a --cleanup option to the `backup` subcommand.
59 | * fix vmfs6 support.
60 | * add a --debug option for one-shot verbosity.
61 | * rework the `map` subcommand with enhancements to the vmdk support (especially in conjunction with lvm).
62 | * 'Plain' clusters can now be reached not only via SSH, but also via any user-defined way. Kubernetes is the main target here, yet it should work with anything.
63 |
64 | Version 2.1.0
65 | ---
66 |
67 | **Notable changes**:
68 | * Backing up only a subset of images in a `backurne backup` invocation is now possible, as well as forcing a backup (even if it is considered unneeded according to the profile). See [cli.md](cli.md).
69 | * **Backurne** now reports the time elapsed to process each backup, either to a plain file or via syslog. See the `report_time` configuration entry.
70 |
71 | Version 2.0.0
72 | ---
73 |
74 | **Notable changes**:
75 | * The `list-mapped` subcommand has been reworked to support complex mappings. Command outputs (both cli & api) have been altered to support those changes.
76 | * **Backurne** now supports LVM. See [README.md](README.md) for its specific configuration.
77 | * **Backurne** now supports vmware. Also see [README.md](README.md).
78 |
79 | Version 1.1.0
80 | ---
81 |
82 | **Notable changes**:
83 | * **Backurne** now supports a hook infrastructure. Actions can be performed before and after specific events: for instance, stopping a database slave before backup, and starting it after.
84 |
85 | Version 1.0.0
86 | ---
87 |
88 | This version is centered around ease of use and reporting. The core algorithm has not changed much, but the release is supposed to be easier for people to use, simpler to understand etc.
89 |
90 | **Notable changes**:
91 | * **Backurne** now supports per-image locks. Multiple **Backurne** instances can now run at the same time, safely. However, worker count is per instance (backup_worker and live_worker).
92 | * The source tree has been reworked to use python3-setuptools. Debian packaging is supported, for easier install / updates.
93 | * Status reporting has been greatly improved: output is more concise, progress is shown as much as possible. Each process's current task is shown in **ps**, **htop** etc.
94 | * Options parsing has been reworked and is more bulletproof.
95 |
--------------------------------------------------------------------------------
/src/backurne/log.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging.handlers
3 | import sys
4 | import syslog
5 |
6 | from termcolor import colored
7 |
8 | from .config import config
9 |
10 |
11 | class ConsoleFormatter(logging.Formatter):
12 |     def format(self, record):
13 |         if record.levelno == logging.DEBUG:
14 |             msg = (
15 |                 f"[{record.filename}:{record.lineno}:{record.funcName}()] {record.msg}"
16 |             )
17 |         else:
18 |             msg = record.msg
19 |         if record.levelno >= logging.ERROR:  # also covers CRITICAL
20 |             front = colored(" CRIT: ", "red")
21 |         if record.levelno == logging.WARNING:
22 |             front = colored(" WARN: ", "yellow")
23 |         if record.levelno == logging.INFO:
24 |             front = colored(" INFO: ", "green")
25 |         if record.levelno == logging.DEBUG:
26 |             front = colored(" DEBUG: ", "green")
27 |
28 |         msg = f"{front}{msg}"
29 |
30 |         record.msg = msg
31 |
32 |         return logging.Formatter.format(self, record)
33 |
34 |
35 | def report_to_influx(image, endpoint, duration):
36 |     from influxdb import InfluxDBClient
37 |
38 |     conf = config["influxdb"]
39 |
40 |     if conf["host"] is None or conf["db"] is None:
41 |         log.warning("influxdb: host or db are not defined, cannot do proper reporting")
42 |         return
43 |
44 |     if conf["mtls"] is None:
45 |         influx = InfluxDBClient(
46 |             conf["host"],
47 |             conf["port"],
48 |             database=conf["db"],
49 |             ssl=conf["tls"],
50 |             verify_ssl=conf["verify_tls"],
51 |         )
52 |     else:
53 |         influx = InfluxDBClient(
54 |             conf["host"],
55 |             conf["port"],
56 |             database=conf["db"],
57 |             ssl=conf["tls"],
58 |             verify_ssl=conf["verify_tls"],
59 |             cert=conf["mtls"],
60 |         )
61 |
62 |     data = [
63 |         {
64 |             "measurement": "backurne",
65 |             "tags": {
66 |                 "image": image,
67 |                 "endpoint": endpoint,
68 |             },
69 |             "time": datetime.datetime.now().replace(microsecond=0).isoformat(),
70 |             "fields": {
71 |                 "duration": int(duration.total_seconds()),
72 |             },
73 |         }
74 |     ]
75 |
76 |     influx.write_points(data)
77 |
78 |
79 | def report_time(image, endpoint, duration):
80 |     if config["report_time"] is None:
81 |         return
82 |
83 |     msg = f"Image {image} from {endpoint} backed up, elapsed time: {duration}"
84 |     msg = f"{datetime.datetime.now()}: {msg}"
85 |     if config["report_time"] == "syslog":
86 |         syslog.syslog(syslog.LOG_INFO, msg)
87 |     elif config["report_time"] == "influxdb":
88 |         report_to_influx(image, endpoint, duration)
89 |     else:
90 |         with open(config["report_time"], "a") as f:
91 |             f.write(f"{msg}\n")
92 |
93 |
94 | def has_debug(log):
95 |     return log.level == logging.DEBUG
96 |
97 |
98 | log = logging.getLogger("backurne")
99 |
100 | slog = logging.handlers.SysLogHandler(address="/dev/log")
101 | detailed_formatter = logging.Formatter(
102 |     "%(name)s[%(process)d]: %(levelname)s: [%(filename)s:%(lineno)s:%(funcName)s()] %(message)s"
103 | )
104 | slog.setFormatter(detailed_formatter)
105 | log.addHandler(slog)
106 |
107 | if sys.stdout.isatty():
108 |     console = logging.StreamHandler()
109 |     if config["pretty_colors"] is True:
110 |         console.setFormatter(ConsoleFormatter())
111 |     log.addHandler(console)
112 |
113 | if config["log_level"] == "debug":
114 |     log.setLevel(logging.DEBUG)
115 | elif config["log_level"] == "info":
116 |     log.setLevel(logging.INFO)
117 | elif config["log_level"] == "warn":
118 |     log.setLevel(logging.WARNING)
119 | else:
120 |     log.setLevel(logging.ERROR)
--------------------------------------------------------------------------------
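`report_time()` is invoked by the backup path (not part of this excerpt) with the image name, the endpoint and a `datetime.timedelta`; depending on the `report_time` configuration key, the measurement goes to syslog, to InfluxDB, or is appended to a plain file. An assumed call pattern:

```python
# Assumed call pattern for report_time(); the real caller lives in the
# backup path, which is not part of this excerpt.
import datetime

from backurne.log import report_time

start = datetime.datetime.now()
# ... run the actual export/import here ...
duration = datetime.datetime.now() - start

# With config["report_time"] = "syslog", this logs one line via syslog;
# with "influxdb", it writes a "backurne" measurement point instead.
report_time("vm-136-disk-1", "infraceph1", duration)
```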
/src/backurne/backup.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import dateutil.parser
4 |
5 | from .config import config
6 | from .log import log as Log
7 |
8 |
9 | class Bck:
10 |     def __init__(self, name, ceph, rbd, vm=None, adapter=None):
11 |         self.name = name
12 |         self.ceph = ceph
13 |         self.rbd = rbd
14 |         self.vm = vm
15 |         self.adapter = adapter
16 |
17 |         self.source = f"{self.name}:{self.rbd}"
18 |
19 |         self.dest = self.__build_dest()
20 |
21 |         # Store here the last snapshot created via this object
22 |         # It is not yet on the backup cluster
23 |         self.last_created_snap = None
24 |
25 |     def __str__(self):
26 |         if self.vm is not None:
27 |             return "{}/{}".format(self.vm["name"], self.rbd)
28 |         else:
29 |             return "{}/{}".format(self.name, self.rbd)
30 |
31 |     def __build_dest(self):
32 |         ident = self.name
33 |         comment = None
34 |
35 |         if self.vm is not None:
36 |             comment = self.vm["name"]
37 |             if self.vm["px"].px_config["use_smbios"]:
38 |                 if self.vm["smbios"] is not None:
39 |                     ident = self.vm["smbios"]
40 |             dest = f"{ident};{self.adapter};{comment}"
41 |             return dest
42 |
43 |         dest = f"{ident};{self.rbd};{comment}"
44 |         return dest
45 |
46 |     def __snap_name(self, profile, value):
47 |         name = f"{profile};{value}"
48 |         Log.debug(f"Processing {self.source} ({name})")
49 |         name = f"{config['snap_prefix']};{name}"
50 |         return name
51 |
52 |     def __last_snap_profile(self, profile):
53 |         snaps = self.ceph.backup.snap(self.dest)
54 |         good = []
55 |         for snap in snaps:
56 |             split = snap.split(";")
57 |             if split[1] != profile:
58 |                 continue
59 |             good.append(snap)
60 |         return self.ceph.get_last_snap(good)
61 |
62 |     def dl_snap(self, snap_name, dest, last_snap):
63 |         Log.debug(f"Exporting {self.source} {snap_name}")
64 |         if not self.ceph.backup.exists(dest):
65 |             # Create a dummy image, on our backup cluster,
66 |             # which will receive a full snapshot
67 |             self.ceph.backup("create", dest, "-s", "1")
68 |
69 |         self.ceph.do_backup(self.rbd, snap_name, dest, last_snap)
70 |         Log.debug(f"Export {self.source} {snap_name} complete")
71 |
72 |     def check_profile(self, profile):
73 |         try:
74 |             last_profile = self.__last_snap_profile(profile)
75 |         except Exception:
76 |             # Image does not exist?
77 |             return True
78 |
79 |         if profile == "daily":
80 |             delta = datetime.timedelta(days=1)
81 |         elif profile == "hourly":
82 |             delta = datetime.timedelta(hours=1)
83 |         elif profile == "monthly":
84 |             delta = datetime.timedelta(days=30)
85 |         else:  # weekly
86 |             delta = datetime.timedelta(days=7)
87 |         not_after = datetime.datetime.now() - delta
88 |         if last_profile is not None:
89 |             last_time = last_profile.split(";")[3]
90 |             last_time = dateutil.parser.parse(last_time)
91 |             if last_time > not_after:
92 |                 Log.debug("Our last backup is still young, nothing to do")
93 |                 return False
94 |         return True
95 |
96 |     def make_snap(self, profile, value):
97 |         dest = self.dest
98 |         self.snap_name = self.__snap_name(profile, value)
99 |
100 |         self.ceph.backup.update_desc(self.source, dest)
101 |
102 |         last_snap = None
103 |         if self.last_created_snap is not None:
104 |             last_snap = self.last_created_snap
105 |         elif len(self.ceph.snap(self.rbd)) == 0:
106 |             Log.debug(f"No snaps found on {self.source}")
107 |         elif not self.ceph.backup.exists(dest):
108 |             Log.debug(f"backup:{dest} does not exist")
109 |         elif len(self.ceph.backup.snap(dest)) == 0:
110 |             Log.debug(f"No snaps found for backup:{dest}")
111 |         else:
112 |             last_snap = self.ceph.get_last_shared_snap(self.rbd, dest)
113 |
114 |         if last_snap is None:
115 |             Log.debug(f"{self.source}: doing full backup")
116 |         else:
117 |             Log.debug(f"{self.source}: doing incremental backup based on {last_snap}")
118 |
119 |         now = datetime.datetime.now().isoformat()
120 |         snap_name = f"{self.snap_name};{now}"
121 |         self.last_created_snap = snap_name
122 |
123 |         self.ceph.mk_snap(self.rbd, snap_name, self.vm)
124 |
125 |         return dest, last_snap, snap_name
--------------------------------------------------------------------------------
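Snapshots are thus named `<snap_prefix>;<profile>;<count>;<ISO timestamp>`: `check_profile()` relies on field 3 being a parseable date, and compares it against `now - delta`. Dissecting a real name taken from api.md:

```python
# Parse a snapshot name in the format produced by __snap_name() plus the
# timestamp appended by make_snap(); the example value comes from api.md.
import datetime

import dateutil.parser

name = "backup;daily;30;2018-06-01T15:44:26.072348"
prefix, profile, count, created = name.split(";")

age = datetime.datetime.now() - dateutil.parser.parse(created)
fresh = age < datetime.timedelta(days=1)  # the "daily" rule in check_profile()
print(profile, count, created, "fresh:", fresh)
```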
/man/backurne.1:
--------------------------------------------------------------------------------
1 | .TH backurne 1 "28 Aug 2025" "2.4.0" "backurne man page"
2 | .SH NAME
3 | backurne \- backup Ceph's RBD on Ceph, with Proxmox integration
4 | .SH SYNOPSIS
5 | .B backurne
6 | .RI backup
7 | .I [--cluster <cluster>]
8 | .I [--vmid <vmid>]
9 | .I [--profile <profile>]
10 | .I [--force]
11 | .I [--no-cleanup]
12 | .I [--cleanup]
13 | .br
14 | .B backurne
15 | .RI precheck
16 | .br
17 | .B backurne
18 | .RI check
19 | .br
20 | .B backurne
21 | .RI check-snap
22 | .br
23 | .B backurne
24 | .RI ls
25 | .I [rbd]
26 | .I [--json]
27 | .br
28 | .B backurne
29 | .RI map
30 | .I rbd
31 | .I snapshot
32 | .I [vmdk]
33 | .br
34 | .B backurne
35 | .RI unmap
36 | .I rbd
37 | .I snapshot
38 | .br
39 | .B backurne
40 | .RI list-mapped
41 | .I [--json]
42 | .br
43 | .B backurne
44 | .RI version
45 |
46 | .SH DESCRIPTION
47 | backurne is a handy tool for backing up RBD images on RBD.
48 | .br
49 | .SH COMMAND DESCRIPTIONS
50 | .IP "\fBbackup\fR" 4
51 | backup
52 | .IP
53 | Create backups.
54 | .br
55 | By default,
56 | .B Backurne
57 | connects to every cluster defined in its configuration, processes the required backups and scans every existing backup for potential cleanup.
58 | .br
59 | You can process only a subset of images using a combination of
60 | .I --cluster, --vmid
61 | and
62 | .I --profile.
63 | .br
64 | Please note that
65 | .I --vmid
66 | cannot be used without
67 | .I --cluster,
68 | because those IDs are not globally unique.
69 | .br
70 | Also, when you process only such a subset, cleaning up existing backups is not done. You may use the
71 | .I --cleanup
72 | option to change that behavior.
73 | .br
74 | Similarly, if you do not want to clean up with a simple invocation, you can use the
75 | .I --no-cleanup
76 | option.
77 | .IP "\fBprecheck\fR" 4
78 | precheck
79 | .IP
80 | Preprocesses check results. Checking the current status of backups is a relatively slow operation, which makes it unfriendly for checks using icinga2 or other nagios-like probes. Prechecking is supposed to be done regularly via cron.
81 | .IP "\fBcheck\fR" 4
82 | check
83 | .IP
84 | Print the status of backups, preprocessed by the \fBprecheck\fR sub-command. Any image not backed up for too long is reported as an error. The command produces its return code in a nagios-like fashion.
85 | .IP "\fBcheck-snap\fR" 4
86 | check-snap
87 | .IP
88 | Check the coherency of backups, using a hash comparison between backups and production data. This is a very slow operation, as it reads 100% of the backup storage.
89 | .IP "\fBls\fR" 4
90 | ls
91 | .I [rbd]
92 | .I [--json]
93 | .IP
94 | List backed-up images.
95 | .br
96 | Using the
97 | .I [rbd]
98 | argument, you can list backups for a specific image.
99 | .br
100 | The command produces human-friendly output by default.
101 | .br
102 | The
103 | .I [--json]
104 | argument lets it produce machine-readable json.
105 | .IP "\fBmap\fR" 4
106 | map
107 | .I rbd
108 | .I snapshot
109 | .I [vmdk]
110 | .IP
111 | Map a snapshot. A mapped snapshot allows the user to explore and restore parts of a snapshot. Volatile modifications are seamlessly allowed (the backup is never modified).
112 | .br
113 | .B Backurne
114 | will make the backup visible in /dev. Also, partitions found in the backup (if present) are also mapped.
115 | .br
116 | Finally, if a single partition is found (or no partition table),
117 | .B backurne
118 | will try to mount that filesystem using
119 | .B mount(8).
120 | .br
121 | If
122 | .B backurne
123 | did not manage to mount a filesystem, a message is printed and the user is left with manual handling. As printed, the backup shall be present at /dev/nbdX, and partitions (if any) at /dev/mapper/nbdXpY.
124 | .br
125 | By default, vmdk files are not mapped. The vmfs block device will be mounted, but vmdk will only be listed.
126 | .br
127 | You can use the optional
128 | .I vmdk
129 | parameter to also map a vmdk.
130 | .IP "\fBunmap\fR" 4
131 | unmap
132 | .I rbd
133 | .I snapshot
134 | .IP
135 | Unmap a previously mapped snapshot, removing volatile modifications in the process.
136 | .br
137 | If the mapping was not fully handled by
138 | .I backurne,
139 | the user must clean up their own actions before calling
140 | .I unmap.
141 | If not,
142 | .I unmap
143 | will fail and will have to be rerun.
144 | .IP "\fBlist-mapped\fR" 4
145 | list-mapped
146 | .I [--json]
147 | .IP
148 | List mapped backups.
149 | .br
150 | By default, a human-friendly output is produced. Use
151 | .I --json
152 | for machine-readable output.
153 | .IP "\fBversion\fR" 4
154 | version
155 | .IP
156 | Print the current version
157 |
158 | .SH DEBUGGING
159 | .P
160 | You can pass
161 | .I --debug
162 | as a first option to increase verbosity, or increase the
163 | .I log_level
164 | entry in the configuration.
165 |
166 |
167 | .SH BUGS
168 | No known bugs.
169 | .SH AUTHOR
170 | Alexandre Bruyelles <backurne@jack.fr.eu.org>
171 |
--------------------------------------------------------------------------------
/src/backurne/proxmox.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from proxmoxer import ProxmoxAPI
4 |
5 | from .backup import Bck
6 | from .ceph import Ceph
7 | from .config import config
8 | from .log import log as Log
9 |
10 |
11 | class Proxmox:
12 |     def __init__(self, px_config):
13 |         self.px_config = px_config
14 |         self.px = ProxmoxAPI(
15 |             px_config["fqdn"],
16 |             user=px_config["user"],
17 |             password=px_config["passwd"],
18 |             verify_ssl=px_config["tls"],
19 |         )
20 |         self.ceph_storage = self.__get_ceph_storage()
21 |
22 |     def __get_ceph_endpoint(self, storage):
23 |         px = self.px_config["name"]
24 |         ceph = storage
25 |         if px in config["ceph_endpoint"]:
26 |             if ceph in config["ceph_endpoint"][px]:
27 |                 return config["ceph_endpoint"][px][ceph]
28 |         if "default" in config["ceph_endpoint"]:
29 |             if ceph in config["ceph_endpoint"]["default"]:
30 |                 return config["ceph_endpoint"]["default"][ceph]
31 |         return storage
32 |
33 |     def __get_ceph_storage(self):
34 |         result = {}
35 |         for storage in self.px.storage.get():
36 |             if storage["type"] != "rbd":
37 |                 continue
38 |             name = storage["storage"]
39 |             endpoint = self.__get_ceph_endpoint(name)
40 |             result[name] = Ceph(
41 |                 storage["pool"], namespace=storage.get("namespace"), endpoint=endpoint
42 |             )
43 |         return result
44 |
45 |     def nodes(self):
46 |         nodes = self.px.nodes.get()
47 |         return [i["node"] for i in nodes]
48 |
49 |     def vms(self):
50 |         vms = []
51 |         for vm in self.px.cluster.resources.get(type="vm"):
52 |             self.vmid = vm["vmid"]
53 |             vm["px"] = self
54 |             vm["config"] = self.px.nodes(vm["node"]).qemu(vm["vmid"]).pending.get()
55 |             tmp = {}
56 |             for i in vm["config"]:
57 |                 if "value" not in i:
58 |                     continue
59 |                 tmp[i["key"]] = i["value"]
60 |             vm["config"] = tmp
61 |             vm["smbios"] = self.get_smbios(vm["config"])
62 |             vm["to_backup"] = []
63 |             for disk in self.get_disks(vm["config"]):
64 |                 ceph = self.ceph_storage[disk["ceph"]]
65 |                 bck = Bck(
66 |                     disk["ceph"], ceph, disk["rbd"], vm=vm, adapter=disk["adapter"]
67 |                 )
68 |                 vm["to_backup"].append([disk, ceph, bck])
69 |             if "agent" in vm["config"]:
70 |                 vm["qemu_agent"] = vm["config"]["agent"]
71 |             vms.append(vm)
72 |         return vms
73 |
74 |     def get_smbios(self, conf):
75 |         for key, value in conf.items():
76 |             if not re.match("^smbios", key):
77 |                 continue
78 |             return value.split("=")[1]
79 |         return None
80 |
81 |     def __extract_disk(self, key, value):
82 |         disk = False
83 |         if re.match("^virtio[0-9]+$", key):
84 |             disk = True
85 |         elif re.match("^ide[0-9]+$", key):
86 |             disk = True
87 |         elif re.match("^scsi[0-9]+$", key):
88 |             disk = True
89 |         elif re.match("^sata[0-9]+$", key):
90 |             disk = True
91 |
92 |         # Exclude cdrom
93 |         if re.match(".*media=.*", str(value)):
94 |             disk = False
95 |
96 |         # "No backup" is set
97 |         if re.match(".*backup=0.*", str(value)):
98 |             return None, None, None
99 |
100 |         if not disk:
101 |             return None, None, None
102 |
103 |         storage, volume = value.split(":")
104 |         if storage not in self.ceph_storage:
105 |             return None, None, None
106 |
107 |         volume = volume.split(",")[0]
108 |
109 |         match = re.match("vm-([0-9]+)-disk-[0-9]+", volume)
110 |         if match is None:
111 |             match = re.match("base-([0-9]+)-disk-[0-9]+", volume)
112 |         if match is None or match.group(1) != str(self.vmid):
113 |             return None, None, None
114 |
115 |         return storage, volume, key
116 |
117 |     def get_disks(self, conf):
118 |         result = []
119 |         for key, value in conf.items():
120 |             storage, volume, adapter = self.__extract_disk(key, value)
121 |             if storage is None:
122 |                 continue
123 |             result.append({"ceph": storage, "rbd": volume, "adapter": adapter})
124 |         return result
125 |
126 |     def is_running(self, qemu):
127 |         status = qemu.status.get("current")["status"]
128 |         return status == "running"
129 |
130 |     def freeze(self, node, vm):
131 |         if not config["fsfreeze"] or "qemu_agent" not in vm:
132 |             return
133 |         if vm["qemu_agent"] != 1:
134 |             return
135 |         qemu = self.px.nodes(node).qemu(vm["vmid"])
136 |         if not self.is_running(qemu):
137 |             return
138 |
139 |         try:
140 |             Log.debug(f"Freezing {vm['vmid']}")
141 |             qemu.agent.post("fsfreeze-freeze")
142 |         except Exception as e:
143 |             Log.warning(f"{e} thrown while freezing {vm['vmid']}")
144 |
145 |     def thaw(self, node, vm):
146 |         if not config["fsfreeze"] or "qemu_agent" not in vm:
147 |             return
148 |         if vm["qemu_agent"] != 1:
149 |             return
150 |
151 |         qemu = self.px.nodes(node).qemu(vm["vmid"])
152 |         if not self.is_running(qemu):
153 |             return
154 |
155 |         try:
156 |             Log.debug(f"Thawing {vm['vmid']}")
157 |             qemu.agent.post("fsfreeze-thaw")
158 |         except Exception as e:
159 |             Log.warning(f"{e} thrown while thawing {vm['vmid']}")
--------------------------------------------------------------------------------
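`__extract_disk()` keeps only virtio/ide/scsi/sata entries that live on a known RBD storage, skips cdroms and disks flagged `backup=0`, and checks that the volume belongs to the VM being processed. A standalone illustration of those rules, on hypothetical config values for vmid 115:

```python
# Standalone re-illustration of the filtering rules in __extract_disk(),
# run on hypothetical Proxmox config values for vmid 115.
import re

config = {
    "scsi0": "ceph-rbd:vm-115-disk-1,size=32G",   # kept
    "scsi1": "ceph-rbd:vm-115-disk-2,backup=0",   # skipped: backup=0
    "ide2": "local:iso/debian.iso,media=cdrom",   # skipped: cdrom
    "virtio0": "local-lvm:vm-115-disk-0",         # skipped: not an RBD storage
}
rbd_storages = {"ceph-rbd"}
vmid = 115

for key, value in config.items():
    if not re.match("^(virtio|ide|scsi|sata)[0-9]+$", key):
        continue
    if re.match(".*media=.*", value) or re.match(".*backup=0.*", value):
        continue
    storage, volume = value.split(":")
    if storage not in rbd_storages:
        continue
    volume = volume.split(",")[0]
    match = re.match("(?:vm|base)-([0-9]+)-disk-[0-9]+", volume)
    if match is None or match.group(1) != str(vmid):
        continue
    print(key, "->", storage, volume)  # scsi0 -> ceph-rbd vm-115-disk-1
```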
/cli.md:
--------------------------------------------------------------------------------
1 | # Command line interface
2 |
3 | First of all, we should create some backups. Here, we have two backup policies: a daily for 30 days, and an hourly for 48 hours; this is the default:
4 | ```
5 | 35% [jack:~/backurne]./backurne backup
6 | INFO: Processing proxmox: infrakvm1
7 | INFO: Processing infraceph1:vm-136-disk-1 (daily;30)
8 | DEBUG: No snaps found on infraceph1:vm-136-disk-1
9 | INFO: infraceph1:vm-136-disk-1: doing full backup
10 | INFO: Processing infraceph1:vm-136-disk-1 (hourly;48)
11 | INFO: infraceph1:vm-136-disk-1: doing incremental backup based on backup;daily;30;2018-06-01T15:44:26.072348
12 | INFO: I will now download 2 snaps from px infrakvm1
13 | INFO: Exporting infraceph1:vm-136-disk-1
14 | Exporting image: 100% complete...done.
15 | Importing image diff: 100% complete...done.
16 | INFO: Export infraceph1:vm-136-disk-1 complete
17 | INFO: Exporting infraceph1:vm-136-disk-1
18 | Exporting image: 100% complete...done.
19 | Importing image diff: 100% complete...done.
20 | INFO: Export infraceph1:vm-136-disk-1 complete
21 | INFO: Deleting vm-136-disk-1@backup;daily;30;2018-06-01T15:44:26.072348 ..
22 | INFO: Expiring our snapshots
23 | ```
24 | As you can see, the first backup is "full", the other is incremental (based on the full made seconds ago, thus very efficient).\
25 | This is why using multiple policies does not cost much.
26 |
27 |
28 | Let's run the command again:
29 | ```
30 | 16% [jack:~/backurne]./backurne backup
31 | INFO: Processing proxmox: infrakvm1
32 | INFO: Our last backup is still young, nothing to do
33 | INFO: Our last backup is still young, nothing to do
34 | INFO: I will now download 0 snaps from px infrakvm1
35 | INFO: Expiring our snapshots
36 | ```
37 | Nothing to do!\
38 | You can run this command many times, as it will avoid doing backups if the previous one is not old enough.
39 |
40 | By default, everything is processed. You can filter things using the following backup options:
41 | - `--cluster`
42 | - `--vmid`
43 | - `--profile`
44 |
45 | Also, you can force the processing of a backup, even if the previous one is not old enough, using the `--force` option.
46 |
47 |
48 | Now, we should list our backed-up disks:
49 | ```
50 | 17% [jack:~/backurne]./backurne ls
51 | +-----------------+---------+--------------------------------------------------------------------+
52 | |  Ident          |  Disk   |  UUID                                                              |
53 | +-----------------+---------+--------------------------------------------------------------------+
54 | |  test-backurne  |  scsi0  |  8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne  |
55 | +-----------------+---------+--------------------------------------------------------------------+
56 | ```
57 | - `Ident` is used as an identifier for humans: for Proxmox's backups, this is the VM's name from the last run.
58 | - `Disk` is the disk adapter for proxmox, or the rbd image name for plain.
59 | - Finally, `UUID` is the real RBD image, as defined on Ceph, and is used as a primary key.
60 |
61 |
62 | We can list the backups for this disk:
63 | ```
64 | 32% [jack:~/backurne]./backurne ls '8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne'
65 | +------------------------------+-----------------------------------------------+
66 | |  Creation date               |  UUID                                         |
67 | +------------------------------+-----------------------------------------------+
68 | |  2018-06-01 15:44:26.072348  |  backup;daily;30;2018-06-01T15:44:26.072348   |
69 | |  2018-06-01 15:44:26.499066  |  backup;hourly;48;2018-06-01T15:44:26.499066  |
70 | +------------------------------+-----------------------------------------------+
71 | ```
72 | We see that both snapshots were created almost at the same time.
73 |
74 |
75 | Now, we would like to inspect a snapshot's content.
76 | ```
77 | 32% [jack:~/backurne]sudo ./
78 | backurne map 28b868e3-c145-4ea7-8dff-e5ae3b8093af\;scsi0\;nsint5 backup\;daily\;30\;2019-12-30T06\:00\:04.802699
79 | INFO: Mapping 28b868e3-c145-4ea7-8dff-e5ae3b8093af;scsi0;nsint5@backup;daily;30;2019-12-30T06:00:04.802699 ..
80 | INFO: rbd 28b868e3-c145-4ea7-8dff-e5ae3b8093af;scsi0;nsint5 / snap backup;daily;30;2019-12-30T06:00:04.802699
81 | INFO: └── /dev/nbd0 (fstype None, size 20G)
82 | INFO:     └── /dev/nbd0p1 on /tmp/tmp09nri0sh (fstype xfs, size 20G)
83 | 32% [jack:~/backurne]ls /tmp/tmp09nri0sh
84 | bin  boot  dev  dlm  etc  home  initrd.img  initrd.img.old  lib  lib32  lib64  media  mnt  opt  proc  root  run  sbin  shared  srv  sys  tmp  usr  var  vmlinuz  vmlinuz.old
85 | ```
86 |
87 | The `map` subcommand clones a specific snapshot, maps it, maps the partitions (if any) and tries to mount the filesystems.
88 | Some things to consider:
89 | - the subcommand must be run with CAP_SYS_ADMIN, as it will handle block devices and mount filesystems.
90 | - the mounted filesystem (or mapped block devices) is a clone of the snapshot, not the snapshot itself. It is thus writable, and will be deleted later: you can remove files or do whatever you want here without impacting the backup.
91 |
92 | Wait, what is mounted here already?
93 | ``` 94 | 32% [jack:~/backurne]sudo ./backurne list-mapped 95 | INFO: rbd 28b868e3-c145-4ea7-8dff-e5ae3b8093af;scsi0;nsint5 / snap backup;daily;30;2019-12-30T06:00:04.802699 96 | INFO: └── /dev/nbd0 (fstype None, size 20G) 97 | INFO: └── /dev/nbd0p1 on /tmp/tmp09nri0sh (fstype xfs, size 20G) 98 | ``` 99 | 100 | Once you have recovered your files, you should do some cleanups: 101 | ``` 102 | 32% [jack:~/backurne]./backurne unmap '8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne' 'backup;hourly;48;2018-06-01T15:44:26.499066' 103 | INFO: Unmapping 8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne@backup;hourly;48;2018-06-01T15:44:26.499066 .. 104 | INFO: 8eb4f698-afdc-45bb-9f6c-1833c42ae368;vm-136-disk-1;test-backurne@backup;hourly;48;2018-06-01T15:44:26.499066 currently mapped on /dev/nbd0 105 | INFO: Deleting restore-1 .. 106 | ``` 107 | 108 | Finally, there are three subcommands for checks: 109 | - `precheck` crawls images and computes the actual arrors, if there is images on the live cluster without the daily snapshot. This subcommand may be slow, depending on the dataset. 110 | - `check` shows errors from `precheck`. 111 | - `check-snap` hashes images to check if the data on the backup cluster is the same as on the live cluster (but it is slow ..) 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # backurne 2 | 3 | `backurne` is a handy tool for backuping RBD's image on RBD.\ 4 | Yep ! What is better, for backuping a Ceph cluster, than another Ceph cluster ? 5 | 6 | 7 | It does not do much by itself, though, but orchestrate and relies heavily on other tools.\ 8 | It has a fine integration with Proxmox, but is able to backup "plain" (or "raw RBD") cluster as well. 9 | 10 | Supported features 11 | --- 12 | - **Snapshot-based backup**, with no agent strictly required on the VM. 13 | - Backup inspection and restoration via **command line interface** as well as via **REST API**. 14 | - **Support multiple retention policy** efficiently (both in term of storage and network bandwidth), dynamically configurable per host (proxmox-only) via REST API. 15 | - Auto cleanup : deletion is never generated by a human, thus **no human mistakes**. 16 | - **Compression** and **encryption** "on the wire" for enhanced efficiency and security. 17 | - Peaceful integration with other snapshots (via Proxmox web interface or whatever). 18 | - Multiple cluster support, with mixed type ("proxmox" and "plain"). 19 | - A couple of backups can be stored on the live clusters, for faster recovery. 20 | - Optional **fsfreeze** support (proxmox-only) via Qemu-quest-agent. 21 | - Backup deactivation via Proxmox's web interface. 22 | - External custom processing via hooks. 23 | - LVM support: backup's lvs are detected and mapped (if possible) for further exploration. See below. 24 | - vmware support: vmfs are detected and supported. Each vmdk are also mapped and mounted. See below. 25 | - Microsoft dynamic disks support: each logical disk will be mapped and mounted. See below. 26 | - VM tracking, for those who uses a single Proxmox cluster with multiple Ceph backend. 
27 |
28 | Encryption and compression at rest are also seamlessly supported via Bluestore OSDs (see https://ceph.com/community/new-luminous-bluestore/)
29 |
30 | Required packages
31 | ---
32 |
33 | Core: python (>=3.7), python3-dateutil, python3-termcolor, python3-prettytable, python3-requests, python3-proxmoxer, python3-psutil, python3-anytree (from https://github.com/c0fec0de/anytree, .deb for buster attached for convenience), zstd for compression \
34 | For mapping (optional): kpartx, rbd-nbd (Mimic or later), lvm2, vmfs-tools, vmfs6-tools, ldmtool\
35 | For the REST API: python3-flask, python3-flask-autoindex\
36 | For bash autocompletion: jq
37 |
38 |
39 | Installation
40 | ---
41 |
42 | - Check out the **Authentication** part below.
43 | - Clone the source, edit the configuration
44 | - Set up a Ceph cluster, used to store the backups
45 | - Profit?
46 |
47 | Configuration
48 | ---
49 |
50 | See [conf/backurne.conf](conf/backurne.conf)
51 |
52 | Authentication, and where should I run what
53 | ---
54 |
55 | `backurne` interacts with the backup cluster via the `rbd` command line. It must have the required configuration at /etc/ceph/ceph.conf and the needed keyring.\
56 | It is assumed that `backurne` will be run on a Ceph node (perhaps a monitor), but this is not strictly required (those communications will not be encrypted nor compressed).
57 |
58 | `backurne` connects to proxmox clusters via their HTTP API. No data is exchanged via this link; it is purely used for "control" (listing VMs, listing disks, fetching information etc).
59 |
60 | `backurne` connects to every "live" Ceph cluster via SSH. For each cluster, it will connect to a single node, always the same, defined in Proxmox (and / or overwritten via the configuration).\
61 | Neither SSH authentication nor authorization is handled by `backurne` in any way.\
62 | It is up to you to configure ssh: either accept or ignore the host keys, place your public key on the required hosts etc.
63 |
64 | Command line interface
65 | ---
66 |
67 | See [cli.md](cli.md)
68 |
69 | REST API
70 | ---
71 |
72 | See [api.md](api.md)
73 |
74 | Used technology
75 | ---
76 |
77 | - `RBD` is the core technology used by `backurne`: it provides snapshot export, import, diff, mapping etc.
78 | - `ssh` is used to transfer the snapshots between the live clusters and the backup cluster. `RBD` can be manipulated over TCP/IP, but without encryption or compression, thus that solution was not kept.
79 | - `xxhash` (or another, see the configuration) is used to check the consistency between snapshots.
80 | - `rbd-nbd` is used to map a specific backup and inspect its content.
81 | - `kpartx`, `qemu-img`, `qemu-nbd`, `vmfs-tools` and `vmfs6-tools` are used for vmware exploration, `ldmtool` is used to map microsoft dynamic disks.
82 |
83 |
84 | vmware support
85 | ---
86 |
87 | The assumption is that the rbd image you back up is a single datastore. It contains multiple vmdk, each of them being a VM disk.\
88 | Datastores use a specific filesystem: VMFS. There are several versions, as of today. You will need `vmfs-tools` to mount VMFS up to version 5. For version 6 support, `vmfs6-tools` is required.\
89 | When `backurne` detects a VMFS, it will try each version until success. If no `vmfs*-tools` is available, the block device is left as is.\
90 | Once a VMFS device is mounted, each vmdk found inside will be mapped and mounted, recursively. In theory, you could have a VMFS containing a VM disk (vmdk), which is itself a datastore with more vmdk inside .. This behavior is not tested, though.\
91 |
92 |
93 | LVM support
94 | ---
95 |
96 | The same device may be seen at many layers by the device-mapper code.\
97 | To activate some LVs, especially if they live inside a vmdk (see vmware support), you will need to tell LVM to allow such behavior.\
98 | By default, LVM refuses to activate LVs that show up in multiple PVs.\
99 | To allow this, edit `/etc/lvm/lvm.conf`, and set `allow_changes_with_duplicate_pvs` to `1`.\
100 |
101 |
102 | Microsoft LDM support
103 | ---
104 |
105 | Microsoft dynamic disks are supported. You will need `ldmtool` to map those.\
106 | A single dynamic disk, as well as a dynamic disk spread across multiple block devices (inside a VMFS for instance), are supported.\
107 | However, mapping multiple unrelated dynamic disks is not supported. For instance, if you map a backup A, and an unrelated backup B,\
108 | while both of them contain dynamic disks, the behavior is undefined.\
109 |
110 |
111 | "Bare-metal" restore
112 | ---
113 |
114 | Restoring a complete image is out of `backurne`'s scope.\
115 | If you are using `proxmox`, you may first need to restore the configuration in `/etc/pve/`.\
116 | Anyway, once you know the target rbd image name, you will have to:\
117 | - find the desired backup image, using `backurne ls`
118 | - find the desired backup snapshot, using `backurne ls <image>`
119 | - export and import the image, using `rbd export <image> --snap <snap> - | ssh <host> rbd import - <image>`
120 |
121 |
122 | Graph and reporting
123 | ---
124 |
125 | ![alt text](https://github.com/JackSlateur/backurne/blob/master/graph_in_progress.png?raw=true)
126 | ![alt text](https://github.com/JackSlateur/backurne/blob/master/graph_duration.png?raw=true)
127 |
128 | An ugly grafana dashboard is provided in `graph/grafana-backurne.json`; data is stored in an influxdb database.\
129 | It provides two pieces of information:\
130 | - the number of backups currently running, using data from telegraf (both the script and the config can be found in `graph/telegraf/*`).
131 | - the duration of each backup
132 |
133 | Merge requests or ideas of improvement are most welcome here.
134 |
135 |
136 | Note
137 | ---
138 | On Proxmox, LXC is not yet supported. Only Qemu so far :/
139 |
140 | The project is developed mainly for Debian Buster and Proxmox, and is used here on these technologies.\
141 | The "plain" feature, as well as running `backurne` on other operating systems, is less tested, and may be less bug-proof.\
142 | Bug reports, merge requests and feature requests are welcome: some things are not implemented simply because I do not need them, not because they cannot be done nor because I do not want to code them.
143 | -------------------------------------------------------------------------------- /src/backurne/disk.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | from collections import namedtuple 5 | 6 | import humanize 7 | import psutil 8 | import sh 9 | from anytree import Node 10 | from anytree import RenderTree 11 | from sh import lsblk 12 | from sh import rbd 13 | 14 | from .ceph import Ceph 15 | from .log import has_debug 16 | from .log import log as Log 17 | 18 | 19 | fields = [ 20 | "dev", 21 | "fstype", 22 | "mountpoint", 23 | "vmfs_fuse", 24 | "image", 25 | "parent_image", 26 | "parent_snap", 27 | "mapped", 28 | "qemu_nbd", 29 | "size", 30 | "ldm", 31 | ] 32 | Part = namedtuple("FS", fields, defaults=(None,) * len(fields)) 33 | 34 | 35 | def get_fs_info(dev): 36 | if dev is None: 37 | return [] 38 | info = lsblk( 39 | "-I", 8, "-p", "-o", "+NAME,FSTYPE,SIZE,MOUNTPOINT,PARTTYPE", "-J", dev 40 | ) 41 | info = json.loads(info.stdout) 42 | return info["blockdevices"] 43 | 44 | 45 | # vmfs-fuse shows itself as /dev/fuse, in /proc/mounts 46 | # Thus, lsblk cannot resolve the device 47 | # However, the cmdline is straightforward: vmfs-fuse 48 | # We will try to list all running processes, and catch the fuse daemon 49 | def resolv_vmfs(dev): 50 | for i in psutil.process_iter( 51 | attrs=[ 52 | "cmdline", 53 | ] 54 | ): 55 | i = i.info["cmdline"] 56 | if len(i) == 0: 57 | continue 58 | if "vmfs-fuse" not in i[0] and "vmfs6-fuse" not in i[0]: 59 | continue 60 | if i[1] != dev: 61 | continue 62 | return i[2] 63 | 64 | 65 | def resolv_qemu_nbd(dev): 66 | for i in psutil.process_iter( 67 | attrs=[ 68 | "cmdline", 69 | ] 70 | ): 71 | i = i.info["cmdline"] 72 | if len(i) == 0: 73 | continue 74 | if "qemu-nbd" not in i[0]: 75 | continue 76 | if i[3] != dev: 77 | continue 78 | return i[2] 79 | 80 | 81 | def get_next_nbd(): 82 | path = "/sys/class/block/" 83 | for i in glob.glob(f"{path}/nbd*"): 84 | dev = i.split("/")[-1] 85 | if "p" in dev: 86 | continue 87 | if os.path.exists(f"{path}/{dev}/pid"): 88 | continue 89 | return f"/dev/{dev}" 90 | 91 | 92 | def get_file_size(path): 93 | size = os.stat(path).st_size 94 | return humanize.naturalsize(size, binary=True) 95 | 96 | 97 | def get_vg_uuid(path): 98 | config = f'devices{{filter = [ "a|{path}|", "r|.*|" ]}}' 99 | config = f"--config={config}" 100 | raw = sh.Command("vgs")("--noheadings", "-o", "uuid,pv_name,vg_name", config) 101 | raw = raw.stdout.decode("utf-8") 102 | for data in raw.split("\n"): 103 | data = data.lstrip().rstrip() 104 | if data == "": 105 | continue 106 | data = data.split(" ") 107 | data = [i for i in data if i != ""] 108 | uuid, pv_name, vg_name = data 109 | if pv_name != path: 110 | continue 111 | return uuid, vg_name 112 | return None, None 113 | 114 | 115 | def deactivate_vg(path): 116 | uuid, _ = get_vg_uuid(path) 117 | if uuid is None: 118 | return 119 | 120 | Log.debug(f"vgchange -an --select vg_uuid={uuid}") 121 | try: 122 | sh.Command("vgchange")("-an", "--select", f"vg_uuid={uuid}") 123 | except Exception: 124 | pass 125 | 126 | 127 | def add_part(part, parent, extended, qemu_nbd=None): 128 | if part["fstype"] == "LVM2_member": 129 | node = Node( 130 | Part( 131 | dev=part["name"], 132 | mountpoint=part["mountpoint"], 133 | fstype=part["fstype"], 134 | size=part["size"], 135 | qemu_nbd=qemu_nbd, 136 | ), 137 | parent=parent, 138 | qemu_nbd=qemu_nbd, 139 | ) 140 | if "children" not in part: 141 | return 142 | for child in 
part["children"]: 143 | add_part(child, node, extended) 144 | elif part["fstype"] != "VMFS_volume_member": 145 | node = Node( 146 | Part( 147 | dev=part["name"], 148 | mountpoint=part["mountpoint"], 149 | fstype=part["fstype"], 150 | size=part["size"], 151 | qemu_nbd=qemu_nbd, 152 | ), 153 | parent=parent, 154 | qemu_nbd=qemu_nbd, 155 | ) 156 | else: 157 | part["mountpoint"] = resolv_vmfs(part["name"]) 158 | node = Node( 159 | Part( 160 | dev=part["name"], 161 | mountpoint=part["mountpoint"], 162 | fstype=part["fstype"], 163 | size=part["size"], 164 | vmfs_fuse=True, 165 | qemu_nbd=qemu_nbd, 166 | ), 167 | parent=parent, 168 | qemu_nbd=qemu_nbd, 169 | ) 170 | vmdks = "{}/*/*-flat.vmdk".format(part["mountpoint"]) 171 | for vmdk in glob.glob(vmdks): 172 | vmdk_size = get_file_size(vmdk) 173 | vmdk_short = vmdk.split("/")[-1] 174 | qcow2 = glob.glob(f"/tmp/*{vmdk_short}.qcow2") 175 | if len(qcow2) == 0: 176 | sub = Node( 177 | Part(dev=vmdk, size=vmdk_size), parent=node, qemu_nbd=qemu_nbd 178 | ) 179 | continue 180 | qcow2 = qcow2[0] 181 | nbd = resolv_qemu_nbd(qcow2) 182 | sub = Node( 183 | Part(dev=qcow2, size=vmdk_size, qemu_nbd=nbd), 184 | parent=node, 185 | qemu_nbd=qemu_nbd, 186 | ) 187 | get_partitions(nbd, sub, extended=extended, mapped=True) 188 | 189 | 190 | def filter_children(children, mapped): 191 | if mapped is False or mapped is None: 192 | return children 193 | 194 | # We know that the device is mapped 195 | # We will ignore non-mapped devices, to avoid duplicates 196 | result = [] 197 | for child in children: 198 | maj = child["maj:min"].split(":")[0] 199 | if maj.startswith("25") or child["fstype"] == "VMFS_volume_member": 200 | result.append(child) 201 | return result 202 | 203 | 204 | def get_partitions(dev, node, extended=True, mapped=None, qemu_nbd=None): 205 | for part in get_fs_info(dev): 206 | if part["fstype"] is not None: 207 | add_part(part, node, extended, qemu_nbd) 208 | continue 209 | if "children" not in part: 210 | continue 211 | 212 | # A microsoft dynamic disk has a single partition with type 0x42 213 | if part["children"][0]["parttype"] == "0x42": 214 | ldm = True 215 | else: 216 | ldm = None 217 | 218 | if extended is False: 219 | sub_node = node 220 | else: 221 | sub_node = Node( 222 | Part( 223 | dev=dev, 224 | mapped=mapped, 225 | qemu_nbd=qemu_nbd, 226 | size=part["size"], 227 | ldm=ldm, 228 | ), 229 | parent=node, 230 | ) 231 | part["children"] = filter_children(part["children"], mapped) 232 | if not ldm or True: 233 | for part in part["children"]: 234 | get_partitions(part["name"], sub_node, extended, mapped) 235 | 236 | 237 | def wait_dev(dev): 238 | Log.debug(f"udevadm trigger {dev} -w") 239 | sh.Command("udevadm")("trigger", dev, "-w") 240 | sh.Command("udevadm")("settle") 241 | 242 | 243 | def print_node(pre, _node): 244 | node = _node.name 245 | if node.parent_image is not None: 246 | Log.info("{}rbd {} / snap {}".format(pre, node.parent_image, node.parent_snap)) 247 | return 248 | 249 | if node.mountpoint is not None: 250 | msg = "on {} ".format(node.mountpoint) 251 | else: 252 | msg = "" 253 | 254 | if node.dev.endswith(".vmdk"): 255 | dev = node.dev.split("/")[-2] 256 | dev = "vmdk {}".format(dev) 257 | fstype = "vmfs file" 258 | else: 259 | dev = node.dev 260 | fstype = "fstype {}".format(node.fstype) 261 | if has_debug(Log): 262 | Log.info( 263 | "%s%s %s(%s, size %s, nbd %s, vmfs %s, mnt %s, real dev %s, ldm %s)" 264 | % ( 265 | pre, 266 | dev, 267 | msg, 268 | fstype, 269 | node.size, 270 | node.qemu_nbd, 271 | node.vmfs_fuse, 272 | 
node.mountpoint,
273 |                 node.dev,
274 |                 node.ldm,
275 |             )
276 |         )
277 |     else:
278 |         Log.info("{}{} {}({}, size {})".format(pre, dev, msg, fstype, node.size))
279 | 
280 | 
281 | def print_mapped(mapped):
282 |     for tree in mapped:
283 |         for pre, fill, node in RenderTree(tree):
284 |             print_node(pre, node)
285 | 
286 | 
287 | def prepare_tree_to_json(mapped):
288 |     result = mapped.name._asdict()
289 |     result["children"] = []
290 |     for child in mapped.children:
291 |         result["children"].append(prepare_tree_to_json(child))
292 |     return result
293 | 
294 | 
295 | def get_rbd_mapped():
296 |     result = []
297 |     mapped = rbd("--format", "json", "-t", "nbd", "device", "list")
298 |     for mapped in json.loads(mapped.stdout):
299 |         info = Ceph(None).info(mapped["image"])["parent"]
300 |         part = Part(
301 |             dev=mapped["device"],
302 |             image=mapped["image"],
303 |             parent_image=info["image"],
304 |             parent_snap=info["snapshot"],
305 |         )
306 |         result.append(part)
307 |     return result
308 | 
309 | 
310 | def get_mapped(extended=True):
311 |     extended = True
312 |     result = []
313 |     for i in get_rbd_mapped():
314 |         node = Node(i)
315 |         get_partitions(i.dev, node, extended=extended)
316 |         result.append(node)
317 |     return result
318 | 
--------------------------------------------------------------------------------
/conf/backurne.conf:
--------------------------------------------------------------------------------
1 | #
2 | config = {
3 | 	# All our snapshots, on live, will use this prefix.
4 | 	# Every snapshot prefixed by this prefix will be handled
5 | 	# (and possibly deleted)
6 | 	# Must not contain a single quote ( "'" ) nor a semicolon ( ";" )
7 | 	#'snap_prefix': 'backup',
8 | 
9 | 	# Define our backup policy: when should we make a snap?
10 | 	# How many backups should we store?
11 | 	# How many backups should be kept on live (for faster restore)?
12 | 	# Regardless of this setting, the last snapshot will
13 | 	# always be kept on live (for incremental purposes)
14 | 	# An optional argument called 'priority' can be set on a profile, with
15 | 	# only one meaningful value: 'high'. If set to high, those backups will
16 | 	# always be backed up first.
17 | 	# Default: empty
18 | 	#'profiles': {
19 | 	#	'daily': {
20 | 	#		'count': 30,
21 | 	#		'max_on_live': 0,
22 | 	#	},
23 | 	#	'hourly': {
24 | 	#		'count': 48,
25 | 	#		'max_on_live': 0,
26 | 	#		'priority': 'high',
27 | 	#	},
28 | 	#	'weekly': {
29 | 	#		'count': 52,
30 | 	#		'max_on_live': 0,
31 | 	#	},
32 | 	#},
33 | 
34 | 	# Fetch additional profiles, per VM
35 | 	# If set to None, the default, no fetch is made
36 | 	# Else, it must be a URL, http or https
37 | 	# Backurne POSTs a JSON document with the VM's information, and
38 | 	# expects a JSON dict which contains additional profiles,
39 | 	# or an empty dict
40 | 	# See sample-api-profile.py for a simple implementation
41 | 	#'profiles_api': None,
42 | 
43 | 	# Where should we store the backups?
44 | 	# The pool is dedicated
45 | 	#'backup_cluster': {
46 | 	#	'pool': 'rbd',
47 | 	#},
48 | 
49 | 	# List of live clusters to back up
50 | 	#'live_clusters': [
51 | 	#	# A sample Proxmox cluster
52 | 	#	# We will connect to it using http(s)
53 | 	#	{
54 | 	#		# A handy name, which MUST be unique
55 | 	#		'name': 'clusterpx',
56 | 	#		'fqdn': 'clusterpx.fqdn.org',
57 | 	#		'user': 'root@pam',
58 | 	#		'passwd': 'awesome-passwd',
59 | 	#		'tls': True,
60 | 	#		'use_smbios': True,
61 | 	#		'type': 'proxmox',
62 | 	#	},
63 | 	#
64 | 	#	# A sample plain cluster
65 | 	#	# By default, we will connect to it using ssh
66 | 	#	# In that case, you have to ensure backurne
67 | 	#	# can connect to it using ssh keys
68 | 	#	{
69 | 	#		# A handy name, which MUST be unique
70 | 	#		'name': 'cute-cluster',
71 | 	#		'fqdn': 'ceph.fqdn.org',
72 | 	#		'type': 'plain',
73 | 	#		'pool': 'vms'
74 | 	#	},
75 | 	#
76 | 	#	# Alternatively, you may set 'fqdn' to None, and
77 | 	#	# define a helper to tell backurne how to reach the cluster
78 | 	#	# See the example below, which shows how to reach
79 | 	#	# a rook cluster inside kubernetes
80 | 	#	{
81 | 	#		# A handy name, which MUST be unique
82 | 	#		'name': 'cute-cluster-with-helper',
83 | 	#		'fqdn': None,
84 | 	#		'type': 'plain',
85 | 	#		'pool': 'vms',
86 | 	#		# The get_helper is optional
87 | 	#		'get_helper': {
88 | 	#			'cmd': 'kubectl',
89 | 	#			'args': [
90 | 	#				'--kubeconfig',
91 | 	#				'/path/to/kubeconfig',
92 | 	#				'-n',
93 | 	#				'rook-ceph',
94 | 	#				'get',
95 | 	#				'pod',
96 | 	#				'-l',
97 | 	#				'app=rook-ceph-tools',
98 | 	#				'-o',
99 | 	#				'jsonpath={.items[0].metadata.name}',
100 | 	#			],
101 | 	#		},
102 | 	#		'use_helper': {
103 | 	#			'cmd': 'kubectl',
104 | 	#			'args': [
105 | 	#				'--kubeconfig',
106 | 	#				'/path/to/kubeconfig',
107 | 	#				'-n',
108 | 	#				'rook-ceph',
109 | 	#				'exec',
110 | 	#				'-i',
111 | 	#				# %HELPERNAME% will be replaced by
112 | 	#				# the output of 'get_helper', if any
113 | 	#				'%HELPERNAME%',
114 | 	#				'--',
115 | 	#			],
116 | 	#		}
117 | 	#	},
118 | 	#],
119 | 
120 | 	# Extra retention time for the last remaining backup, in days.
121 | 	# When an image is deleted from the live cluster,
122 | 	# its backup image will slowly fade away with time
123 | 	# (each hour / day, a backup will be deleted)
124 | 	# Thus, with a 30 daily profile, the last backup will
125 | 	# be deleted 30 days after the deletion on live
126 | 	# This setting increases the retention time, only for that
127 | 	# last backup.
128 | 	# If set to 30 and with a 30 daily profile, data will
129 | 	# be kept around for 60 days.
130 | 	#'extra_retention_time': 0,
131 | 
132 | 	# Override ceph's endpoint
133 | 	# We need to connect to the live Ceph cluster
134 | 	# Identification we have: proxmox's name, and Ceph's name (from proxmox)
135 | 	# That name does not necessarily map to a domain name, thus some mapping
136 | 	# may be required (editing /etc/hosts may work too)
137 | 	# Plus, you can have the same name on multiple proxmox clusters, but
138 | 	# pointing to different Ceph clusters
139 | 	# This entry is a dict of dicts:
140 | 	# - the first level is the proxmox's name, or 'default' as a catch-all
141 | 	# - the second level is the Ceph's name inside that proxmox cluster
142 | 	# ceph_endpoint[proxmox][ceph] has the precedence
143 | 	# Then ceph_endpoint['default'][ceph]
144 | 	# If nothing matches, the Ceph's name is used as-is, and must
145 | 	# resolve
146 | 	# Default: empty
147 | 	#'ceph_endpoint': {
148 | 	#	'default': {
149 | 	#		'ceph1': 'cluster1.fqdn.org',
150 | 	#	},
151 | 	#	'proxmox32-lab': {
152 | 	#		'pool-ssd': 'cephlab.fqdn.org',
153 | 	#		'pool-hdd': 'cephlab.fqdn.org',
154 | 	#	}
155 | 	#},
156 | 
157 | 	# If set to True, snapshots are compressed during transfer
158 | 	# Useful if you have a low bandwidth
159 | 	#'download_compression': False,
160 | 
161 | 	# Should we freeze the VM before snapshotting?
162 | 	# This requires qemu-guest-agent
163 | 	# Beware, a current bug lives in proxmox: if qemu-guest-agent
164 | 	# is enabled on the VM, but the daemon inside that VM is dead,
165 | 	# then the proxmox API will be stuck in an endless loop for
166 | 	# ~1H
167 | 	#'fsfreeze': False,
168 | 
169 | 	# If we set use_smbios to True, but encounter a VM
170 | 	# without smbios, what should we do?
171 | 	# If True, we fall back, as if use_smbios were False, for this VM
172 | 	# If False, we drop an error and skip the VM
173 | 	# If the fallback is used, and an uuid is defined afterward,
174 | 	# you will lose this VM's backup history
175 | 	# (as if it was newly created)
176 | 	#'uuid_fallback': True,
177 | 
178 | 	# Print pretty colors, if stdout is a tty
179 | 	#'pretty_colors': True,
180 | 
181 | 	# Log level
182 | 	# Can be any of 'debug', 'info', 'warn', 'err'
183 | 	#'log_level': 'debug',
184 | 
185 | 	# How many workers should be used when we parallelize
186 | 	# tasks on the backup cluster
187 | 	#'backup_worker': 24,
188 | 
189 | 	# How many workers should be used when we parallelize
190 | 	# tasks on the live cluster
191 | 	#'live_worker': 12,
192 | 
193 | 	# Hash binary used to compare snapshots
194 | 	# You can use any executable that meets the following requirements:
195 | 	# - it reads data from stdin
196 | 	# - it requires no argument
197 | 	# - it outputs the hash to stdout as the first word
198 | 	# The output may contain other words (a space-separated list),
199 | 	# which will be ignored
200 | 	# This can be an absolute path, yet a $PATH lookup can be used
201 | 	# Defaults to xxhsum
202 | 	# This executable must live on every Ceph cluster, as well as on the backup
203 | 	# node, because hashing is done remotely
204 | 	#'hash_binary': 'xxhsum',
205 | 
206 | 	# Sqlite3 database used to track "failed" backups
207 | 	# We have to distinguish between a failed backup and a missing backup
208 | 	# (some newly created disk not yet backed up)
209 | 	#'check_db': '/tmp/backurne.db',
210 | 
211 | 	# Backurne can run commands before and after some actions
212 | 	# Each command will get parameters as arguments: its type, the VM name
213 | 	# (for proxmox; undef otherwise) and the disk name
214 | 	# Four hooks are defined:
215 | 	# - the pre_vm hook, which will run once per VM per run, before
216 | 	#   any snapshot is made on that VM's disk, and only if some
217 | 	#   snapshot *will* be made.
218 | 	# - the pre_disk hook, which will run once per disk, before creating
219 | 	#   a snapshot
220 | 	# - the post_disk hook, run just after the snapshot creation
221 | 	# - the post_vm hook, run only once per VM per run, after all
222 | 	#   needed snapshots are created
223 | 	# pre_vm and pre_disk hooks may return a non-zero return code.
224 | 	# If pre_vm or pre_disk returns a non-zero code, further processing is
225 | 	# cancelled. In that case, please note that the associated post_vm or
226 | 	# post_disk hook will not be run. A warning shall be emitted, containing
227 | 	# information about the hook, its parameters, and its output.
228 | 	# On success, the hooks' output (both stdout and stderr) is ignored.
229 | 	# Hooks shall clean up after themselves, and shall always die in a timely fashion,
230 | 	# as a stuck hook will block Backurne (no timeout is set).
231 | 	# By default, no hooks are used. You must set each hook's path.
232 | 	#'hooks': {
233 | 	#	'pre_vm': None,
234 | 	#	'pre_disk': None,
235 | 	#	'post_disk': None,
236 | 	#	'post_vm': None,
237 | 	#},
238 | 
239 | 	# Report the time taken to process (download and apply) a backup
240 | 	# Each disk will generate a one-line record, in a human-readable format,
241 | 	# with the disk name (rbd image), the cluster from which it is imported
242 | 	# and the elapsed time (excluding queue time, if present)
243 | 	# Can be:
244 | 	# - None, to disable the feature
245 | 	# - syslog
246 | 	# - influxdb
247 | 	# - some absolute file path
248 | 	# In that last case, the file will be opened, appended to and closed for each record.
249 | 	# If set to influxdb, you will need to configure the 'influxdb' stanza below.
250 | 	#'report_time': 'syslog',
251 | 
252 | 	# Influxdb instance used by report_time.
253 | 	# The required entries are:
254 | 	# - host: either an IP or a resolvable FQDN that points to the influxdb instance
255 | 	# - db: the database to connect to
256 | 	# Some optional entries exist:
257 | 	# - port
258 | 	# - tls: should we connect using http (the default) or https?
259 | 	# - verify_tls: if we are using https, should we verify the validity of the instance's
260 | 	#   certificate?
261 | 	# - mtls: you can enable mutual tls authentication by passing a tuple here, using a format
262 | 	#   such as ('/path/to/cert', '/path/to/private/key'). If those files are not valid, the
263 | 	#   instance may reject the connection.
264 | #'influxdb': { 265 | # 'host': None, 266 | # 'db': None, 267 | # 'port': 8086, 268 | # 'tls': False, 269 | # 'verify_tls': True, 270 | # 'mtls': None, 271 | #}, 272 | } 273 | -------------------------------------------------------------------------------- /graph/grafana-backurne.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "description": "", 5 | "label": "telegraf_hosting", 6 | "name": "DS_TELEGRAF_HOSTING", 7 | "pluginId": "influxdb", 8 | "pluginName": "InfluxDB", 9 | "type": "datasource" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "id": "grafana", 15 | "name": "Grafana", 16 | "type": "grafana", 17 | "version": "7.3.1" 18 | }, 19 | { 20 | "id": "graph", 21 | "name": "Graph", 22 | "type": "panel", 23 | "version": "" 24 | }, 25 | { 26 | "id": "influxdb", 27 | "name": "InfluxDB", 28 | "type": "datasource", 29 | "version": "1.0.0" 30 | } 31 | ], 32 | "annotations": { 33 | "list": [ 34 | { 35 | "builtIn": 1, 36 | "datasource": "-- Grafana --", 37 | "enable": true, 38 | "hide": true, 39 | "iconColor": "rgba(0, 211, 255, 1)", 40 | "name": "Annotations & Alerts", 41 | "type": "dashboard" 42 | } 43 | ] 44 | }, 45 | "editable": true, 46 | "gnetId": null, 47 | "graphTooltip": 0, 48 | "id": null, 49 | "iteration": 1619168286909, 50 | "links": [], 51 | "panels": [ 52 | { 53 | "aliasColors": {}, 54 | "bars": false, 55 | "dashLength": 10, 56 | "dashes": false, 57 | "datasource": "${DS_TELEGRAF_HOSTING}", 58 | "decimals": null, 59 | "fieldConfig": { 60 | "defaults": { 61 | "custom": {} 62 | }, 63 | "overrides": [] 64 | }, 65 | "fill": 1, 66 | "fillGradient": 0, 67 | "gridPos": { 68 | "h": 9, 69 | "w": 23, 70 | "x": 0, 71 | "y": 0 72 | }, 73 | "hiddenSeries": false, 74 | "id": 2, 75 | "legend": { 76 | "alignAsTable": true, 77 | "avg": false, 78 | "current": true, 79 | "max": true, 80 | "min": true, 81 | "rightSide": true, 82 | "show": true, 83 | "total": false, 84 | "values": true 85 | }, 86 | "lines": true, 87 | "linewidth": 1, 88 | "nullPointMode": "connected", 89 | "options": { 90 | "alertThreshold": true 91 | }, 92 | "percentage": false, 93 | "pluginVersion": "7.3.1", 94 | "pointradius": 2, 95 | "points": false, 96 | "renderer": "flot", 97 | "seriesOverrides": [], 98 | "spaceLength": 10, 99 | "stack": false, 100 | "steppedLine": false, 101 | "targets": [ 102 | { 103 | "alias": "In progress", 104 | "groupBy": [ 105 | { 106 | "params": [ 107 | "$__interval" 108 | ], 109 | "type": "time" 110 | }, 111 | { 112 | "params": [ 113 | "null" 114 | ], 115 | "type": "fill" 116 | } 117 | ], 118 | "measurement": "backurne_inprogress", 119 | "orderByTime": "ASC", 120 | "policy": "default", 121 | "refId": "A", 122 | "resultFormat": "time_series", 123 | "select": [ 124 | [ 125 | { 126 | "params": [ 127 | "value" 128 | ], 129 | "type": "field" 130 | }, 131 | { 132 | "params": [], 133 | "type": "mean" 134 | } 135 | ] 136 | ], 137 | "tags": [] 138 | } 139 | ], 140 | "thresholds": [], 141 | "timeFrom": null, 142 | "timeRegions": [], 143 | "timeShift": null, 144 | "title": "Backups in progress", 145 | "tooltip": { 146 | "shared": true, 147 | "sort": 0, 148 | "value_type": "individual" 149 | }, 150 | "type": "graph", 151 | "xaxis": { 152 | "buckets": null, 153 | "mode": "time", 154 | "name": null, 155 | "show": true, 156 | "values": [] 157 | }, 158 | "yaxes": [ 159 | { 160 | "format": "short", 161 | "label": null, 162 | "logBase": 1, 163 | "max": null, 164 | "min": null, 165 | "show": true 166 | }, 167 | { 168 | "format": "short", 169 | 
"label": null, 170 | "logBase": 1, 171 | "max": null, 172 | "min": null, 173 | "show": true 174 | } 175 | ], 176 | "yaxis": { 177 | "align": false, 178 | "alignLevel": null 179 | } 180 | }, 181 | { 182 | "aliasColors": {}, 183 | "bars": false, 184 | "dashLength": 10, 185 | "dashes": false, 186 | "datasource": "${DS_TELEGRAF_HOSTING}", 187 | "decimals": null, 188 | "fieldConfig": { 189 | "defaults": { 190 | "custom": {} 191 | }, 192 | "overrides": [] 193 | }, 194 | "fill": 1, 195 | "fillGradient": 0, 196 | "gridPos": { 197 | "h": 9, 198 | "w": 23, 199 | "x": 0, 200 | "y": 9 201 | }, 202 | "hiddenSeries": false, 203 | "id": 3, 204 | "legend": { 205 | "alignAsTable": true, 206 | "avg": false, 207 | "current": true, 208 | "max": true, 209 | "min": true, 210 | "rightSide": true, 211 | "show": true, 212 | "total": false, 213 | "values": true 214 | }, 215 | "lines": true, 216 | "linewidth": 1, 217 | "nullPointMode": "connected", 218 | "options": { 219 | "alertThreshold": true 220 | }, 221 | "percentage": false, 222 | "pluginVersion": "7.3.1", 223 | "pointradius": 2, 224 | "points": false, 225 | "renderer": "flot", 226 | "seriesOverrides": [], 227 | "spaceLength": 10, 228 | "stack": false, 229 | "steppedLine": false, 230 | "targets": [ 231 | { 232 | "alias": "Duration", 233 | "groupBy": [ 234 | { 235 | "params": [ 236 | "$__interval" 237 | ], 238 | "type": "time" 239 | }, 240 | { 241 | "params": [ 242 | "null" 243 | ], 244 | "type": "fill" 245 | } 246 | ], 247 | "measurement": "backurne", 248 | "orderByTime": "ASC", 249 | "policy": "default", 250 | "refId": "A", 251 | "resultFormat": "time_series", 252 | "select": [ 253 | [ 254 | { 255 | "params": [ 256 | "duration" 257 | ], 258 | "type": "field" 259 | }, 260 | { 261 | "params": [], 262 | "type": "mean" 263 | } 264 | ] 265 | ], 266 | "tags": [ 267 | { 268 | "key": "image", 269 | "operator": "=~", 270 | "value": "/^$image$/" 271 | }, 272 | { 273 | "condition": "AND", 274 | "key": "endpoint", 275 | "operator": "=~", 276 | "value": "/^$endpoint$/" 277 | } 278 | ] 279 | } 280 | ], 281 | "thresholds": [], 282 | "timeFrom": null, 283 | "timeRegions": [], 284 | "timeShift": null, 285 | "title": "Backup duration", 286 | "tooltip": { 287 | "shared": true, 288 | "sort": 0, 289 | "value_type": "individual" 290 | }, 291 | "type": "graph", 292 | "xaxis": { 293 | "buckets": null, 294 | "mode": "time", 295 | "name": null, 296 | "show": true, 297 | "values": [] 298 | }, 299 | "yaxes": [ 300 | { 301 | "$$hashKey": "object:592", 302 | "format": "s", 303 | "label": null, 304 | "logBase": 1, 305 | "max": null, 306 | "min": null, 307 | "show": true 308 | }, 309 | { 310 | "$$hashKey": "object:593", 311 | "format": "short", 312 | "label": null, 313 | "logBase": 1, 314 | "max": null, 315 | "min": null, 316 | "show": true 317 | } 318 | ], 319 | "yaxis": { 320 | "align": false, 321 | "alignLevel": null 322 | } 323 | } 324 | ], 325 | "refresh": "10s", 326 | "schemaVersion": 26, 327 | "style": "dark", 328 | "tags": [], 329 | "templating": { 330 | "list": [ 331 | { 332 | "allValue": null, 333 | "current": {}, 334 | "datasource": "${DS_TELEGRAF_HOSTING}", 335 | "definition": "", 336 | "error": null, 337 | "hide": 0, 338 | "includeAll": true, 339 | "label": null, 340 | "multi": false, 341 | "name": "image", 342 | "options": [], 343 | "query": "SHOW TAG VALUES FROM \"backurne_duration\" WITH KEY = \"image\"", 344 | "refresh": 1, 345 | "regex": "", 346 | "skipUrlSync": false, 347 | "sort": 0, 348 | "tagValuesQuery": "", 349 | "tags": [], 350 | "tagsQuery": "", 351 | "type": 
"query", 352 | "useTags": false 353 | }, 354 | { 355 | "allValue": null, 356 | "current": {}, 357 | "datasource": "${DS_TELEGRAF_HOSTING}", 358 | "definition": "", 359 | "error": null, 360 | "hide": 0, 361 | "includeAll": true, 362 | "label": null, 363 | "multi": false, 364 | "name": "endpoint", 365 | "options": [], 366 | "query": "SHOW TAG VALUES FROM \"backurne\" WITH KEY = \"endpoint\"", 367 | "refresh": 1, 368 | "regex": "", 369 | "skipUrlSync": false, 370 | "sort": 0, 371 | "tagValuesQuery": "", 372 | "tags": [], 373 | "tagsQuery": "", 374 | "type": "query", 375 | "useTags": false 376 | } 377 | ] 378 | }, 379 | "time": { 380 | "from": "now-2d", 381 | "to": "now" 382 | }, 383 | "timepicker": {}, 384 | "timezone": "", 385 | "title": "Backurne", 386 | "uid": "GZwWmKuMz", 387 | "version": 13 388 | } 389 | -------------------------------------------------------------------------------- /src/backurne/ceph.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import re 4 | import threading 5 | import time 6 | from subprocess import DEVNULL 7 | from subprocess import PIPE 8 | from subprocess import Popen 9 | 10 | import dateutil.parser 11 | import setproctitle 12 | import sh 13 | 14 | from .config import config 15 | from .log import log as Log 16 | from .log import report_time 17 | 18 | 19 | class Ceph: 20 | def __init__(self, pool, namespace=None, endpoint=None, cluster_conf={}): 21 | self.endpoint = endpoint 22 | self.cluster = cluster_conf 23 | self.compress = config["download_compression"] 24 | 25 | if pool is None: 26 | pool = config["backup_cluster"]["pool"] 27 | self.pool = pool 28 | self.cmd = sh.Command("rbd").bake("-p", pool) 29 | self.esc = False 30 | else: 31 | self.backup = Ceph(None) 32 | self.pool = pool 33 | 34 | self.__get_helper__() 35 | self.cmd = self.helper.bake("rbd", "-p", pool) 36 | if namespace is not None: 37 | self.namespace = namespace 38 | self.cmd = self.cmd.bake("--namespace", namespace) 39 | 40 | self.json = self.cmd.bake("--format", "json") 41 | 42 | def __get_helper__(self): 43 | if self.endpoint is not None: 44 | self.helper = sh.Command("ssh").bake("-n", self.endpoint) 45 | self.esc = True 46 | return 47 | 48 | if self.cluster.get("get_helper") is not None: 49 | get_helper_cmd = self.cluster["get_helper"]["cmd"] 50 | get_helper_args = self.cluster["get_helper"]["args"] 51 | helper_name = sh.Command(get_helper_cmd)(*get_helper_args) 52 | helper_name = helper_name.stdout.decode("utf-8") 53 | 54 | if self.cluster.get("use_helper") is None: 55 | Log.error(f"One of fqdn or use_helper must be defined ({self.cluster}") 56 | exit(1) 57 | 58 | use_helper_cmd = self.cluster["use_helper"]["cmd"] 59 | use_helper_args = self.cluster["use_helper"]["args"] 60 | use_helper_args = [ 61 | i if i != "%HELPERNAME%" else helper_name for i in use_helper_args 62 | ] 63 | self.helper = sh.Command(use_helper_cmd).bake(*use_helper_args) 64 | self.esc = False 65 | self.compress = False 66 | 67 | def __str__(self): 68 | result = f"pool {self.pool} using config {self.cluster}" 69 | return result 70 | 71 | def __call__(self, *args): 72 | return self.cmd(args) 73 | 74 | def __fetch(self, *args): 75 | result = self.json(args) 76 | result = json.loads(result.stdout.decode("utf-8")) 77 | return result 78 | 79 | def __esc(self, snap): 80 | if self.esc is True: 81 | return f"'{snap}'" 82 | else: 83 | return snap 84 | 85 | def info(self, image): 86 | return self.__fetch("info", image) 87 | 88 | def ls(self): 89 | return 
self.__fetch("ls") 90 | 91 | def du(self, image): 92 | return self.__fetch("du", image) 93 | 94 | def snap(self, image): 95 | snap = self.__fetch("snap", "ls", image) 96 | snap = [i["name"] for i in snap] 97 | snap = [i for i in snap if i.startswith(config["snap_prefix"])] 98 | return snap 99 | 100 | def protect(self, extsnap): 101 | info = self.info(extsnap) 102 | if info["protected"] == "true": 103 | return 104 | self("snap", "protect", extsnap) 105 | 106 | def unprotect(self, extsnap): 107 | info = self.info(extsnap) 108 | if info["protected"] == "false": 109 | return 110 | self("snap", "unprotect", extsnap) 111 | 112 | def clone(self, extsnap): 113 | for i in range(1, 100): 114 | clone = f"restore-{i}" 115 | if not self.exists(clone): 116 | break 117 | self("clone", extsnap, f"{self.pool}/{clone}") 118 | return clone 119 | 120 | def map(self, image): 121 | # lazy import to avoid circular imports 122 | from .disk import get_rbd_mapped 123 | 124 | if self.esc is True: 125 | Log.error("BUG: cannot map via ssh") 126 | exit(1) 127 | 128 | cmd = ["device", "-t", "nbd", "map", image] 129 | cmd = str(self.cmd).split(" ") + cmd 130 | 131 | Popen(cmd, stdout=DEVNULL, stderr=DEVNULL) 132 | 133 | # Should be enough .. right ? 134 | time.sleep(1) 135 | for mapped in get_rbd_mapped(): 136 | if mapped.image == image: 137 | return mapped.dev 138 | 139 | def unmap(self, dev): 140 | if self.esc is True: 141 | Log.error("BUG: cannot unmap via ssh") 142 | exit(1) 143 | 144 | self.cmd("device", "-t", "nbd", "unmap", dev) 145 | 146 | # Wait a bit to make sure the dev is effectively gone 147 | time.sleep(1) 148 | 149 | def rm(self, image): 150 | Log.debug(f"Deleting image {image} ..") 151 | try: 152 | self("rm", image) 153 | except sh.ErrorReturnCode: 154 | Log.debug(f"{image} cannot be removed, maybe someone mapped it") 155 | 156 | def rm_snap(self, image, snap): 157 | Log.debug(f"Deleting snapshot {image}@{snap} .. ") 158 | snap = self.__esc(snap) 159 | try: 160 | self("snap", "rm", "--snap", snap, image) 161 | except sh.ErrorReturnCode: 162 | Log.debug(f"Cannot rm {image}@{snap}, may be held by something") 163 | 164 | def mk_snap(self, image, snap, vm=None): 165 | snap = self.__esc(snap) 166 | 167 | Log.debug(f"Creating snapshot {image}@{snap} .. 
") 168 | 169 | if vm is None: 170 | self("snap", "create", "--snap", snap, image) 171 | return 172 | 173 | self("snap", "create", "--snap", snap, image) 174 | 175 | def exists(self, image): 176 | try: 177 | self.cmd("info", image) 178 | return True 179 | except sh.ErrorReturnCode: 180 | return False 181 | 182 | def enqueue_output(self, out): 183 | original = setproctitle.getproctitle() 184 | 185 | regexp = re.compile(r"\w* \w*: (.*)%") 186 | line = "" 187 | for char in iter(lambda: out.read(1), b""): 188 | char = char.decode("utf-8") 189 | if ord(char) != 13: 190 | line += char 191 | continue 192 | 193 | if line == "": 194 | continue 195 | 196 | progress = regexp.match(line) 197 | if progress is not None: 198 | progress = f"{progress.group(1)}% complete" 199 | else: 200 | progress = line 201 | setproctitle.setproctitle(f"{original} ({progress})") 202 | line = "" 203 | out.close() 204 | 205 | def do_backup(self, image, snap, dest, last_snap=None): 206 | # On this function, we burden ourselves with Popen 207 | # I have not figured out how do fast data transfer 208 | # between processes with python3-sh 209 | snap = self.__esc(snap) 210 | export = ["export-diff", image, "--snap", snap] 211 | export = str(self.cmd).split(" ") + export 212 | if last_snap is None: 213 | export += [ 214 | "-", 215 | ] 216 | else: 217 | last_snap = self.__esc(last_snap) 218 | export += ["--from-snap", last_snap, "-"] 219 | 220 | if self.compress is True: 221 | export += ["|", "zstd"] 222 | imp = f'zstdcat | {self.backup.cmd} import-diff --no-progress - "{dest}"' 223 | else: 224 | imp = f'{self.backup.cmd} import-diff --no-progress - "{dest}"' 225 | 226 | start = datetime.datetime.now() 227 | 228 | p1 = Popen(export, stdout=PIPE, stderr=PIPE, bufsize=0) 229 | 230 | p2 = Popen(imp, stdin=p1.stdout, shell=True) 231 | t = threading.Thread(target=self.enqueue_output, args=(p1.stderr,)) 232 | t.start() 233 | 234 | p1.stdout.close() 235 | p2.communicate() 236 | t.join() 237 | end = datetime.datetime.now() 238 | report_time(image, self.endpoint, end - start) 239 | 240 | def get_last_snap(self, snaps): 241 | last_date = datetime.datetime.fromtimestamp(0) 242 | last = None 243 | for snap in snaps: 244 | split = snap.split(";") 245 | date = dateutil.parser.parse(split[3]) 246 | if date > last_date: 247 | last_date = date 248 | last = snap 249 | return last 250 | 251 | def get_last_shared_snap(self, image, dest): 252 | live_snaps = self.snap(image) 253 | backup_snaps = self.backup.snap(dest) 254 | 255 | inter = list(set(live_snaps).intersection(backup_snaps)) 256 | return self.get_last_snap(inter) 257 | 258 | def update_desc(self, source, dest): 259 | split = dest.split(";") 260 | found = False 261 | for i in self.ls(): 262 | snap = i.split(";") 263 | if snap[0] != split[0] or snap[1] != split[1]: 264 | continue 265 | 266 | if snap[2] == split[2]: 267 | # This is my image, nothing to do 268 | continue 269 | 270 | if found is True: 271 | Log.error(f"{i} matches {dest}, but we already found a match") 272 | found = True 273 | self("mv", i, dest) 274 | 275 | def checksum(self, image, snap): 276 | snap = self.__esc(snap) 277 | cmd = ["export", image, "--snap", snap, "-"] 278 | cmd = str(self.cmd).split(" ") + cmd 279 | 280 | if self.esc is True: 281 | # via ssh 282 | cmd += ["|", config["hash_binary"]] 283 | p1 = Popen(cmd, stdout=PIPE, stderr=DEVNULL) 284 | else: 285 | p2 = Popen(cmd, stdout=PIPE, stderr=DEVNULL) 286 | p1 = Popen( 287 | [ 288 | config["hash_binary"], 289 | ], 290 | stdin=p2.stdout, 291 | stdout=PIPE, 292 | 
stderr=DEVNULL, 293 | ) 294 | out = p1.communicate()[0] 295 | out = out.decode("utf-8").split(" ")[0] 296 | return out 297 | -------------------------------------------------------------------------------- /src/backurne/restore.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path 3 | import tempfile 4 | 5 | import dateutil.parser 6 | import sh 7 | 8 | from .ceph import Ceph 9 | from .disk import deactivate_vg 10 | from .disk import filter_children 11 | from .disk import get_fs_info 12 | from .disk import get_mapped 13 | from .disk import get_next_nbd 14 | from .disk import resolv_qemu_nbd 15 | from .disk import resolv_vmfs 16 | from .disk import wait_dev 17 | from .log import log as Log 18 | 19 | 20 | class Restore: 21 | def __init__(self, rbd=None, snap=None, vmdk=None): 22 | self.ceph = Ceph(None) 23 | self.dev = None 24 | 25 | self.rbd = rbd 26 | self.snap = snap 27 | self.vmdk = vmdk 28 | self.extsnap = f"{self.rbd}@{self.snap}" 29 | self.umounted = [] 30 | 31 | def ls(self): 32 | result = [] 33 | if self.rbd is None: 34 | for i in self.ceph.ls(): 35 | if i.startswith("restore-"): 36 | continue 37 | split = i.split(";") 38 | if len(split) != 3: 39 | Log.warning(f"Unknown image: {i}") 40 | continue 41 | result.append( 42 | { 43 | "ident": split[2], 44 | "disk": split[1], 45 | "uuid": i, 46 | } 47 | ) 48 | else: 49 | for i in self.ceph.snap(self.rbd): 50 | split = i.split(";") 51 | creation = dateutil.parser.parse(split[3]) 52 | result.append( 53 | { 54 | "creation": creation, 55 | "uuid": i, 56 | } 57 | ) 58 | return result 59 | 60 | def get_tmpdir(self): 61 | tmp_dir = tempfile.mkdtemp() 62 | return tmp_dir 63 | 64 | def __map_vmdks(self, path): 65 | if self.vmdk is None: 66 | Log.debug("No vmdk specified, not mapping those") 67 | return 68 | 69 | for vmdk in glob.glob(f"{path}/{self.vmdk}/*-flat.vmdk"): 70 | self.__map_vmdk(vmdk) 71 | 72 | def __map_vmdk(self, vmdk): 73 | vmdk_file = vmdk.split("/")[-1] 74 | vmdk_overlay = f"/tmp/{self.clone}-{vmdk_file}.qcow2" 75 | Log.debug(f"qemu-img create {vmdk_overlay} over {vmdk}") 76 | sh.Command("qemu-img")("create", "-f", "qcow2", "-b", vmdk, vmdk_overlay) 77 | next_nbd = get_next_nbd() 78 | Log.debug(f"qemu-nbd {vmdk_overlay} as {next_nbd}") 79 | sh.Command("qemu-nbd")("--connect", next_nbd, vmdk_overlay) 80 | wait_dev(next_nbd) 81 | try: 82 | sh.Command("kpartx")("-av", next_nbd) 83 | self.mount_dev(next_nbd) 84 | except Exception: 85 | pass 86 | 87 | def __mount_vmfs(self, path, tmp_dir): 88 | for cmd in ("vmfs-fuse", "vmfs6-fuse"): 89 | try: 90 | Log.debug(f"{cmd} {path} {tmp_dir}") 91 | sh.Command(cmd)(path, tmp_dir) 92 | self.__map_vmdks(tmp_dir) 93 | return 94 | except Exception: 95 | pass 96 | 97 | def mount_dev(self, dev, ignore_mapped=False): 98 | wait_dev(dev) 99 | info = get_fs_info(dev)[0] 100 | if info["fstype"] == "VMFS_volume_member": 101 | info["mountpoint"] = resolv_vmfs(dev) 102 | 103 | if info["fstype"] == "swap": 104 | return False 105 | 106 | if info["parttype"] == "0x42": 107 | Log.debug("windows dynamic disk detected: scanning and creating devices") 108 | sh.Command("ldmtool")("scan") 109 | sh.Command("ldmtool")("create", "all") 110 | return False 111 | if ( 112 | info["fstype"] is not None 113 | and info["mountpoint"] is None 114 | and info["fstype"] != "LVM2_member" 115 | ): 116 | tmp_dir = self.get_tmpdir() 117 | if info["fstype"] == "VMFS_volume_member": 118 | self.__mount_vmfs(dev, tmp_dir) 119 | return True 120 | Log.debug(f"mounting {dev} as 
{info['fstype']} into {tmp_dir}") 121 | if info["fstype"] == "xfs": 122 | Log.debug(f"xfs_repair -L {dev}") 123 | sh.Command("xfs_repair")("-L", dev) 124 | Log.debug(f"mount {dev} {tmp_dir}") 125 | try: 126 | sh.Command("mount")(dev, tmp_dir) 127 | except Exception as e: 128 | os.rmdir(tmp_dir) 129 | if info["fstype"] == "ntfs": 130 | Log.debug(e) 131 | else: 132 | Log.warn(e) 133 | pass 134 | 135 | return True 136 | 137 | if info["fstype"] == "VMFS_volume_member": 138 | changed = False 139 | for vmdk in glob.glob(f"{info['mountpoint']}/{self.vmdk}/*-flat.vmdk"): 140 | vmdk_file = vmdk.split("/")[-1] 141 | vmdk_overlay = f"/tmp/{self.clone}-{vmdk_file}.qcow2" 142 | nbd = resolv_qemu_nbd(vmdk_overlay) 143 | if nbd is None: 144 | self.__map_vmdk(vmdk) 145 | return True 146 | wait_dev(nbd) 147 | result = self.mount_dev(nbd, ignore_mapped=True) 148 | if result is True: 149 | changed = True 150 | if changed is True: 151 | return True 152 | 153 | if "children" not in info: 154 | return False 155 | 156 | info["children"] = filter_children(info["children"], ignore_mapped) 157 | for child in info["children"]: 158 | result = self.mount_dev(child["name"]) 159 | if result is True: 160 | return True 161 | return False 162 | 163 | def clone_image(self): 164 | for i in get_mapped(extended=False): 165 | if i.name.parent_image != self.rbd or i.name.parent_snap != self.snap: 166 | continue 167 | self.clone = i.name.image 168 | self.dev = i.name.dev 169 | return 170 | 171 | Log.info(f"Cloning {self.extsnap} ..") 172 | self.ceph.protect(self.extsnap) 173 | self.clone = self.ceph.clone(self.extsnap) 174 | self.dev = self.ceph.map(self.clone) 175 | 176 | def mount(self): 177 | if self.vmdk is None: 178 | Log.info(f"Mapping {self.extsnap} ..") 179 | else: 180 | Log.info(f"Mapping {self.extsnap} with vmdk {self.vmdk} ..") 181 | self.clone_image() 182 | 183 | if self.dev is None: 184 | Log.error(f"Cannot map {self.clone} (cloned from {self.extsnap})") 185 | return 186 | 187 | while self.mount_dev(self.dev): 188 | Log.debug("Some progress was made, keep running") 189 | pass 190 | 191 | return 192 | 193 | def has_pv(self, tree): 194 | for i in tree.descendants: 195 | if i.name.fstype == "LVM2_member": 196 | return True 197 | return False 198 | 199 | def umount_tree(self, tree, first_pass=False): 200 | for child in tree.children: 201 | if child.name.dev.endswith(".vmdk"): 202 | self.umount_tree(child, first_pass=first_pass) 203 | 204 | ldm = False 205 | for child in tree.children: 206 | if child.name.ldm is True and first_pass is True: 207 | ldm = True 208 | if child.name.dev.endswith(".vmdk"): 209 | continue 210 | self.umount_tree(child, first_pass=first_pass) 211 | if tree.name.fstype == "LVM2_member": 212 | deactivate_vg(tree.name.dev) 213 | 214 | if ldm is True: 215 | Log.debug("windows dynamic disk detected: removing all devices") 216 | sh.Command("ldmtool")("remove", "all") 217 | 218 | if first_pass is True and self.has_pv(tree): 219 | Log.debug(f"{tree.name.dev}: pv found, return") 220 | return 221 | 222 | if tree.name.mountpoint is not None: 223 | if tree.name.mountpoint in self.umounted: 224 | Log.debug(f"We already umounted {tree.name.mountpoint}") 225 | return 226 | 227 | self.umounted.append(tree.name.mountpoint) 228 | Log.debug(f"\t{tree.name.dev}: umount {tree.name.mountpoint}") 229 | sh.Command("umount")(tree.name.mountpoint) 230 | Log.debug(f"\t{tree.name.dev}: rmdir {tree.name.mountpoint}") 231 | os.rmdir(tree.name.mountpoint) 232 | return 233 | 234 | if tree.name.qemu_nbd is not None: 235 | 
Log.debug(f"\t{tree.name.dev}: kpartx -dv {tree.name.qemu_nbd}") 236 | sh.Command("kpartx")("-dv", tree.name.qemu_nbd) 237 | Log.debug(f"\t{tree.name.dev}: qemu-nbd --disconnect {tree.name.qemu_nbd}") 238 | sh.Command("qemu-nbd")("--disconnect", tree.name.qemu_nbd) 239 | Log.debug(f"\t{tree.name.dev}: rm {tree.name.dev}") 240 | try: 241 | os.unlink(tree.name.dev) 242 | except FileNotFoundError: 243 | pass 244 | return 245 | 246 | if tree.name.image is not None and first_pass is False: 247 | Log.debug(f"\t{tree.name.dev}: rbd unmap {tree.name.image}") 248 | self.ceph.unmap(tree.name.dev) 249 | Log.debug(f"\t{tree.name.dev}: rbd rm {tree.name.image}") 250 | self.ceph.rm(tree.name.image) 251 | Log.debug( 252 | f"\t{tree.name.dev}: rbd unprotect --snap {tree.name.parent_snap} {tree.name.parent_image}" 253 | ) 254 | self.ceph.unprotect(f"{tree.name.parent_image}@{tree.name.parent_snap}") 255 | return 256 | Log.debug(f"{tree.name.dev}: Nothing to do ?") 257 | 258 | def umount(self, recursed=False): 259 | Log.info(f"Unmapping {self.extsnap} ..") 260 | for i in get_mapped(): 261 | part = i.name 262 | if part.parent_image != self.rbd or part.parent_snap != self.snap: 263 | continue 264 | Log.debug("First pass: skip devices which contains PV") 265 | self.umount_tree(i, first_pass=True) 266 | Log.debug("Second pass: process all remaining devices") 267 | self.umount_tree(i, first_pass=False) 268 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. 
And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. 
For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 
209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year>  <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /src/backurne/backurne.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import atexit 3 | import datetime 4 | import json 5 | import multiprocessing 6 | import queue 7 | import signal 8 | import sqlite3 9 | import time 10 | from functools import wraps 11 | 12 | import dateutil.parser 13 | import filelock 14 | import progressbar 15 | import requests 16 | import setproctitle 17 | import sh 18 | 19 | from . import pretty 20 | from . import stats 21 | from .backup import Bck 22 | from .ceph import Ceph 23 | from .config import config 24 | from .disk import get_mapped 25 | from .disk import prepare_tree_to_json 26 | from .disk import print_mapped 27 | from .log import log as Log 28 | from .proxmox import Proxmox 29 | from .restore import Restore 30 | 31 | 32 | VERSION = "2.4.0" 33 | 34 | 35 | def handle_exc(func): 36 | @wraps(func) 37 | def wrapper(*args, **kwargs): 38 | try: 39 | return func(*args, **kwargs) 40 | except filelock.Timeout as e: 41 | Log.debug(e) 42 | except Exception as e: 43 | Log.warning(f"{e} thrown while running {func.__name__}()") 44 | 45 | return wrapper 46 | 47 | 48 | class Check: 49 | def __init__(self, cluster): 50 | self.cluster = cluster 51 | self.err = [] 52 | 53 | def add_err(self, msg): 54 | if msg is None: 55 | return 56 | msg["cluster"] = self.cluster["name"] 57 | self.err.append(msg) 58 | 59 | @handle_exc 60 | def check_img(self, args): 61 | ceph = args["ceph"] 62 | backup = args["backup"] 63 | rbd = args["image"] 64 | 65 | if not ceph.backup.exists(backup.dest): 66 | msg = f"No backup found for {backup} at {ceph} (image does not exist)" 67 | return {"image": rbd, "msg": msg} 68 | 69 | last = ceph.get_last_shared_snap(rbd, backup.dest) 70 | if last is None: 71 | msg = f"No backup found for {backup} at {ceph} (no shared snap)" 72 | return {"image": rbd, "msg": msg} 73 | 74 | when = last.split(";")[3] 75 | when = dateutil.parser.parse(when) 76 | deadline = datetime.timedelta(days=1) + datetime.timedelta(hours=6) 77 | deadline = datetime.datetime.now() - deadline 78 | if when < deadline: 79 | msg = ( 80 | f"Backup found for {backup} at {ceph}, yet too old (created at {when})" 81 | ) 82 | return {"image": rbd, "msg": msg} 83 | 84 | snaps = ceph.backup.snap(backup.dest) 85 | for snap in snaps: 86 | if not Backup.is_expired(snap): 87 | continue 88 | msg = f"Snapshot {backup.dest} / {snap} was not deleted in time, please investigate (may be protected or mapped)."
89 | return {"image": rbd, "msg": msg} 90 | 91 | def cmp_snap(self, backup, ceph, rbd): 92 | live_snaps = ceph.snap(rbd) 93 | try: 94 | backup_snaps = ceph.backup.snap(backup.dest) 95 | except Exception: 96 | backup_snaps = [] 97 | inter = list(set(live_snaps).intersection(backup_snaps)) 98 | for snap in inter: 99 | Log.debug(f"checking {rbd} @ {snap}") 100 | live = ceph.checksum(rbd, snap) 101 | back = ceph.backup.checksum(backup.dest, snap) 102 | if live == back: 103 | continue 104 | 105 | err = { 106 | "image": rbd, 107 | "msg": f"ERR: shared snapshot {snap} does not match\n\tOn live (image: {rbd}): {live}\n\tOn backup (image: {backup.dest}): {back}", 108 | } 109 | self.add_err(err) 110 | 111 | 112 | class CheckProxmox(Check): 113 | def __init__(self, cluster): 114 | super().__init__(cluster) 115 | self.px = Proxmox(cluster) 116 | 117 | def check(self): 118 | data = [] 119 | for vm in self.px.vms(): 120 | for disk, ceph, bck in vm["to_backup"]: 121 | data.append({"ceph": ceph, "backup": bck, "image": disk["rbd"]}) 122 | 123 | self.err = [] 124 | with multiprocessing.Pool() as pool: 125 | for msg in pool.imap_unordered(self.check_img, data): 126 | self.add_err(msg) 127 | 128 | return self.err 129 | 130 | def check_snap(self): 131 | for vm in self.px.vms(): 132 | for disk, ceph, bck in vm["to_backup"]: 133 | self.cmp_snap(bck, ceph, disk["rbd"]) 134 | return self.err 135 | 136 | 137 | class CheckPlain(Check): 138 | def __init__(self, cluster): 139 | super().__init__(cluster) 140 | self.ceph = Ceph( 141 | self.cluster["pool"], 142 | namespace=self.cluster.get("namespace"), 143 | endpoint=self.cluster["fqdn"], 144 | cluster_conf=self.cluster, 145 | ) 146 | 147 | def check(self): 148 | data = [] 149 | for rbd in self.ceph.ls(): 150 | bck = Bck(self.cluster["name"], self.ceph, rbd) 151 | data.append({"ceph": self.ceph, "backup": bck, "image": rbd}) 152 | 153 | self.err = [] 154 | with multiprocessing.Pool() as pool: 155 | for msg in pool.imap_unordered(self.check_img, data): 156 | self.add_err(msg) 157 | 158 | return self.err 159 | 160 | def check_snap(self): 161 | for rbd in self.ceph.ls(): 162 | bck = Bck(self.cluster["name"], self.ceph, rbd) 163 | self.cmp_snap(bck, self.ceph, rbd) 164 | return self.err 165 | 166 | 167 | def run_hook(kind, vmname, diskname): 168 | if config["hooks"][kind] is not None: 169 | sh.Command(config["hooks"][kind])(kind, vmname, diskname) 170 | 171 | 172 | class Backup: 173 | def __init__(self, cluster, regular_queue, priority_queue, status_queue, args=None): 174 | self.cluster = cluster 175 | self.regular_queue = regular_queue 176 | self.priority_queue = priority_queue 177 | self.status_queue = status_queue 178 | self.args = args 179 | 180 | def is_expired(snap, last=False): 181 | split = snap.split(";") 182 | created_at = dateutil.parser.parse(split[-1]) 183 | profile = split[-3] 184 | value = int(split[-2]) 185 | if profile == "daily": 186 | expiration = datetime.timedelta(days=value) 187 | elif profile == "hourly": 188 | expiration = datetime.timedelta(hours=value) 189 | elif profile == "weekly": 190 | expiration = datetime.timedelta(days=7 * value) 191 | elif profile == "monthly": 192 | expiration = datetime.timedelta(days=30 * value) 193 | else: 194 | Log.warning(f"Unknown profile found, no action taken: {profile}") 195 | return False 196 | 197 | expired_at = created_at + expiration 198 | if last is True: 199 | expired_at += datetime.timedelta(days=config["extra_retention_time"]) 200 | 201 | now = datetime.datetime.now() 202 | if expired_at > now: 203 | 
return False 204 | return True 205 | 206 | def _create_snap(self, bck, profiles, pre_vm_hook): 207 | todo = [] 208 | is_high_prio = False 209 | 210 | hooked = False 211 | 212 | try: 213 | with Lock(bck.dest): 214 | for profile, value in profiles: 215 | self.status_queue.put("add_item") 216 | if not self.args.force and not bck.check_profile(profile): 217 | self.status_queue.put("done_item") 218 | continue 219 | 220 | if pre_vm_hook is False: 221 | try: 222 | run_hook("pre_vm", bck.vm["name"], bck.rbd) 223 | except Exception as e: 224 | out = ( 225 | e.stdout.decode("utf-8") 226 | + e.stderr.decode("utf-8").rstrip() 227 | ) 228 | Log.warning( 229 | "pre_vm hook failed on %s/%s with code %s : %s" 230 | % (bck.vm["name"], bck.rbd, e.exit_code, out) 231 | ) 232 | self.status_queue.put("done_item") 233 | return None 234 | hooked = True 235 | 236 | try: 237 | if bck.vm is not None: 238 | run_hook("pre_disk", bck.vm["name"], bck.rbd) 239 | else: 240 | run_hook("pre_disk", bck.source, bck.rbd) 241 | except Exception as e: 242 | out = ( 243 | e.stdout.decode("utf-8") + e.stderr.decode("utf-8").rstrip() 244 | ) 245 | Log.warning( 246 | "pre_disk hook failed on %s/%s with code %s : %s" 247 | % (bck.vm["name"] if bck.vm is not None else bck.source, bck.rbd, e.exit_code, out)  # bck.vm is None for plain (non-Proxmox) backups 248 | ) 249 | self.status_queue.put("done_item") 250 | continue 251 | setproctitle.setproctitle( 252 | f"Backurne: snapshooting {bck.rbd} on {bck.name}" 253 | ) 254 | dest, last_snap, snap_name = bck.make_snap(profile, value["count"]) 255 | 256 | try: 257 | run_hook("post_disk", bck.vm["name"] if bck.vm is not None else bck.source, bck.rbd) 258 | except Exception: 259 | pass 260 | 261 | if dest is not None: 262 | todo.append( 263 | { 264 | "dest": dest, 265 | "last_snap": last_snap, 266 | "snap_name": snap_name, 267 | "backup": bck, 268 | } 269 | ) 270 | 271 | priority = value.get("priority") 272 | if priority == "high": 273 | is_high_prio = True 274 | except filelock.Timeout: 275 | Log.info(f"unable to acquire lock for {bck.vm['name'] if bck.vm is not None else bck.source}") 276 | pass 277 | if len(todo) != 0: 278 | if is_high_prio: 279 | self.priority_queue.put(todo) 280 | else: 281 | self.regular_queue.put(todo) 282 | setproctitle.setproctitle("Backurne idle producer") 283 | return hooked 284 | 285 | def create_snaps(self): 286 | items = self.list() 287 | with multiprocessing.Pool(config["live_worker"]) as pool: 288 | for i in pool.imap_unordered(self.create_snap, items): 289 | pass 290 | 291 | def _custom_key(self, item): 292 | return item.split(";")[3] 293 | 294 | def _expire_item(self, ceph, disk, vm=None): 295 | self.status_queue.put("add_item") 296 | self.status_queue.put("done_item") 297 | 298 | if vm is not None: 299 | bck = Bck(disk["ceph"], ceph, disk["rbd"], vm=vm, adapter=disk["adapter"]) 300 | rbd = disk["rbd"] 301 | else: 302 | bck = Bck(self.cluster["name"], ceph, disk) 303 | rbd = disk 304 | 305 | backups = Ceph(None).snap(bck.dest) 306 | 307 | snaps = ceph.snap(rbd) 308 | shared = list(set(backups).intersection(snaps)) 309 | 310 | try: 311 | shared.sort(key=self._custom_key) 312 | shared = shared.pop() 313 | except IndexError: 314 | shared = None 315 | 316 | by_profile = {} 317 | for snap in snaps: 318 | # The last shared snapshot must be kept 319 | # Also, subsequent snaps shall be kept as well, 320 | # because a backup may be pending elsewhere 321 | if shared is None or snap.split(";")[3] >= shared.split(";")[3]: 322 | continue 323 | tmp = snap.split(";") 324 | if tmp[1] not in by_profile: 325 | by_profile[tmp[1]] = [] 326 | i = by_profile[tmp[1]] 327 | i.append(snap) 328 | 329 | to_del = [] 330 | for profile, snaps in
by_profile.items(): 331 | try: 332 | profile = config["profiles"][profile] 333 | except KeyError: 334 | # Profile no longer exists, we can drop all these snaps 335 | to_del += snaps 336 | continue 337 | try: 338 | max_on_live = profile["max_on_live"] 339 | except KeyError: 340 | max_on_live = 1 341 | 342 | for _ in range(0, max_on_live): 343 | try: 344 | snaps.pop() 345 | except IndexError: 346 | # We do not have enough snaps on live 347 | # snaps is now an empty list, nothing to delete 348 | break 349 | 350 | to_del += snaps 351 | for i in to_del: 352 | ceph.rm_snap(rbd, i) 353 | 354 | def expire_live(self): 355 | items = self.list() 356 | with multiprocessing.Pool(config["live_worker"]) as pool: 357 | for i in pool.imap_unordered(self.expire_item, items): 358 | pass 359 | 360 | @handle_exc 361 | def expire_backup(i): 362 | ceph = i["ceph"] 363 | image = i["image"] 364 | i["status_queue"].put("done_item") 365 | 366 | with Lock(image): 367 | snaps = ceph.snap(image) 368 | try: 369 | # Pop the last snapshot 370 | # We will take care of it later 371 | last = snaps.pop() 372 | except IndexError: 373 | # We found an image without snapshot 374 | # Someone is messing around, or this is a bug 375 | # Anyway, the image can be deleted 376 | ceph.rm(image) 377 | return 378 | 379 | for snap in snaps: 380 | if not Backup.is_expired(snap): 381 | continue 382 | ceph.rm_snap(image, snap) 383 | 384 | snaps = ceph.snap(image) 385 | if len(snaps) == 1: 386 | if Backup.is_expired(last, last=True): 387 | ceph.rm_snap(image, snaps[0]) 388 | 389 | if len(ceph.snap(image)) == 0: 390 | Log.debug(f"{image} has no snapshot left, deleting") 391 | ceph.rm(image) 392 | 393 | 394 | class BackupProxmox(Backup): 395 | def __init__(self, cluster, regular_queue, priority_queue, status_queue, args): 396 | super().__init__(cluster, regular_queue, priority_queue, status_queue, args) 397 | 398 | def __fetch_profiles(self, vm, disk): 399 | profiles = list(config["profiles"].items()) 400 | 401 | if config["profiles_api"] is None: 402 | return profiles 403 | 404 | try: 405 | json = { 406 | "cluster": { 407 | "type": "proxmox", 408 | "name": self.cluster["name"], 409 | "fqdn": self.cluster["fqdn"], 410 | }, 411 | "vm": { 412 | "vmid": vm["vmid"], 413 | "name": vm["name"], 414 | }, 415 | "disk": disk, 416 | } 417 | 418 | add = requests.post(config["profiles_api"], json=json, timeout=10) 419 | add.raise_for_status() 420 | add = add.json() 421 | 422 | if "backup" in add and add["backup"] is False: 423 | return [] 424 | 425 | if "profiles" in add: 426 | profiles += list(add["profiles"].items()) 427 | 428 | except Exception as e: 429 | Log.warning(f"{e} thrown while fetching profiles for {vm}") 430 | return profiles 431 | 432 | def list(self): 433 | result = [] 434 | 435 | try: 436 | px = Proxmox(self.cluster) 437 | for vm in px.vms(): 438 | if vm["smbios"] is None and self.cluster["use_smbios"] is True: 439 | if config["uuid_fallback"] is False: 440 | Log.warning("No smbios found, skipping") 441 | continue 442 | result.append(vm) 443 | except Exception as e: 444 | Log.error(f"{e} thrown while listing vm on {self.cluster['name']}") 445 | return result 446 | 447 | def filter_profiles(self, profiles, _filter): 448 | if _filter is None: 449 | return profiles 450 | 451 | result = [] 452 | for profile in profiles: 453 | if profile[0] == _filter: 454 | result.append(profile) 455 | else: 456 | Log.debug(f"Skipping profile {profile[0]}, due to --profile") 457 | return result 458 | 459 | @handle_exc 460 | def create_snap(self, vm): 461 | 
setproctitle.setproctitle("Backurne idle producer") 462 | 463 | if self.args.vmid is not None: 464 | if vm["vmid"] != self.args.vmid: 465 | Log.debug(f"Skipping VM {vm['vmid']}, due to --vmid") 466 | return 467 | 468 | px = Proxmox(self.cluster) 469 | # We freeze the VM once, and create all snaps at the same time 470 | # Exports are done after thawing, because it is time-consuming, 471 | # and we must not keep the VM frozen longer than necessary 472 | px.freeze(vm["node"], vm) 473 | 474 | pre_vm_hook = False 475 | 476 | for disk, ceph, bck in vm["to_backup"]: 477 | profiles = self.__fetch_profiles(vm, disk) 478 | profiles = self.filter_profiles(profiles, self.args.profile) 479 | hooked = self._create_snap(bck, profiles, pre_vm_hook) 480 | if hooked is None: 481 | # pre_vm hook failed, we skip all its disks 482 | break 483 | 484 | if hooked is True: 485 | pre_vm_hook = True 486 | 487 | if pre_vm_hook is True: 488 | run_hook("post_vm", bck.vm["name"], bck.rbd) 489 | 490 | px.thaw(vm["node"], vm) 491 | 492 | @handle_exc 493 | def expire_item(self, vm): 494 | for disk, ceph, bck in vm["to_backup"]: 495 | if self.args.vmid is not None: 496 | if vm["vmid"] != self.args.vmid: 497 | Log.debug(f"Skipping VM {vm['vmid']}, due to --vmid") 498 | return 499 | 500 | with Lock(bck.dest): 501 | self._expire_item(ceph, disk, vm) 502 | 503 | 504 | class BackupPlain(Backup): 505 | def __init__(self, cluster, regular_queue, priority_queue, status_queue, args): 506 | super().__init__(cluster, regular_queue, priority_queue, status_queue, args) 507 | self.ceph = Ceph( 508 | self.cluster["pool"], 509 | namespace=self.cluster.get("namespace"), 510 | endpoint=self.cluster["fqdn"], 511 | cluster_conf=self.cluster, 512 | ) 513 | 514 | def list(self): 515 | try: 516 | return self.ceph.ls() 517 | except Exception as e: 518 | Log.warning(e) 519 | return [] 520 | 521 | @handle_exc 522 | def create_snap(self, rbd): 523 | setproctitle.setproctitle("Backurne idle producer") 524 | bck = Bck(self.cluster["name"], self.ceph, rbd) 525 | self._create_snap(bck, config["profiles"].items(), True) 526 | 527 | @handle_exc 528 | def expire_item(self, rbd): 529 | bck = Bck(self.cluster["name"], self.ceph, rbd) 530 | with Lock(bck.dest): 531 | self._expire_item(self.ceph, rbd) 532 | 533 | 534 | class Status_updater: 535 | class Real_updater: 536 | def __init__(self, status_queue, desc): 537 | self.todo = 0 538 | self.total = 0 539 | self.status_queue = status_queue 540 | self.desc = desc 541 | 542 | if config["log_level"] != "debug": 543 | # progressbar uses signal.SIGWINCH 544 | # It messes with multiprocessing, so we break it 545 | real_signal = signal.signal 546 | signal.signal = None 547 | widget = [ 548 | progressbar.widgets.SimpleProgress(), 549 | " ", 550 | desc, 551 | " (", 552 | progressbar.widgets.Timer(), 553 | ")", 554 | ] 555 | self.bar = progressbar.ProgressBar(maxval=1, widgets=widget) 556 | signal.signal = real_signal 557 | 558 | @handle_exc 559 | def __call__(self): 560 | Log.debug("Real_updater started") 561 | if config["log_level"] != "debug": 562 | self.bar.start() 563 | self.__work__() 564 | if config["log_level"] != "debug": 565 | self.bar.finish() 566 | Log.debug("Real_updater ended") 567 | 568 | def __update(self): 569 | done = self.total - self.todo 570 | msg = f"Backurne : {done}/{self.total} {self.desc}" 571 | setproctitle.setproctitle(msg) 572 | if config["log_level"] != "debug": 573 | self.bar.maxval = self.total 574 | self.bar.update(done) 575 | 576 | def __work__(self): 577 | while True: 578 | try: 579 |
msg = self.status_queue.get(block=False) 580 | except queue.Empty: 581 | self.__update() 582 | time.sleep(1) 583 | continue 584 | if msg == "add_item": 585 | self.total += 1 586 | self.todo += 1 587 | elif msg == "done_item": 588 | self.todo -= 1 589 | else: 590 | Log.error(f"Unknown message received: {msg}") 591 | self.__update() 592 | 593 | def __init__(self, manager, desc): 594 | self.status_queue = manager.Queue() 595 | self.desc = desc 596 | 597 | def __enter__(self): 598 | target = Status_updater.Real_updater(self.status_queue, self.desc) 599 | self.real_updater = multiprocessing.Process(target=target) 600 | atexit.register(self.real_updater.terminate) 601 | self.real_updater.start() 602 | return self.status_queue 603 | 604 | def __exit__(self, type, value, traceback): 605 | self.real_updater.terminate() 606 | print("") 607 | 608 | 609 | class Lock: 610 | def __init__(self, path): 611 | path = path.replace("/", "") 612 | self.path = f"{config['lockdir']}/{path}" 613 | self.lock = filelock.FileLock(self.path, timeout=0) 614 | 615 | def __enter__(self): 616 | Log.debug(f"locking {self.path}") 617 | self.lock.acquire() 618 | 619 | def __exit__(self, type, value, traceback): 620 | Log.debug(f"releasing lock {self.path}") 621 | self.lock.release() 622 | 623 | 624 | class Producer: 625 | def __init__(self, params, args): 626 | self.cluster = params["cluster"] 627 | self.regular_queue = params["regular_q"] 628 | self.priority_queue = params["priority_q"] 629 | self.status_queue = params["status_q"] 630 | self.args = args 631 | 632 | @handle_exc 633 | def __call__(self): 634 | Log.debug("Producer started") 635 | setproctitle.setproctitle("Backurne Producer") 636 | self.__work__() 637 | # We send one None per live_worker 638 | # That way, all of them shall die 639 | for i in range(0, config["live_worker"]): 640 | try: 641 | self.regular_queue.put(None) 642 | self.priority_queue.put(None) 643 | except Exception: 644 | Log.error( 645 | "cannot end a live_worker! 
This is a critical bug, we will never die" 646 | ) 647 | 648 | Log.debug("Producer ended") 649 | 650 | @handle_exc 651 | def __work__(self): 652 | if self.args.cluster is not None: 653 | if self.cluster["name"] != self.args.cluster: 654 | Log.debug(f"Skipping cluster {self.cluster['name']} due to --cluster") 655 | return 656 | Log.debug(f"Backuping {self.cluster['type']}: {self.cluster['name']}") 657 | if self.cluster["type"] == "proxmox": 658 | bidule = BackupProxmox( 659 | self.cluster, 660 | self.regular_queue, 661 | self.priority_queue, 662 | self.status_queue, 663 | self.args, 664 | ) 665 | else: 666 | bidule = BackupPlain( 667 | self.cluster, 668 | self.regular_queue, 669 | self.priority_queue, 670 | self.status_queue, 671 | self.args, 672 | ) 673 | bidule.create_snaps() 674 | 675 | 676 | class Consumer: 677 | def __init__(self, params): 678 | self.id = params["id"] 679 | self.cluster = params["cluster"] 680 | self.regular_queue = params["regular_q"] 681 | self.priority_queue = params["priority_q"] 682 | self.status_queue = params["status_q"] 683 | 684 | # Track the queue status 685 | # When both are dead, the worker can die in peace 686 | self.priority_alive = True 687 | self.regular_alive = True 688 | 689 | @handle_exc 690 | def __call__(self): 691 | Log.debug("Consumer started") 692 | setproctitle.setproctitle("Backurne Consumer") 693 | try: 694 | lockname = f"Consumer-{self.cluster['name']}-{self.id}" 695 | with Lock(lockname): 696 | self.__work__() 697 | except filelock.Timeout: 698 | Log.debug(f"Cannot lock: {lockname}, another instance is running") 699 | Log.debug("Consumer ended") 700 | 701 | def __work__(self): 702 | while True: 703 | setproctitle.setproctitle( 704 | f"Backurne idle consumer ({self.cluster['name']})" 705 | ) 706 | 707 | if self.priority_alive is False and self.regular_alive is False: 708 | break 709 | 710 | snaps = [] 711 | if self.priority_alive is True: 712 | try: 713 | snaps = self.priority_queue.get_nowait() 714 | except queue.Empty: 715 | pass 716 | 717 | if snaps is None: 718 | self.priority_alive = False 719 | continue 720 | 721 | if len(snaps) == 0 and self.regular_alive is True: 722 | try: 723 | snaps = self.regular_queue.get_nowait() 724 | except queue.Empty: 725 | pass 726 | 727 | if snaps is None: 728 | self.regular_alive = False 729 | continue 730 | 731 | if len(snaps) == 0: 732 | time.sleep(1) 733 | continue 734 | 735 | try: 736 | with Lock(snaps[0]["dest"]): 737 | for snap in snaps: 738 | setproctitle.setproctitle( 739 | f"Backurne: fetching {snap['backup'].source} ({snap['snap_name']})" 740 | ) 741 | backup = snap["backup"] 742 | backup.dl_snap( 743 | snap["snap_name"], snap["dest"], snap["last_snap"] 744 | ) 745 | except filelock.Timeout: 746 | pass 747 | except Exception as e: 748 | Log.error(e) 749 | self.status_queue.put("done_item") 750 | setproctitle.setproctitle("Backurne idle consumer") 751 | 752 | 753 | def get_sqlite(): 754 | sql = sqlite3.connect(config["check_db"], isolation_level=None) 755 | sql.execute( 756 | "create table if not exists results (date text, cluster text, disk text, msg text)" 757 | ) 758 | return sql 759 | 760 | 761 | def print_check_results(): 762 | sql = get_sqlite() 763 | 764 | failed = sql.execute( 765 | 'select * from results where date < strftime("%s", "now") - 7200' 766 | ) 767 | failed = [i for i in failed] 768 | 769 | if len(failed) > 0: 770 | print(f"Error: {len(failed)} failed backups found") 771 | for err in failed: 772 | print(f"{err[1]} : {err[3]}") 773 | exit(2) 774 | 775 | print("OK: all things 
are backed up!") 776 | exit(0) 777 | 778 | 779 | def update_check_results(check_results): 780 | sql = get_sqlite() 781 | 782 | failed_db = [i for i in sql.execute("select date, cluster, disk from results")] 783 | for i in failed_db: 784 | found = False 785 | for j in check_results: 786 | if i[1] != j["cluster"]: 787 | continue 788 | if i[2] != j["image"]: 789 | continue 790 | found = True 791 | break 792 | if found is False: 793 | sql.execute( 794 | "delete from results where cluster = ? and disk = ?", (i[1], i[2]) 795 | ) 796 | 797 | for i in check_results: 798 | found = False 799 | for j in failed_db: 800 | if j[1] != i["cluster"]: 801 | continue 802 | if j[2] != i["image"]: 803 | continue 804 | found = True 805 | break 806 | if found is False: 807 | sql.execute( 808 | 'insert into results values(strftime("%s", "now"), ?, ?, ?)', 809 | (i["cluster"], i["image"], i["msg"]), 810 | ) 811 | 812 | 813 | def get_args(): 814 | parser = argparse.ArgumentParser() 815 | parser.add_argument("--debug", action="store_true") 816 | 817 | sub = parser.add_subparsers(dest="action", required=True) 818 | back = sub.add_parser("backup") 819 | back.add_argument("--cluster", dest="cluster", nargs="?") 820 | back.add_argument("--vmid", dest="vmid", nargs="?", type=int) 821 | back.add_argument("--profile", dest="profile", nargs="?") 822 | back.add_argument("--force", action="store_true") 823 | back.add_argument("--no-cleanup", action="store_true") 824 | back.add_argument("--cleanup", action="store_true") 825 | 826 | sub.add_parser("precheck") 827 | sub.add_parser("check") 828 | sub.add_parser("check-snap") 829 | sub.add_parser("stats") 830 | sub.add_parser("version") 831 | 832 | ls = sub.add_parser("list-mapped") 833 | ls.add_argument("--json", action="store_true") 834 | 835 | ls = sub.add_parser("ls") 836 | ls.add_argument(dest="rbd", nargs="?") 837 | ls.add_argument("--json", action="store_true") 838 | 839 | _map = sub.add_parser("map") 840 | _map.add_argument(dest="rbd") 841 | _map.add_argument(dest="snapshot") 842 | _map.add_argument(dest="vmdk", nargs="?") 843 | 844 | unmap = sub.add_parser("unmap") 845 | unmap.add_argument(dest="rbd") 846 | unmap.add_argument(dest="snapshot") 847 | return parser.parse_args() 848 | 849 | 850 | def main(): 851 | args = get_args() 852 | 853 | if args.debug: 854 | import logging 855 | 856 | Log.setLevel(logging.DEBUG) 857 | 858 | if args.action == "stats": 859 | stats.print_stats() 860 | elif args.action == "version": 861 | print(f"Backurne version {VERSION}") 862 | elif args.action == "check": 863 | print_check_results() 864 | elif args.action in ("precheck", "check-snap"): 865 | result = [] 866 | 867 | for cluster in config["live_clusters"]: 868 | Log.info(f"Checking {cluster['type']}: {cluster['name']}") 869 | if cluster["type"] == "proxmox": 870 | check = CheckProxmox(cluster) 871 | else: 872 | check = CheckPlain(cluster) 873 | if args.action == "precheck": 874 | ret = check.check() 875 | else: 876 | ret = check.check_snap() 877 | result += ret 878 | 879 | update_check_results(result) 880 | print_check_results() 881 | elif args.action == "backup": 882 | if args.vmid is not None and args.cluster is None: 883 | Log.error("--vmid has no meaning without --cluster") 884 | exit(1) 885 | 886 | manager = multiprocessing.Manager() 887 | atexit.register(manager.shutdown) 888 | 889 | live_workers = [] 890 | 891 | with Status_updater(manager, "images processed") as status_queue: 892 | for cluster in config["live_clusters"]: 893 | params = { 894 | "cluster": cluster, 895 | 
"regular_q": manager.Queue(), 896 | "priority_q": manager.Queue(), 897 | "status_q": status_queue, 898 | } 899 | 900 | producer = multiprocessing.Process(target=Producer(params, args)) 901 | atexit.register(producer.terminate) 902 | producer.start() 903 | 904 | for i in range(0, config["live_worker"]): 905 | params["id"] = i 906 | pid = multiprocessing.Process(target=Consumer(params)) 907 | atexit.register(pid.terminate) 908 | live_workers.append(pid) 909 | pid.start() 910 | 911 | # Workers will exit upon a None reception 912 | # When all of them are done, we are done 913 | for pid in live_workers: 914 | pid.join() 915 | 916 | if args.no_cleanup is True: 917 | Log.debug("not cleaning up as --no-cleanup is used") 918 | exit(0) 919 | 920 | with Status_updater( 921 | manager, "images cleaned up on live clusters" 922 | ) as status_queue: 923 | for cluster in config["live_clusters"]: 924 | if args.cluster is not None: 925 | if cluster["name"] != args.cluster: 926 | Log.debug( 927 | f"Skipping cluster {cluster['name']} due to --cluster" 928 | ) 929 | continue 930 | 931 | Log.debug( 932 | f"Expire snapshots from live {cluster['type']}: {cluster['name']}" 933 | ) 934 | if cluster["type"] == "proxmox": 935 | bidule = BackupProxmox(cluster, None, None, status_queue, args) 936 | else: 937 | bidule = BackupPlain(cluster, None, None, status_queue, args) 938 | bidule.expire_live() 939 | 940 | if ( 941 | args.cleanup 942 | or args.cluster is None 943 | and args.profile is None 944 | and args.vmid is None 945 | ): 946 | Log.debug("Expiring our snapshots") 947 | # Dummy Ceph object used to retrieve the real backup Object 948 | ceph = Ceph(None) 949 | 950 | with Status_updater( 951 | manager, "images cleaned up on backup cluster" 952 | ) as status_queue: 953 | data = [] 954 | for i in ceph.ls(): 955 | data.append( 956 | {"ceph": ceph, "image": i, "status_queue": status_queue} 957 | ) 958 | status_queue.put("add_item") 959 | with multiprocessing.Pool(config["backup_worker"]) as pool: 960 | for i in pool.imap_unordered(Backup.expire_backup, data): 961 | pass 962 | 963 | manager.shutdown() 964 | elif args.action == "ls": 965 | restore = Restore(args.rbd, None) 966 | data = restore.ls() 967 | if args.rbd is None: 968 | pt = pretty.Pt(["Ident", "Disk", "UUID"]) 969 | 970 | for i in data: 971 | row = [i["ident"], i["disk"], i["uuid"]] 972 | pt.add_row(row) 973 | else: 974 | pt = pretty.Pt(["Creation date", "UUID"]) 975 | 976 | for i in data: 977 | row = [i["creation"], i["uuid"]] 978 | pt.add_row(row) 979 | 980 | if args.json is True: 981 | print(json.dumps(data, default=str)) 982 | else: 983 | print(pt) 984 | elif args.action == "list-mapped": 985 | data = get_mapped(extended=False) 986 | if args.json is True: 987 | result = [] 988 | for tree in data: 989 | result.append(prepare_tree_to_json(tree)) 990 | print(json.dumps(result)) 991 | else: 992 | print_mapped(data) 993 | elif args.action == "map": 994 | Restore(args.rbd, args.snapshot, args.vmdk).mount() 995 | for i in get_mapped(extended=False): 996 | if i.name.parent_image != args.rbd or i.name.parent_snap != args.snapshot: 997 | continue 998 | print_mapped( 999 | [ 1000 | i, 1001 | ] 1002 | ) 1003 | return 1004 | 1005 | elif args.action == "unmap": 1006 | restore = Restore(args.rbd, args.snapshot) 1007 | restore.umount() 1008 | 1009 | 1010 | if __name__ == "__main__": 1011 | main() 1012 | --------------------------------------------------------------------------------