├── README.md
├── pg_archive2
│   ├── tests
│   │   ├── vars.bash
│   │   ├── functions.bash
│   │   ├── docker-entrypoint.sh
│   │   ├── 0003_test-restore_cmd.bats
│   │   ├── 0002_test-wal-cleanup.bats
│   │   └── 0001_test-archive_cmd.bats
│   ├── docker-compose.yaml
│   ├── Makefile
│   ├── Dockerfile
│   ├── wal-sync
│   ├── README.md
│   ├── archive_remote_cmd_2
│   ├── wal-cleanup_2
│   ├── base-backup_2
│   ├── archive_cmd_2
│   ├── restore_cmd_2
│   └── backup_queue.sql
├── pg_archive
│   ├── README.md
│   ├── restore_cmd
│   ├── archive_remote_cmd
│   └── archive_cmd
├── LICENSE
├── munin
│   └── vacuum_queue
└── tools
    └── optimize_table.pl

/README.md:
--------------------------------------------------------------------------------
# dba-utils
--------------------------------------------------------------------------------
/pg_archive2/tests/vars.bash:
--------------------------------------------------------------------------------
#!/bin/bash
#
# default values for pg_archive2 vars
#
PGVER=${PGVER%:*}

BACKUP_DB_HOST='test'
BACKUP_DB_NAME='test'
BACKUP_DB_PORT='5432'
BACKUP_DB_USER='postgres'

AVITOOLS="/app/"
--------------------------------------------------------------------------------
/pg_archive2/docker-compose.yaml:
--------------------------------------------------------------------------------
version: "2.3"
services:

  test:
    hostname: test
    image: "${IMAGE_NAME}"
    command: test
    volumes:
      - "./:${APP_DIR}"

  test-archive02:
    hostname: test-archive02
    image: "${IMAGE_NAME}"
    volumes:
      - "./:${APP_DIR}"

  test-archive03:
    hostname: test-archive03
    image: "${IMAGE_NAME}"
    volumes:
      - "./:${APP_DIR}"
--------------------------------------------------------------------------------
/pg_archive/README.md:
--------------------------------------------------------------------------------
General description
=================

* archive_cmd — runs on the primary; compresses the WAL and sends it to the archive server over ssh
* archive_remote_cmd — called synchronously on the archive server by the primary; receives the WAL
* restore_cmd — runs on the standby (recovery.conf); fetches the WAL from the archive for replay

### Examples

```sh
archive_command = '/usr/local/bin/archive_cmd arch-host /mnt/nfs/wals_logs %p %f'
restore_command = '/usr/local/bin/restore_cmd /mnt/nfs/wals_logs %f %p'

# two-day (172800 seconds) delay of replaying the WAL from archive
restore_command = '/usr/local/bin/restore_cmd /mnt/nfs/wals_logs %f %p 172800'
```
--------------------------------------------------------------------------------
/pg_archive2/tests/functions.bash:
--------------------------------------------------------------------------------
#!/bin/bash

_psql ()
{
    command psql -h ${BACKUP_DB_HOST} -X -At -Ppager=off -vON_ERROR_STOP=1 "$@"
}

_copy_function ()
{
    local from_name=$1
    local to_name=$2
    #
    # https://mharrison.org/post/bashfunctionoverride/
    #
    local orig_func=$(declare -f "$from_name")
    local newname_func="${to_name}${orig_func#$from_name}"
    eval "$newname_func"
}

_init_backup_task ()
{
    _psql -d test -f - <<'EOF'
insert into backups.hosts (host, cluster_name, archiver_name, keep_backups_cnt, periodicity_days, directory)
values ('test', 'master5', '{"test", "test-archive02"}', 2, 4, '/archive/');
EOF
}

_cleanup_backup_task ()
{
    _psql -d test -f - <<'EOF'
truncate backups.hosts, backups.tasks restart identity;
EOF
}
--------------------------------------------------------------------------------
/pg_archive2/tests/docker-entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# use 'docker logs test-pg-archive' for debugging this script
#
TIMEOUT=10

gosu ()
{
    /usr/sbin/gosu "$@"
}

set -x

if (( $(id -u) != 0 )); then
    gosu root bash -l $0 "$@"
else
    service ssh start &
    exit # root
fi

echo "wait for ssh to start ($TIMEOUT)"
for ((i = $TIMEOUT; i > 0; i--)); do
    gosu root pidof /usr/sbin/sshd && break
    sleep 1
done
(( i )) || { echo "timeout"; exit 1; }

# ssh-keyscan localhost test-archive02 test-archive03 > ~/.ssh/known_hosts 2> /dev/null
for name in localhost test test-archive02 test-archive03; do
    for key in /etc/ssh/*.pub; do
        echo "$name $(< "$key")" >> ~/.ssh/known_hosts
    done
done

if [[ $1 = 'test' ]]; then
    pg_ctl start -w -o '-h 0.0.0.0 --fsync=off'
fi
echo "= docker-entrypoint.sh started ="

sleep inf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Avito Technology

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/pg_archive/restore_cmd:
--------------------------------------------------------------------------------
#!/bin/bash

pmax=4
local_dir=/var/local/WALs
only_local= # use WALs only from local_dir, do not read them from NFS

delay_time="$4"

if [ -f /etc/default/restore_cmd ]; then
    . /etc/default/restore_cmd
fi

set -e

src_dir="$1"
fname="$2"
dst_file="$3"

if [ -z "$src_dir" -o -z "$fname" -o -z "$dst_file" ]; then
    echo -e "usage: restore_cmd SRC-DIR SRC-WAL-FILENAME DST-WAL-FILENAME-WITH-PATH [DELAY-SECONDS]\n" \
            "\n" \
            "SRC-DIR - archive directory with WALs\n" \
            "SRC-WAL-FILENAME - %f (file name)\n" \
            "DST-WAL-FILENAME-WITH-PATH - %p (file name with path)\n" \
            "DELAY-SECONDS - copy the WAL file only if it is older than this many seconds\n"
    exit 1
fi

if [ -d "$local_dir" ] && [ -f "$local_dir"/"$fname" ]; then
    src_dir="$local_dir"
fi

if [ "$only_local" ] && [ "$src_dir" != "$local_dir" ]; then
    # src_dir was set to local_dir above; if it was not, the requested WAL file does not exist in local_dir, exit now
    exit 1
fi

if [ "$delay_time" ] && [ "$delay_time" -gt 0 ]; then
    if [ -f "$src_dir"/"$fname" ]; then
        ftime="$(stat -c %Y "$src_dir"/"$fname")"
        apply_time="$(date --date="now - $delay_time seconds" +%s)"
        if [ "$ftime" -gt "$apply_time" ]; then
            # file is too new, skip it, replay it only after $delay_time seconds
            # show message only sometimes
            if [ $(( (ftime - apply_time) % (10 * 60) )) -lt 3 ]; then
                echo "file '$fname' is too new ($((ftime - apply_time))s), skip it"
            fi
            exit 1
        fi
    fi
fi

cp "$src_dir"/"$fname" "$dst_file"

mime=$(file -m /etc/postgresql-common/compress.mime.mgc -b --mime-type "$dst_file")

if [ "$mime" = "application/x-bzip2" ]; then
    pbzip2 -p"$pmax" -d < "$dst_file" > "$dst_file".bz2-tmp
    mv "$dst_file".bz2-tmp "$dst_file"
elif [ "$mime" = "application/x-gzip" ]; then
    gunzip < "$dst_file" > "$dst_file".gz-tmp
    mv "$dst_file".gz-tmp "$dst_file"
fi

exit 0
--------------------------------------------------------------------------------
/pg_archive2/Makefile:
--------------------------------------------------------------------------------
SHELL := bash
PGVER := 9.6
IMAGE_NAME := postgresql-test
CONTAINER_NAME := test
CONTAINERS := $(CONTAINER_NAME) test-archive02 test-archive03
APP_DIR := /app

DBUSER := postgres
DBNAME := test
NOTERM ?= -T

SRC_APP := $(wildcard *_2) wal-sync
SRC_SQL := backup_queue.sql
APP_TESTS_DIR := tests
APP_TESTS := $(wildcard $(APP_TESTS_DIR)/*.bats $(APP_TESTS_DIR)/*.bash)

export IMAGE_NAME APP_DIR

all: test

.PHONY: help test test-app clean docker-clean distclean

help:
	@echo "DBG $(SRC_APP)"
	@echo "help         - show this help"
	@echo "docker-build - build docker image"
	@echo "docker-clean - stop container and remove image"
	@echo "clean        - remove artifacts (except image)"
	@echo "distclean    - remove all artifacts"
	@echo "test         - run tests"
	@echo "test-app     - run app tests only"

clean: .stopped
	docker-compose down -v -t1
	rm -f .started .stopped .migration

docker-clean: .stopped
	docker rmi -f $(IMAGE_NAME)
	@echo
	rm -f docker-build

distclean: clean docker-clean

docker-build: Dockerfile
	docker build --build-arg PGVER=$(PGVER) --tag $(IMAGE_NAME) .
	rm -f .stopped
	@date > $@

.started: docker-build .stopped
	docker-compose up -d
	for host in $(CONTAINERS); do \
	    echo "wait for $$host to start ..."; \
	    for ((i = 10; i > 0; i--)); do \
	        [[ $$(docker-compose logs --tail 10 $$host) = *'= docker-entrypoint.sh started ='* ]] && break; \
	        sleep 1; \
	    done; \
	    (( i )) || { echo "$$host start timeout"; docker logs $$host; exit 1; }; \
	done
	@date > $@

.stopped:
	docker-compose down -v -t1
	rm -f .started
	@date > $@

test: .migration test-app

.migration: .started $(SRC_SQL)
	docker-compose exec $(NOTERM) $(CONTAINER_NAME) psql -X -At -Ppager=off -vON_ERROR_STOP=1 -1 -f "$(SRC_SQL)" test
	@date > $@

test-app: .started $(SRC_APP) $(APP_TESTS)
	docker-compose exec $(NOTERM) $(CONTAINER_NAME) bash -c 'bats $(APP_DIR)/$(APP_TESTS_DIR)'
--------------------------------------------------------------------------------
/pg_archive2/Dockerfile:
--------------------------------------------------------------------------------
ARG PGVER
FROM postgres:${PGVER}

ARG PGVER
ARG PGHOME=/var/lib/postgresql
ENV PGDATA=/var/lib/postgresql/test
ENV PGVER=${PGVER}
ENV HOME=${PGHOME}

SHELL ["/bin/bash", "-c"]

# app and app tests
USER root

RUN export DEBIAN_FRONTEND=noninteractive \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        ssh pbzip2 rsync file gosu \
        bats wget ca-certificates \
        less vim tree \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /tmp/*

RUN chmod +s /usr/sbin/gosu \
    && usermod --home ${PGHOME} postgres \
    && echo -e '\nSSHD_OPTS=-De' >> /etc/default/ssh \
    && install -v -o postgres -g postgres -d \
        /archive \
        /archive/wals \
        /archive/logs.complete \
        /lib/init/rw/pg_recv_sb \
    && install -v -o postgres -g postgres \
        /dev/null /etc/postgresql-common/compress.mime.mgc \
    && echo -e 'remote_cmd=/app/archive_remote_cmd_2\n' >> /etc/default/archive_cmd \
    && echo -e 'remote_cmd=/app/archive_remote_cmd_2\n' >> /etc/default/archive_remote_cmd

# switch to app system user and setup db tests
USER postgres

RUN initdb \
    && echo 'host all all 0.0.0.0/0 trust' >> ${PGDATA}/pg_hba.conf \
    && echo 'host replication postgres 0.0.0.0/0 trust' >> ${PGDATA}/pg_hba.conf \
    && echo 'wal_keep_segments=10' >> ${PGDATA}/postgresql.auto.conf \
    && echo 'wal_level=replica' >> ${PGDATA}/postgresql.auto.conf \
    && echo 'max_wal_senders=5' >> ${PGDATA}/postgresql.auto.conf \
    && echo 'archive_mode=on' >> ${PGDATA}/postgresql.auto.conf \
    && echo 'archive_command='\''test ! -f /archive/logs.complete/%f && cp %p /archive/logs.complete/%f'\' >> ${PGDATA}/postgresql.auto.conf \
    && ssh-keygen -N '' -f ~/.ssh/id_rsa < /dev/null \
    && cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys
# ~/.ssh/known_hosts in docker-entrypoint.sh

RUN pg_ctl -w start \
    && createdb test \
    && psql -X -At -d test -c " \
        /* generate some WALs for tests */ \
        create table payload(a,b,c,d,e) as select v,v,v,v,v from (select 'this is text for fill space ' || generate_series(1, 500000) v) _; \
        drop table payload; \
        select pg_xlogfile_name(pg_current_xlog_insert_location()); \
        " \
    && pg_ctl -w stop -m fast \
    && cp -ar ${PGDATA}/pg_xlog /tmp

RUN cd /tmp \
    && wget -O compress.mime https://raw.githubusercontent.com/file/file/master/magic/Magdir/compress \
    && file -m compress.mime -C \
    && cp compress.mime.mgc /etc/postgresql-common/compress.mime.mgc \
    && rm compress.mime.mgc

WORKDIR /app

ENTRYPOINT ["/app/tests/docker-entrypoint.sh"]
--------------------------------------------------------------------------------
/munin/vacuum_queue:
--------------------------------------------------------------------------------
#!/bin/bash

dbname_m1=
dbhost_m1=
dbport=
dbuser=

if [ "$1" = "config" ]; then
    cat <<- EOF
	graph_title vacuum queue size
	graph_args --base 1000
	graph_vlabel Count
	graph_category PostgreSQL
	graph_info vacuum queue size

	queue_size.label queue_size
	queue_size.info vacuum queue length
	queue_size.min 0
	queue_size.warning 100

	vacuum.label vacuum
	vacuum.info current VACUUM running
	vacuum.min 0

	analyze.label analyze
	analyze.info current ANALYZE running
	analyze.min 0

	vacuum_analyze.label vacuum_analyze
	vacuum_analyze.info current VACUUM ANALYZE running
	vacuum_analyze.min 0

	workers.label workers
	workers.info current workers running (analyze + vacuum + vacuum_analyze)
	workers.min 0

	workers_max.label workers_max
	workers_max.info max vacuum workers
	workers_max.min 0
	EOF
else
    psql -X -At -h "$dbhost_m1" -p "$dbport" -U "$dbuser" -d "$dbname_m1" --field-separator ' ' -f- <
--------------------------------------------------------------------------------
/pg_archive/archive_remote_cmd:
--------------------------------------------------------------------------------
"$tmp_dir"/"$fname_org".dup
md5_dup=`md5sum "$tmp_dir"/"$fname_org".dup | awk '{print $1}'`
if [[ $fname == ???????????????????????? && $remote_compress ]]; then
    # uncompress the file from the archive to compare its md5 value
    pbzip2 -d -p"$pmax" < "$dst_dir"/"$fname_org" > "$tmp_dir"/"$fname_org".org
    md5_orig=`md5sum "$tmp_dir"/"$fname_org".org | awk '{print $1}'`
    rm "$tmp_dir"/"$fname_org".org
else
    md5_orig=`md5sum "$dst_dir"/"$fname_org" | awk '{print $1}'`
fi
if [[ $md5_orig != $md5_dup ]]; then
    echo "ERROR: $dst_dir/$fname_org already exists with different md5"
    # save the compressed copy with the other md5 value for later analysis
    mv "$tmp_dir"/"$fname_org".dup "$tmp_dir"/"$fname_org".bad_md5
    touch --no-create --date "$ftime" "$tmp_dir"/"$fname_org".bad_md5 || true # ignore errors
    exit 1
else
    # if the md5 values are equal, exit without error
    echo "WARN: $dst_dir/$fname_org already exists with same md5"
    rm "$tmp_dir"/"$fname_org".dup
    exit 0
fi

if [ -f "$dst_dir"/"$fname".tmp ]; then
    echo "ERROR: $dst_dir/$fname.tmp already exists"
    exit 1
fi

# to avoid errors from restoring a partial file that is still being archived,
# create the file in the archive atomically with the help of a move
if [[ $fname == ???????????????????????? && $remote_compress ]]; then
    cat > "$tmp_dir"/"$fname".new
    size=$(stat -c '%s' "$tmp_dir"/"$fname".new)
    if [[ $size -ne 16777216 ]]; then
        mv "$tmp_dir"/"$fname".new "$tmp_dir"/"$fname".bad
        echo "ERROR: $tmp_dir/$fname.bad wrong size: $size"
        exit 1
    fi
    pbzip2 -1 -p"$pmax" < "$tmp_dir"/"$fname".new > "$dst_dir"/"$fname".tmp
    rm "$tmp_dir"/"$fname".new
else
    cat > "$dst_dir"/"$fname".tmp
fi

touch --no-create --date "$ftime" "$dst_dir"/"$fname".tmp || true # ignore errors
mv "$dst_dir"/"$fname".tmp "$dst_dir"/"$fname"

# if 'coreutils-sync' is installed, fsync the file and the folder
if [ -f "/usr/local/bin/sync" ]; then
    /usr/local/bin/sync "$dst_dir"/"$fname"
    /usr/local/bin/sync "$dst_dir"
fi

if [[ $send_status ]]; then
    echo -n > "$send_dir"/"$fname_org".new
fi

exit 0
--------------------------------------------------------------------------------
/pg_archive2/wal-sync:
--------------------------------------------------------------------------------
#!/bin/bash

# wal-sync
# synchronize WALs between two archives

WALS_DIR=$1
REMOTE_ARCHIVE=$2
LOCK_FILE_NAME=sync-archives.lock
LOG_FILE="/var/tmp/wal-sync_${PGHOST}.log"
RSYNC_LOG_OUT="/var/tmp/wal-sync_OUT_${PGHOST}" # rsync from this server to the remote
RSYNC_LOG_IN="/var/tmp/wal-sync_IN_${PGHOST}"   # rsync to this server from the remote

hostname=$(hostname)

CWD=$(pwd) # save working dir for remote commands

# options for rsync over ssh
ssh_timeout_options="-o ConnectTimeout=10 -o ServerAliveInterval=6 -o ServerAliveCountMax=5"
ssh_options="-o Compression=no -c aes128-gcm@openssh.com -o BatchMode=yes ${ssh_timeout_options}"
ssh_master_socket='/tmp/ssh_%h_%p_%r'
ssh_master_options='-o ControlMaster=auto -o ControlPersist=yes'
ssh_options="$ssh_options -o ControlPath=$ssh_master_socket $ssh_master_options"

function usage() {
    echo "$(basename "$0") syncs WALs between two archive hosts in both directions"
    echo
    echo "Usage:"
    echo "  $(basename "$0") /path/to/wals/dir REMOTE-ARCHIVE-HOST"
    echo
}

if [ -z "$WALS_DIR" -o -z "$REMOTE_ARCHIVE" ]; then
    usage
    exit 1;
fi

LOCK_FILE="$WALS_DIR/../$LOCK_FILE_NAME"

if [ -f "$LOCK_FILE" ]; then
    echo "lock file '$LOCK_FILE' from $(date -r "$LOCK_FILE") exists, abort"
    exit 1
fi

date +%s > "$LOCK_FILE"
exec 3>&1 > "$LOG_FILE" 2>&1
trap "cat \"$LOG_FILE\" >&3; rm -f \"$LOCK_FILE\";" EXIT

# always add a trailing slash to WALS_DIR to avoid catastrophic errors (rsync needs it)
WALS_DIR="$WALS_DIR"/

echo $(date +'%F %T') "start wal-sync: '$WALS_DIR' '$REMOTE_ARCHIVE'"

echo "##############################"
echo "OUT dry-run of rsync wals from '${hostname}' to '${REMOTE_ARCHIVE}' :"
date > "$RSYNC_LOG_OUT"
echo timeout 100 rsync -a -i --dry-run --ignore-existing -e \""ssh ${ssh_options}\"" --exclude '*.tmp' \""$CWD/$WALS_DIR\"" $REMOTE_ARCHIVE:\""$CWD/$WALS_DIR\""
timeout 100 rsync -a -i --dry-run --ignore-existing -e "ssh ${ssh_options}" --exclude '*.tmp' "$CWD/$WALS_DIR" $REMOTE_ARCHIVE:"$CWD/$WALS_DIR" >> "$RSYNC_LOG_OUT"
echo will be synced \~ $(($(grep -F '<f' "$RSYNC_LOG_OUT" | wc -l) )) WALs
echo '---'
head -3 "$RSYNC_LOG_OUT"
echo . . .
tail -2 "$RSYNC_LOG_OUT"
echo '---'

echo "##############################"
echo "IN dry-run of rsync wals from '${REMOTE_ARCHIVE}' to '${hostname}' :"
date > "$RSYNC_LOG_IN"
echo timeout 100 rsync -a -i --dry-run --ignore-existing -e \""ssh ${ssh_options}\"" --exclude '*.tmp' $REMOTE_ARCHIVE:\""$CWD/$WALS_DIR\"" \""$CWD/$WALS_DIR\""
timeout 100 rsync -a -i --dry-run --ignore-existing -e "ssh ${ssh_options}" --exclude '*.tmp' $REMOTE_ARCHIVE:"$CWD/$WALS_DIR" "$CWD/$WALS_DIR" >> "$RSYNC_LOG_IN"
echo will be synced \~ $(($(grep -F '>f' "$RSYNC_LOG_IN" | wc -l) )) WALs
echo '---'
head -3 "$RSYNC_LOG_IN"
echo . . .
tail -2 "$RSYNC_LOG_IN"
echo '---'

echo "Perform OUTGOING wal-sync ..."
timeout 7200 rsync -a --ignore-existing -e "ssh ${ssh_options}" --exclude '*.tmp' "$CWD/$WALS_DIR" $REMOTE_ARCHIVE:"$CWD/$WALS_DIR"
if [ "$?" -ne "0" ]; then
    echo "error from OUTGOING wal-sync, abort"
    exit 1
fi

echo "Perform INCOMING wal-sync ..."
timeout 7200 rsync -a --ignore-existing -e "ssh ${ssh_options}" --exclude '*.tmp' $REMOTE_ARCHIVE:"$CWD/$WALS_DIR" "$CWD/$WALS_DIR"
if [ "$?" -ne "0" ]; then
    echo "error from INCOMING wal-sync, abort"
    exit 1
fi

echo $(date +'%F %T') "done wal-sync: for '$WALS_DIR', host '$REMOTE_ARCHIVE'"

exit 0

--------------------------------------------------------------------------------
/pg_archive2/README.md:
--------------------------------------------------------------------------------
# General description

Archive is an inherent part of a resilient PostgreSQL infrastructure.

This component is responsible for:
- sending, receiving, storing, and rotating the WAL files
- executing backup tasks, maintaining the queue of backup tasks, and rotating backups (validation of backups is not part of Archive 2.0; it is a standalone solution running on different infrastructure)
- PITR, recovery from backup (indirectly)
- replication (at Avito this covers the majority of PostgreSQL clusters, apart from those using synchronous replication)

One of the special features/advantages of Archive 2.0 compared to other archive solutions is that it stores WAL files on two archive servers simultaneously.

The archive infrastructure keeps working if one of the two archive servers becomes unreachable/unavailable. Gaps in the WAL files are filled by the WAL-syncing procedure, which runs after each backup.

Backups are taken in turn on both archive servers with the help of pg_basebackup. This provides a window for PITR (point-in-time recovery); the size of the recovery interval is set in the cluster settings.

Log-shipping replication (without streaming) guarantees that the archive is always ahead of the standby, which excludes loss of the data needed for PITR (gaps in WAL).

# Components

- **two archive servers** store the WAL files and backups
- **archive_cmd_2** (the archive command set in postgresql.conf) transfers WAL and other files to either of the archive servers
- **archive_remote_cmd_2** program executed on the archive servers' side; it receives the WALs, compresses them and transfers them to the reserve archive server
- **restore_cmd_2** (recovery.conf) with its help the standby gets WAL from the archive servers
- **backup_queue** database that stores the backup schedule, backup settings and statuses for all backup tasks
- **base-backup_2** program scheduled with cron (e.g. every 10 minutes). Checks if there is a backup task in the queue and executes it. It consists of:
  - **base-backup_2** the backup itself
  - **wal-cleanup_2** removal of WALs that are no longer needed because the corresponding backups have been rotated away (there is no backup left that could use these WAL files)
  - **wal-sync** bidirectional synchronisation/merge of the archive servers
- **monitoring** at the very least there should be monitoring of the backup queue and alerts for failed backup tasks

# Examples

Example of the archive_command setting:

```sh
archive_command = '/usr/local/bin/archive_cmd_2 \'hostname_archive_1 hostname_archive_2\' /storage/archive_directory/ %p %f cluster_name'
```

Parameters description:
```sh
DST-HOSTNAMES - two archive host names in single quotes
DST-DIR - archive directory for WALs (ssh path)
SRC-WAL-FILENAME-WITH-PATH - %p (file name with path)
SRC-WAL-FILENAME - %f (file name)
CLUSTER-NAME - unique cluster name
```
Pay attention to DST-DIR - it is a path on the archive server.


*archive_cmd_2* transfers WALs and other files to both archive hosts in the following way:
```sh
[ master: archive_cmd ] -> [ archive1: archive_remote_cmd ] -> [ archive2: scp ]
```
*archive_cmd_2* transfers to the first DST-HOSTNAME, which then transfers to the second one with the help of archive_remote_cmd_2 on the remote host.

*archive_remote_cmd_2* writes the WAL locally, compresses it and tries to transfer the WAL to the SYNC-HOST.

If *archive_cmd_2* can't transfer the WAL to the first host (from the parameters), then after N retries it transfers the WAL only to the second host for N seconds (cooldown_time):
```sh
retry_count="6"      # see below
cooldown_time="600"  # do not try to send a file to $dst_host1 for '$cooldown_time' seconds after '$retry_count' failed attempts
```

/etc/postgresql-common/compress.mime.mgc is built from compress.mime, in which only the compression signatures are kept, in order to exclude false positives (on other file types).

# Tests

You need GNU `make`, `bash`, `docker` and `docker-compose`.
```sh
make NOTERM=''
```
Empty NOTERM gives pretty bats output.
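
For reference, a recovery.conf restore_command for restore_cmd_2 can mirror the argument order exercised by the tests (quoted host list, archive directory, then %f and %p); the host names and path here are illustrative:

```sh
restore_command = '/usr/local/bin/restore_cmd_2 \'hostname_archive_1 hostname_archive_2\' /storage/archive_directory/wals %f %p'
```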

--------------------------------------------------------------------------------
/pg_archive/archive_cmd:
--------------------------------------------------------------------------------
#!/bin/bash

set -e
pmax=4
remote_cmd=/usr/local/bin/archive_remote_cmd
ssh_options="-o Compression=no -c aes128-gcm@openssh.com -o BatchMode=yes"
ssh_master_socket='/tmp/ssh_%h_%p_%r'
ssh_master_options='-o ControlMaster=auto -o ControlPersist=yes'
use_ssh_persist=
remote_compress=
send_status=
pwals=5
ready_wals_for_parallel=10 # there should be at least twice as many ready WALs as pwals

unset LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES LC_TIME LC_NUMERIC
export LC_ALL=C

if [ -f /etc/default/archive_cmd ]; then
    . /etc/default/archive_cmd
fi

if [ -n "$use_ssh_persist" ]; then
    ssh_options="$ssh_options -S $ssh_master_socket $ssh_master_options"
fi

dst_host="$1"
dst_dir="$2"
log_dir="$dst_dir"/../
src_file="$3"
fname="$4"
src_dir=${src_file%/$fname}
arch_status_dir="$src_dir"/archive_status

if [ -z "$dst_host" -o -z "$dst_dir" -o -z "$src_file" -o -z "$fname" ]; then
    echo -e "usage: archive_cmd DST-HOSTNAME DST-DIR SRC-WAL-FILENAME-WITH-PATH SRC-WAL-FILENAME\n" \
            "\n" \
            "DST-HOSTNAME - for scp\n" \
            "DST-DIR - archive directory for WALs\n" \
            "SRC-WAL-FILENAME-WITH-PATH - %p (file name with path)\n" \
            "SRC-WAL-FILENAME - %f (file name)\n"
    exit 1
fi

set -o pipefail

# checks whether the file name is a WAL file name
is_wal() {
    local fname="$1"
    if [[ "$fname" == ???????????????????????? ]] && [[ "$fname" != *.* ]]; then
        return 0 # zero is true
    else
        return 1 # non-zero is false
    fi
}

# transfers the WAL to the archive host using ssh
send_wal() {
    local fname_with_path="$1"
    local file_name="$2"
    ftime=$(stat -c %y "$fname_with_path")
    # compress only WAL files, skip compressing of backup label and history files
    if [[ "$file_name" == ???????????????????????? && -z "$remote_compress" ]]; then
        pbzip2 -1 -p"$pmax" < "$fname_with_path"
    else
        cat "$fname_with_path"
    fi \
        | ssh $ssh_options "$dst_host" "$remote_cmd" "$dst_dir" "$file_name" "'$ftime'" "'$remote_compress'" "$pmax" "'$send_status'"
    /usr/local/bin/vmtouch -q -e "$fname_with_path"
}

# find out which file was archived last
if [[ -r ${log_dir}/LASTLOG ]]; then
    prev_archived=$(< ${log_dir}/LASTLOG)
else
    prev_archived=""
fi
# files like '000000010004EF04000000E1.00010DF0.backup' and '00000015.history' are always archived
# compare only WAL files
# if the last archived file is not a WAL, archive in a single thread (so that the comparison with '<' stays correct)
if is_wal "$fname" && is_wal "$prev_archived"; then
    if [[ "$fname" < "$prev_archived" ]] || [[ "$fname" = "$prev_archived" ]]; then
        echo "File '$fname' was already sent to archive. Skipping..."
        exit 0
    fi
fi

# is multi-thread archiving needed?
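# count the WAL segments that PostgreSQL has already flagged as .ready in
# pg_xlog/archive_status; if enough of them have queued up, up to $pwals of
# them are sent in parallel below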
ready_count=$(find ${arch_status_dir}/ -maxdepth 1 -type f -name "????????????????????????.ready" | wc -l)

# single thread archive if:
# - ready WAL files cnt is less than threshold "ready_wals_for_parallel"
# - file is not WAL (.backup, .history)
# - previous archived file is not WAL file
if [[ $ready_count -le $ready_wals_for_parallel ]] || ! is_wal "$fname" || ! is_wal "$prev_archived"; then
    send_wal "$src_file" "$fname"
    wal=$fname
else
    # run multi-threaded archiving

    # take pwals files
    ready_wals=$(find ${arch_status_dir}/ -maxdepth 1 -type f -name "????????????????????????.ready" -printf '%f\n'\
        | sort | grep -A "$(( pwals - 1 ))" -F ${fname})

    # multi-threaded file transfer
    declare -A send_pids
    for wal_ready in $ready_wals ; do
        wal=${wal_ready%.ready}
        send_wal "$src_dir"/"$wal" "$wal" & send_pids[$wal]=$!
    done

    # for each thread check the exit code
    for wal_pid in ${!send_pids[@]}; do
        exit_code=0
        wait ${send_pids[$wal_pid]} || exit_code=$?
        if [[ $exit_code -ne 0 ]] ; then
            echo "ERROR: can't send '$wal_pid' to archive. Exit code: '$exit_code'"
            exit 1
        fi
    done
fi

echo "$wal" > "$log_dir"/LASTLOG

exit 0
--------------------------------------------------------------------------------
/pg_archive2/tests/0003_test-restore_cmd.bats:
--------------------------------------------------------------------------------
#!/usr/bin/env bats
# -*- mode: sh; -*-

load vars
load functions

_init ()
{
    local h

    pbzip2 < /tmp/pg_xlog/000000010000000000000001 > /tmp/000000010000000000000001

    for h in test-archive02 test-archive03; do
        scp /tmp/000000010000000000000001 "$h":/archive/wals/000000010000000000000001
    done

    stat -c %s /tmp/000000010000000000000001
    rm /tmp/000000010000000000000001
}

_cleanup ()
{
    ssh test-archive02 rm -f /archive/wals/000000010000000000000001
    ssh test-archive03 rm -f /archive/wals/000000010000000000000001
    rm -f /lib/init/rw/pg_recv_sb/ssh-errors_*
    rm -f /tmp/000000010000000000000001
}

setup ()
{
    case "$BATS_TEST_DESCRIPTION" in
        "restore_cmd")
            _init
            ;;
        "restore_cmd, test-archive02 not exist")
            _init
            ssh test-archive02 rm -f /archive/wals/000000010000000000000001
            ;;
        "restore_cmd, test-archive02 corrupt")
            _init
            ssh test-archive02 dd conv=nocreat,notrunc bs=1 seek=1024 count=42 status=none \
                if=/dev/urandom of=/archive/wals/000000010000000000000001
            ;;
        "restore_cmd, test-archive02 unreachable, test-archive03 corrupt")
            _init
            ssh test-archive03 dd conv=nocreat,notrunc bs=1 seek=10240 count=42 status=none \
                if=/dev/urandom of=/archive/wals/000000010000000000000001
            ;;
        "restore_cmd, both corrupt")
            local size=$(_init)

            ssh test-archive02 truncate -s-42 /archive/wals/000000010000000000000001
            ssh test-archive03 dd conv=nocreat,notrunc bs=1 seek=$((size - 4242)) count=42 status=none \
                if=/dev/urandom of=/archive/wals/000000010000000000000001
            ;;
    esac
}

teardown ()
{
    case "$BATS_TEST_DESCRIPTION" in
        "restore_cmd") ;&
        "restore_cmd, test-archive02 not exist") ;&
        "restore_cmd, test-archive02 corrupt") ;&
        "restore_cmd, test-archive02 unreachable, test-archive03 corrupt") ;&
        "restore_cmd, both corrupt")
            _cleanup
            ;;
    esac
}


@test "restore_cmd" {
    run /app/restore_cmd_2 'test-archive02 test-archive03' /archive/wals 000000010000000000000001 /tmp/000000010000000000000001

    echo "$output" >&2

    [[ $status -eq 0 ]]
    [[ ! $output ]]
    test -f /tmp/000000010000000000000001
}

@test "restore_cmd, test-archive02 not exist" {
    run /app/restore_cmd_2 'test-archive02 test-archive03' /archive/wals 000000010000000000000001 /tmp/000000010000000000000001

    echo "$output" >&2
    echo "*** DBG ${lines[-1]}" >&2

    [[ $status -eq 0 ]]
    test -f /tmp/000000010000000000000001
    [[ ${lines[-1]} = "WARNING: can't find wal '000000010000000000000001' at host 'test-archive02'" ]]
}

@test "restore_cmd, test-archive02 corrupt" {
    run /app/restore_cmd_2 'test-archive02 test-archive03' /archive/wals 000000010000000000000001 /tmp/000000010000000000000001

    echo "$output" >&2
    echo "*** DBG ${lines[-1]}" >&2

    [[ $status -eq 0 ]]
    test -f /tmp/000000010000000000000001
    [[ ${lines[-1]} = "WARNING: decompress error from host test-archive02, trying other" ]]
}

@test "restore_cmd, test-archive02 unreachable, test-archive03 corrupt" {
    run /app/restore_cmd_2 'test-UNREACHABLE-archive02 test-archive03' /archive/wals 000000010000000000000001 /tmp/000000010000000000000001

    echo "$output" >&2
    echo "*** DBG ${lines[-5]}" >&2
    echo "*** DBG ${lines[-1]}" >&2

    [[ $status -ne 0 ]]
    test ! -f /tmp/000000010000000000000001
    [[ ${lines[-5]} = 'WARNING: decompress error from host test-archive03, trying other' ]]
    [[ ${lines[-1]} = "ERROR: can't fetch wal from all hosts: test-UNREACHABLE-archive02 test-archive03" ]]
}

@test "restore_cmd, both corrupt" {
    run /app/restore_cmd_2 'test-archive02 test-archive03' /archive/wals 000000010000000000000001 /tmp/000000010000000000000001

    echo "$output" >&2
    echo "*** DBG ${lines[-4]}" >&2
    echo "*** DBG ${lines[-1]}" >&2

    [[ $status -ne 0 ]]
    test ! -f /tmp/000000010000000000000001
    [[ ${lines[-4]} = 'WARNING: decompress error from host test-archive02, trying other' ]]
    [[ ${lines[-1]} = 'ERROR: cannot unpack wal from all hosts' ]]
}
--------------------------------------------------------------------------------
/tools/optimize_table.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# -*- indent-tabs-mode: nil -*-
#

use strict;
use warnings;
use DBI;
#use Data::Dump qw(dump);

$| = 1; # autoflush

my $DBPORT = 5432;
my $DBUSER = 'postgres';
my $BULKVACUUM = 4000; # pages
#
my $PAGESTUPLES = 8192 / 26; # 26 bytes of system attributes are always present, even in an empty table with no columns
my $BULKPAGES = 3;

sub usage();
sub vacuum();
sub wait_txid($);
sub xmin();
sub get_txid();
sub pages_count();
sub max_page();
sub die_with_vacuum($);
sub gen_ctids($);
sub move($$);

usage if $#ARGV != 3;

my $DBHOST = $ARGV[0];
my $DBNAME = $ARGV[1];
my $DBTABLE = $ARGV[2];
my $XFIELD = $ARGV[3];

#DBI->trace('SQL'); # DBD

my $dbh = DBI->connect("dbi:Pg:host='$DBHOST';port=$DBPORT;dbname=$DBNAME;", $DBUSER, '',
                       {AutoCommit => 1, RaiseError => 1}); # , pg_server_prepare => 0

# mark dead tuples as free space
vacuum();

my $p_cnt = pages_count();
my $file_pages = pages_count();
my $total = 0;
my $i_total = 0;
my $i_BULKVACUUM = $BULKVACUUM / $BULKPAGES;
my $txid = 0;
while (1)
{
    print "page $p_cnt of $file_pages ($total) - ";

    $dbh->begin_work();
    # disable all triggers on the table being compacted
    $dbh->do("set local session_replication_role to 'replica'");
    $txid = get_txid();
    my $r = move($p_cnt, gen_ctids($p_cnt));
    print "\n";
    if ($r < 0) {
        # the move failed; "forget and put back" the rows that may have landed on already cleaned pages
        $dbh->rollback();
    } else {
        $dbh->commit();
    }

    $total += $BULKPAGES;
    $i_total++;

    # print "no rows\n" if $r == 0;
    die_with_vacuum("no space left in table $DBTABLE, exit") if $r < 0; # plus truncate the free tail of the table file if it is empty

    $p_cnt = $p_cnt - $BULKPAGES;

    #
    # This periodic VACUUM must not be removed from here and done just once at the end!
    # Otherwise truncating the file would hold a lock for a very long time: PostgreSQL scans
    # the part of the file being cut off, from the end down to the highest non-empty page found,
    # while holding an exclusive lock on the table being compacted.
    #
    if ($i_total % $i_BULKVACUUM == 0)
    {
        wait_txid($txid);
        vacuum();
    }
}

$dbh->disconnect;

sub usage()
{
    # xfield - any plain column, not TOAST and not varlen
    print "usage: dbhost dbname table xfield\n";
    exit 1;
}

sub wait_txid($)
{
    my ($txid) = @_;
    print "waiting for transactions with xmin < $txid\n";
    while(xmin() < $txid)
    {
        print ".";
        sleep 1;
    }
    print "\n"
}

sub vacuum()
{
    print "vacuum verbose $DBTABLE\n";
    $dbh->do("set vacuum_cost_delay to 0");
    $dbh->do("vacuum verbose $DBTABLE");
}

sub xmin()
{
    my $row = $dbh->selectrow_hashref(qq{
        select txid_snapshot_xmin(txid_current_snapshot()) as xmin
    });
    return $row->{'xmin'};
}

sub analyze()
{
    print "analyze verbose $DBTABLE\n";
    $dbh->do("set vacuum_cost_delay to 0");
    $dbh->do("analyze verbose $DBTABLE");
}

sub move($$)
{
    my ($page, $ptrs) = @_;
    my $min_page = $page - $BULKPAGES + 1;

    my $sth = $dbh->prepare(qq{
        update $DBTABLE set $XFIELD = $XFIELD
        where ctid = any (?)
        returning ctid as ptr
    });

    while ($#{$ptrs} >= 0)
    {
        my $rows = $dbh->selectcol_arrayref($sth, undef, $ptrs);
        # dump($rows);
        print $sth->rows,'.';
        return 0 if $#{$rows} < 0; # -1 means the query updated no rows; are all pages being updated already empty?

        # drop from the list all tids whose page is below the source page
        # if the tid array is empty - success
        # if at least one tid has a page above the source page - failure, no free slots left in the file
        $ptrs = [];
        foreach my $tid (@$rows)
        {
            my $p = $1 if $tid =~ /^\((\d+),\d+\)$/;
            next if $p < $min_page;
            return -1 if $p > $page;
            # the page is still the same, try to update this tid again
            push @$ptrs, "'$tid'";
        }
    }

    return 1;
}

sub get_txid()
{
    my $row = $dbh->selectrow_hashref(qq{
        select txid_current() as txid
    });
    return $row->{'txid'};
}

sub pages_count()
{
    my $rows = $dbh->selectrow_hashref(qq{
        select relpages as pages
        from pg_class
        where oid = '${DBTABLE}'::regclass
    });
    return $rows->{'pages'};
}

sub max_page()
{
    my $row = $dbh->selectrow_hashref(qq{
        select max(ctid) as ptr
        from $DBTABLE
    });
    my $page = 0;
    if ($row->{'ptr'} and $row->{'ptr'} =~ /^\((\d+),\d+\)$/) {
        $page = $1;
    }
    return $page;
}

sub die_with_vacuum($)
{
    vacuum();
    analyze();
    die @_;
}

sub gen_ctids($)
{
    my ($p_cnt) = @_;
    if ($p_cnt < 4) {
        $dbh->commit(); # gen_ctids runs inside a transaction
        die_with_vacuum("cannot vacuum fewer than 4 pages");
    }
    my @ptrs = map {
        my $a = $_;
        map { "'($a,$_)'" } 1 .. $PAGESTUPLES;
    } $p_cnt-$BULKPAGES+1 .. $p_cnt;
    return \@ptrs;
}
--------------------------------------------------------------------------------
/pg_archive2/archive_remote_cmd_2:
--------------------------------------------------------------------------------
#!/bin/bash

# archive_remote_cmd is executed on the archive servers.
# Saves WALs and associated files on the host where it runs and on the SYNC-HOST (if it's set).
# On the SYNC-HOST it executes itself, without REMOTE-COMPRESS, as the WALs have already been compressed.

set -e
set -o pipefail

error_handler() {
    scriptname=$(basename $0)
    hostname=$(hostname)
    echo "^^^ ERROR at [host: '${hostname}', file: '${scriptname}', line: '${BASH_LINENO[0]}']"
}
trap error_handler ERR

remote_cmd=/usr/local/bin/archive_remote_cmd_2 # itself

dst_dir="$1"
fname_org="$2"
ftime="$3"
skip_compress="$4"
pmax="$5"
cluster_name="$6"
sync_host="$7" # second wal archive server hostname
fsize="$8"     # source file size
fname=$fname_org
tmp_dir="/lib/init/rw/pg_recv_sb/$cluster_name" # tmpfs for wals

# options for sending the WAL to the second archive host
cmd_timeout="6" # timeout parameter for ssh, cat
ssh_timeout_options="-o ConnectTimeout=10 -o ServerAliveInterval=6 -o ServerAliveCountMax=5"
ssh_options="-o Compression=no -c aes128-gcm@openssh.com -o BatchMode=yes ${ssh_timeout_options}"
ssh_master_socket='/tmp/ssh_%h_%p_%r'
ssh_master_options='-o ControlMaster=auto -o ControlPersist=yes'
ssh_options="$ssh_options -o ControlPath=$ssh_master_socket $ssh_master_options"
cooldown_time="600" # don't try to send files to $sync_host for $cooldown_time seconds after an error

recv_checked() {
    local timeout=$1
    local fname=$2
    local fsize=$3
    local res=0
    local received

    timeout "${timeout}" cat > "$fname" || res=$?
    received=$(stat -c '%s' "$fname")

    # zero file size not supported
    if (( res || ! received || received != fsize )); then
        mv "$fname" "$fname".bad
        echo "ERROR: $fname.bad size $received, expected $fsize bytes, cat exit $res"
        exit 1
    fi
}

if [ -z "$dst_dir" -o -z "$fname" -o -z "$cluster_name" -o -z "$fsize" ]; then
    cat <<'EOF'
usage: archive_remote_cmd DST-DIR DST-WAL-FILENAME SRC-FILE-TIMESTAMP SKIP-COMPRESS PMAX CLUSTER-NAME SYNC-HOST SIZE
       DST-DIR - archive directory for WALs
       DST-WAL-FILENAME - %f (file name)
       SRC-FILE-TIMESTAMP - date and time of src WAL file in 'touch --date' format
       SKIP-COMPRESS - received file is compressed, skip compress it
       PMAX - if SKIP-COMPRESS empty - use PMAX compress threads
       CLUSTER-NAME - unique cluster name
       SYNC-HOST - host to mirror WAL (upload via ssh), may be empty
       SIZE - source file size for check receive
EOF
    exit 1
fi

if [ -f /etc/default/archive_remote_cmd ]; then
    . /etc/default/archive_remote_cmd
fi

if ! [ -d "$tmp_dir" ]; then
    mkdir -p "$tmp_dir"
fi

# don't overwrite previously saved files with wrong md5
if [ -f "$tmp_dir"/"$fname_org".bad_md5 ]; then
    echo "ERROR: $dst_dir/$fname_org already exist with different md5"
    exit 1
fi

# if the same WAL file is already in the archive, check md5
if [ -f "$dst_dir"/"$fname_org" ] && [ "$fname_org" != "LASTLOG" ]; then
    recv_checked "${cmd_timeout}" "$tmp_dir"/"$fname_org".dup "$fsize"
    md5_dup=`md5sum "$tmp_dir"/"$fname_org".dup | awk '{print $1}'`
    if [[ $fname == ???????????????????????? && ! $skip_compress ]]; then
        # uncompress the file from the archive to compare its md5 value
        pbzip2 -d -p"$pmax" < "$dst_dir"/"$fname_org" > "$tmp_dir"/"$fname_org".org
        md5_orig=`md5sum "$tmp_dir"/"$fname_org".org | awk '{print $1}'`
        rm "$tmp_dir"/"$fname_org".org
    else
        md5_orig=`md5sum "$dst_dir"/"$fname_org" | awk '{print $1}'`
    fi
    if [[ $md5_orig != $md5_dup ]]; then
        echo "ERROR: $dst_dir/$fname_org already exist with different md5"
        # save the compressed copy with the other md5 value for later analysis
        mv "$tmp_dir"/"$fname_org".dup "$tmp_dir"/"$fname_org".bad_md5
        touch --no-create --date "$ftime" "$tmp_dir"/"$fname_org".bad_md5 || true # ignore errors
        exit 1
    else
        # if the md5 values match, exit without error
        echo "WARN: $dst_dir/$fname_org already exist with same md5"
        rm "$tmp_dir"/"$fname_org".dup
        exit 0
    fi
fi

if [ -f "$dst_dir"/"$fname".tmp ]; then
    echo "ERROR: $dst_dir/$fname.tmp already exist"
    exit 1
fi

# to avoid errors from restoring a partial file that is still being archived,
# create the file in the archive atomically with the help of a move
if [[ $fname == ???????????????????????? && ! $skip_compress ]]; then
    recv_checked "${cmd_timeout}" "$tmp_dir"/"$fname".new "$fsize"
    pbzip2 -1 -p"$pmax" < "$tmp_dir"/"$fname".new > "$dst_dir"/"$fname".tmp
    rm "$tmp_dir"/"$fname".new
else
    recv_checked "${cmd_timeout}" "$dst_dir"/"$fname".tmp "$fsize"
fi

touch --no-create --date "$ftime" "$dst_dir"/"$fname".tmp || true # ignore errors

mv "$dst_dir"/"$fname".tmp "$dst_dir"/"$fname"

# if 'coreutils-sync' is installed, fsync the file and the folder
if [ -f "/usr/local/bin/sync" ]; then
    /usr/local/bin/sync "$dst_dir"/"$fname"
    /usr/local/bin/sync "$dst_dir"
else
    sync "$dst_dir"/"$fname"
    sync "$dst_dir"
fi

# check whether it is time to resume retrying writes to the second archive ($cooldown_time has passed)
if [ -f "${tmp_dir}"/"sync_wal_error_flag" ]; then
    now_time=$(date +%s)
    last_err_time=$(stat -c%Y "${tmp_dir}"/"sync_wal_error_flag")

    if [[ $(( $now_time - $last_err_time)) -le $cooldown_time ]]; then
        sync_host=
    fi
fi
# try to transfer the file to the second archive
if ! [ -z "$sync_host" ]; then
    size_compressed=$(stat -c '%s' "$dst_dir"/"$fname")
    set +e
    exit_code=0
    timeout ${cmd_timeout} cat "$dst_dir"/"$fname" \
        | timeout ${cmd_timeout} ssh $ssh_options "$sync_host" "$remote_cmd" \
            "$dst_dir" "$fname" "'$ftime'" \
            "'skip'" "$pmax" "$cluster_name" \
            "''" "'$size_compressed'"
    exit_code=$?
    set -e
    if [[ $exit_code -ne 0 ]] ; then
        echo "ERROR: can't sync file '$fname' to host '$sync_host'"
        touch "${tmp_dir}"/"sync_wal_error_flag"
    fi
fi

exit 0

--------------------------------------------------------------------------------
/pg_archive2/wal-cleanup_2:
--------------------------------------------------------------------------------
#!/bin/bash

: ${PGVER:=9.2}

LOCK_FILE_NAME=wal-cleanup.lock
LOG_FILE="/var/tmp/wal-cleanup_${PGHOST}.log"
SRC_DIR=$1
WALS_DIR=$2
REMOTE_ARCHIVE=$3
PGTOOLS=/usr/lib/postgresql/$PGVER/bin
PG_ARCHIVECLEANUP_LOG_LOCAL="/var/tmp/wal-cleanup-pg_archivecleanup_LOCAL_${PGHOST}.log"
PG_ARCHIVECLEANUP_LOG_REMOTE="/var/tmp/wal-cleanup-pg_archivecleanup_REMOTE_${PGHOST}.log"

# options for verifying the last needed WAL on the mirror archive host, and for remote cleanup
cmd_timeout="1200" # timeout (ssh)
cmd_cleanup_timeout=$((4 * 60 * 60)) # timeout (ssh) when pg_archivecleanup is running for cleanup
retry_cnt=1
ssh_timeout_options="-o ConnectTimeout=10 -o ServerAliveInterval=6 -o ServerAliveCountMax=5"
ssh_options="-o Compression=no -c aes128-gcm@openssh.com -o BatchMode=yes ${ssh_timeout_options}"
ssh_master_socket='/tmp/ssh_%h_%p_%r'
ssh_master_options='-o ControlMaster=auto -o ControlPersist=yes'
ssh_options="$ssh_options -o ControlPath=$ssh_master_socket $ssh_master_options"

CWD=$(pwd) # save working dir for remote commands

unset LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES LC_TIME LC_NUMERIC
export LC_ALL=C

function usage() {
    echo "$0 is script for remove old WAL files"
    echo "usage: wal-cleanup /path/to/walshipping/src_backup /path/to/walshipping/logs REMOTE-ARCHIVE-HOST"
}

function say() {
    echo $(date +'%F %T') "$@"
}

if [ -z "$SRC_DIR" -o -z "$WALS_DIR" -o -z "$REMOTE_ARCHIVE" ]; then
    usage
    exit 1
fi

LOCK_FILE=$(readlink -f "$WALS_DIR/../$LOCK_FILE_NAME")

set -e # check redirect

main () {
    set +e # redirect checked

    flock -n 9 || { echo "lock file '$LOCK_FILE' from $(< "$LOCK_FILE") locked, abort"; exit 1; }
    { echo -n "$$ "; date +'%F %T'; } > "$LOCK_FILE"

    exec 3>&1 > "$LOG_FILE" 2>&1
    trap "cat \"$LOG_FILE\" >&3" EXIT

    say "start wal-cleanup: '$SRC_DIR' '$WALS_DIR'"

    if [ ! -r "$SRC_DIR"/backup_label ]; then
        echo "cannot read LOCAL $SRC_DIR/backup_label, abort"
        exit 1
    fi

    LAST_NEEDED_WAL_LOCAL=$(grep 'START WAL LOCATION.\+(file .\+)' "$SRC_DIR"/backup_label 2> /dev/null | sed -e 's/.\+(file \(.\+\))/\1/')
    echo "LOCAL backup label: '$SRC_DIR/backup_label', last LOCAL needed wal: '$WALS_DIR/$LAST_NEEDED_WAL_LOCAL'"

    if [ -z "$LAST_NEEDED_WAL_LOCAL" ]; then
        echo "cannot find last needed LOCAL wal, abort"
        exit 1
    fi

    if [ ! -f "$WALS_DIR"/"$LAST_NEEDED_WAL_LOCAL" ]; then
        echo "something wrong, last needed LOCAL wal not exists, abort"
        exit 1
    fi

    # check that backup_label exists on the remote server and that there is no ssh error
    no_remote_backup_label=$(timeout ${cmd_timeout} ssh $ssh_options $REMOTE_ARCHIVE "test -f \"$CWD/$SRC_DIR\"/backup_label || echo 'no backup label'")
    if [ "$no_remote_backup_label" = "no backup label" ]; then
        # if there is no remote backup_label, LAST_NEEDED_WAL_OLDEST will be taken from the local backup_label
        echo "WARNING: backup_label at REMOTE host '$REMOTE_ARCHIVE' does not exist. No backups?"
        LAST_NEEDED_WAL_REMOTE="GGGGGGGGGGGGGGGGGGGGGGGG"
    else
        # read backup_label over ssh
        LAST_NEEDED_WAL_REMOTE=$(timeout ${cmd_timeout} ssh $ssh_options $REMOTE_ARCHIVE "grep 'START WAL LOCATION.\+(file .\+)' \"$CWD/$SRC_DIR\"/backup_label | sed -e 's/.\+(file \(.\+\))/\1/'")
    fi

    if [[ "$LAST_NEEDED_WAL_REMOTE" != ???????????????????????? ]]; then
        echo "something wrong, can't read last needed REMOTE wal"
        exit 1
    else
        echo "REMOTE backup label: '$SRC_DIR/backup_label', last REMOTE needed wal: '$WALS_DIR/$LAST_NEEDED_WAL_REMOTE'"
    fi

    # choose the oldest WAL
    if [[ "$LAST_NEEDED_WAL_LOCAL" < "$LAST_NEEDED_WAL_REMOTE" ]]; then
        LAST_NEEDED_WAL_OLDEST=$LAST_NEEDED_WAL_LOCAL
    else
        LAST_NEEDED_WAL_OLDEST=$LAST_NEEDED_WAL_REMOTE
    fi

    echo "oldest needed wal is: '$LAST_NEEDED_WAL_OLDEST'"

    echo "##############################"
    echo "LOCAL dry-run of pg_archivecleanup"
    echo "$PGTOOLS"/pg_archivecleanup -n "$WALS_DIR" "$LAST_NEEDED_WAL_OLDEST"
    date > "$PG_ARCHIVECLEANUP_LOG_LOCAL"
    "$PGTOOLS"/pg_archivecleanup -n "$WALS_DIR" "$LAST_NEEDED_WAL_OLDEST" | sort >> "$PG_ARCHIVECLEANUP_LOG_LOCAL"
    echo will be cleaned \~ $(($(wc -l "$PG_ARCHIVECLEANUP_LOG_LOCAL" | cut -d' ' -f1) - 1)) WALs
    echo '---'
    head -3 "$PG_ARCHIVECLEANUP_LOG_LOCAL"
    echo . . .
    tail -2 "$PG_ARCHIVECLEANUP_LOG_LOCAL"
    echo '---'

    echo "##############################"
    echo "REMOTE dry-run of pg_archivecleanup"
    date > "$PG_ARCHIVECLEANUP_LOG_REMOTE"
    echo timeout ${cmd_timeout} ssh $ssh_options $REMOTE_ARCHIVE "\"$PGTOOLS\"/pg_archivecleanup -n \"$CWD/$WALS_DIR\" \"$LAST_NEEDED_WAL_OLDEST\""
    timeout ${cmd_timeout} ssh $ssh_options $REMOTE_ARCHIVE "\"$PGTOOLS\"/pg_archivecleanup -n \"$CWD/$WALS_DIR\" \"$LAST_NEEDED_WAL_OLDEST\"" | sort >> "$PG_ARCHIVECLEANUP_LOG_REMOTE"
    echo will be cleaned \~ $(($(wc -l "$PG_ARCHIVECLEANUP_LOG_REMOTE" | cut -d' ' -f1) - 1)) WALs
    echo '---'
    head -3 "$PG_ARCHIVECLEANUP_LOG_REMOTE"
    echo . . .
    tail -2 "$PG_ARCHIVECLEANUP_LOG_REMOTE"
    echo '---'

    # pg_archivecleanup does not return an error code when a WAL delete operation fails, it always
    # returns 0, which is why the result of pg_archivecleanup cannot be checked
    say "Perform LOCAL pg_archivecleanup ..."
    if ! "$PGTOOLS"/pg_archivecleanup "$WALS_DIR" "$LAST_NEEDED_WAL_OLDEST"; then
        echo "error from LOCAL pg_archivecleanup, abort"
        exit 1
    fi
    for ((i = 0; i <= retry_cnt; i++)); do
        say "Perform REMOTE pg_archivecleanup ..."
        if ! timeout ${cmd_cleanup_timeout} \
            ssh $ssh_options $REMOTE_ARCHIVE \
                flock -nE 125 "$LOCK_FILE" "'$PGTOOLS'/pg_archivecleanup '$CWD/$WALS_DIR' '$LAST_NEEDED_WAL_OLDEST'"
        then
            res=${PIPESTATUS[0]}
            if (( res == 124 )); then
                if ((i != retry_cnt)); then
                    msg="retrying ($((i+1)) of $retry_cnt)"
                else
                    msg="abort"
                fi
                echo "REMOTE pg_archivecleanup timeout, $msg"
                continue
            elif (( res == 125 )); then
                echo "REMOTE pg_archivecleanup still running, abort"
                exit 1
            else
                echo "error with REMOTE pg_archivecleanup, abort"
                exit 1
            fi
        else
            break
        fi
    done

    say "done wal-cleanup: '$SRC_DIR' '$WALS_DIR'"

} 9>> "$LOCK_FILE"

main

exit 0

--------------------------------------------------------------------------------
/pg_archive2/base-backup_2:
--------------------------------------------------------------------------------
#!/bin/bash
# base-backup.sh takes a backup task from the queue in the database
# and performs the backup, rotates old backups, cleans unneeded WALs and rsyncs WALs
# between the two archivers

set -e

# database with backup queue
BACKUP_DB_HOST="backup_host"
BACKUP_DB_NAME="backup_db"
BACKUP_DB_PORT=5432
BACKUP_DB_USER="postgres"

CMD_TIMEOUT="600" # timeout (psql)
MY_HOSTNAME=$(hostname) # archive server name
LOCK_FILE="/var/tmp/base-backup.lock"
BACKUPS_DIR=$1
KEEP_BACKUPS=$2
REMOTE_ARCHIVE=$3
AVITOOLS="/usr/local/bin/"
BACKUP_PREFIX=data.master
WALS_DIR=logs.complete
BAREOS_DIR=bareos.dump
SUCCESS_FILE_NAME=SUCCESS
LANG=C
USE_TAR=1
TRY_COUNT=3

function say() {
    echo $(date +'%F %T') "$@"
}

# flock the whole function
function main() {
    flock -n 9 || exit 1
    { echo -n "$$ "; date +'%F %T'; } > "$LOCK_FILE"

    # get a backup from the queue and mark the backup as started
    backup_settings=$(timeout ${CMD_TIMEOUT} psql -1 -v ON_ERROR_STOP=1 -v MY_HOSTNAME="$MY_HOSTNAME" \
        -h ${BACKUP_DB_HOST} -d ${BACKUP_DB_NAME} -p ${BACKUP_DB_PORT} -U ${BACKUP_DB_USER} \
        -Atq -f - <<'EOF'
\pset null 'NULL'
select o_backip_id, o_pghost, o_pgport, o_remote_archiver,
       o_keep_backups_cnt, o_backups_dir, o_bareos_on from backups.get_next(:'MY_HOSTNAME');
EOF
    )
    if [ -z "$backup_settings" ]; then
        # no tasks found, exit
        exit 0
    fi

    # parse backup settings and validate
    SAVE_IFS="$IFS"
    IFS='|'
    read -r backup_id PGHOST PGPORT REMOTE_ARCHIVE KEEP_BACKUPS BACKUPS_DIR BAREOS_ON <<< "${backup_settings}"
    IFS="$SAVE_IFS"
    for i in "$backup_id" "$PGHOST" "$PGPORT" "$REMOTE_ARCHIVE" "$KEEP_BACKUPS" "$BACKUPS_DIR" "$BAREOS_ON"; do
        if [ "$i" = "NULL" -o -z "$i" ]; then
            say "ERROR: bad data returned from backups.get_next():"
            say "null or empty cell at tuple: '${backup_settings}'"
            exit 1
        fi
    done

    export PGHOST=$PGHOST PGPORT=$PGPORT
    PGVER=$(timeout ${CMD_TIMEOUT} psql -XAt -c "select split_part(setting, '.', 1) || '.' || \
        split_part(setting, '.', 2) from pg_settings where name = 'server_version'")

    export PGVER=$PGVER
    export PGSSLMODE=disable

    PGTOOLS=/usr/lib/postgresql/$PGVER/bin
    LOG_FILE="/var/tmp/base-backup_${PGHOST}.log"

    say "Chosen backup: '${PGHOST}', backup_id: '${backup_id}'"
    say "$backup_settings"

    if [ -z "$PGHOST" ]; then
        say "\$PGHOST environment variable is not set, abort"
        exit 1
    fi
    if [ "$KEEP_BACKUPS" -le "0" ]; then
        say "\$KEEP_BACKUPS must be > 0"
        exit 1
    fi

    SUCCESS_FILE="$BACKUPS_DIR/$SUCCESS_FILE_NAME"

    exec 3>&1 > "$LOG_FILE" 2>&1
    trap "cat \"$LOG_FILE\" >&3;" EXIT

    say "start base-backup: '$BACKUPS_DIR' '$KEEP_BACKUPS'"

    cd "$BACKUPS_DIR"
    say "current dir: '$(pwd)'"

    backups=$(find -maxdepth 1 -name "$BACKUP_PREFIX.*" -type d)
    backups_cnt=0

    if [ -n "$backups" ]; then
        backups_cnt=$(echo "$backups" | wc -l)

        # sanity check, all backup dirs must exist
        for ((i = 0; i < $backups_cnt; i++)); do
            dir="$BACKUP_PREFIX.$i";
            if [ ! -d "$dir" ]; then
                say "something wrong, backup dir '$dir' not found, abort"
                exit 1
            fi
        done
    fi

    new_name="$BACKUP_PREFIX.$(($backups_cnt))" # suffix from 0
    say "new backup: '$new_name'"

    USE_TAR=${USE_TAR:+-Ft}

    say $PGTOOLS/pg_basebackup $USE_TAR -D "'$new_name'" -v
    $PGTOOLS/pg_basebackup $USE_TAR -D "$new_name" -v
    if [ -n "$USE_TAR" ]; then
        say tar format: untar backup_label for wal-cleanup and global/pg_control for "backup-restore -w"
        tar -C "$new_name" -xf "$new_name"/base.tar backup_label
        tar -C "$new_name" -xf "$new_name"/base.tar global/pg_control
    fi
    say "pg_basebackup is completed"

    # a new backup has arrived!
    backups_cnt=$((backups_cnt + 1))

    last_remove=$(($backups_cnt - 1 - $KEEP_BACKUPS)) # for suffix from 0 and do not count current
    if (( KEEP_BACKUPS < backups_cnt )); then # need rotate
        say "rotate backups: $backups_cnt -> $KEEP_BACKUPS"
        for ((i = 0; i <= $last_remove; i++)); do
            dir="$BACKUP_PREFIX.$i"
            say "remove old: '$dir'"
            # the NFS client can resend a command (if the network connection was lost or an answer
            # timed out, for example); the Linux implementation cannot handle this correctly and rm
            # can return a "No such file or directory" error, try to protect from it
            tryed=
            for ((j = 1; j <= $TRY_COUNT; j++)); do
                if [ -d "$dir" ]; then
                    tryed=1
                    if rm -r "$dir"; then
                        break
                    else
                        say "rm error, try again: $j"
                    fi
                else
                    break
                fi
            done
            if [ -z "$tryed" ] || [ "$j" -gt "$TRY_COUNT" ]; then
                say "cannot remove old dir, abort"
                exit 1
            fi
        done
        for ((i = $last_remove + 1, j = 0; i < $backups_cnt; i++, j++)); do
            say "move: '$BACKUP_PREFIX.$i' '$BACKUP_PREFIX.$j'"
            # TODO: protect from NFS double mv too?
160 | mv "$BACKUP_PREFIX.$i" "$BACKUP_PREFIX.$j"
161 | done
162 | else
163 | say "skip rotation: $backups_cnt <= $KEEP_BACKUPS, need $(($KEEP_BACKUPS - $backups_cnt)) more backups before rotation starts"
164 | fi
165 | 
166 | # now the oldest backup has the suffix .0, so clean up its WALs on both archive servers
167 | say $AVITOOLS/wal-cleanup_2 "'$BACKUP_PREFIX.0'" "'$WALS_DIR'" "'$REMOTE_ARCHIVE'"
168 | $AVITOOLS/wal-cleanup_2 "$BACKUP_PREFIX.0" "$WALS_DIR" "$REMOTE_ARCHIVE"
169 | say "wal-cleanup is completed"
170 | 
171 | # now rsync missing wals
172 | say $AVITOOLS/wal-sync "'$WALS_DIR'" "'$REMOTE_ARCHIVE'"
173 | $AVITOOLS/wal-sync "$WALS_DIR" "$REMOTE_ARCHIVE"
174 | 
175 | # create hardlink for bareos external backup system (bareos MUST remove this directory after its backup)
176 | if [ "$MY_HOSTNAME" = "$BAREOS_ON" ]; then
177 | if [ -d "$BAREOS_DIR" ]; then
178 | say "bareos: directory exists, skip hardlinking"
179 | else
180 | bareos_tmp=$(mktemp -p "." -d ${BAREOS_DIR}-XXXX) || true # errors here do not fail the backup
181 | if [ ! -d "$bareos_tmp" ]; then
182 | say "bareos: fail to create temp directory"
183 | else
184 | last=$(( $(find -maxdepth 1 -name "$BACKUP_PREFIX.*" -type d | wc -l) - 1))
185 | cp --link --recursive "$BACKUP_PREFIX.$last/." "$bareos_tmp" && mv "$bareos_tmp" "$BAREOS_DIR"
186 | fi
187 | say "bareos: hardlinking done"
188 | fi
189 | else
190 | if [ -d "$BAREOS_DIR" ]; then
191 | # remove stale bareos dir if we moved bareos backup to another archiver
192 | say "bareos: removing stale backup dir '$BAREOS_DIR' from '`pwd`'"
193 | rm -rf "$BAREOS_DIR" && say "bareos: removing stale backup dir is done"
194 | fi
195 | fi
196 | 
197 | # mark backup as stopped in database
198 | say "Mark backup '${backup_id}' as done in backups database..."
199 | backup_done=$(timeout ${CMD_TIMEOUT} psql -1 -v ON_ERROR_STOP=1 -v BACKUP_ID="$backup_id" -Atq \
200 | -h ${BACKUP_DB_HOST} -d ${BACKUP_DB_NAME} -p ${BACKUP_DB_PORT} -U ${BACKUP_DB_USER} \
201 | -f - <<'EOF'
202 | select backups.stop(:BACKUP_ID);
203 | EOF
204 | )
205 | say "OK"
206 | say "done base-backup: '$BACKUPS_DIR' '$KEEP_BACKUPS'"
207 | say > "$SUCCESS_FILE"
208 | 
209 | } 9>> "$LOCK_FILE"
210 | 
211 | # do not run when sourced by tests
212 | if [[ $BATS_TEST_FILENAME ]]; then
213 | return
214 | fi
215 | 
216 | main
217 | 
218 | exit 0
219 | 
220 | 
--------------------------------------------------------------------------------
/pg_archive2/archive_cmd_2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # archive_cmd
4 | # Transfers WALs to two archive hosts:
5 | # [ master: archive_cmd ] -> [ archive1: archive_remote_cmd ] -> [ archive2: scp ].
6 | # archive_cmd writes to either of the DST-HOSTNAMEs by calling archive_remote_cmd
7 | # on the remote host; archive_remote_cmd (on the 1st archive) saves the WAL and
8 | # tries to transfer it to SYNC-HOST.
9 | # If archive_cmd can't write the WAL to the first host after N retries ($retry_count),
10 | # it continues to write to the second host for N seconds (cooldown_time).
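#
# A hypothetical postgresql.conf wiring for this script (host names, archive
# path and cluster name below are placeholders; %p and %f are expanded by
# PostgreSQL itself):
#
#   archive_command = '/usr/local/bin/archive_cmd_2 "archive_1 archive_2" /archive/wals %p %f master5'
#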
11 | 
12 | set -e
13 | set -o pipefail
14 | 
15 | error_handler() {
16 | scriptname=$(basename $0)
17 | hostname=$(hostname)
18 | echo "^^^ ERROR at [host: '${hostname}', file: '${scriptname}', line: '${BASH_LINENO[0]}']"
19 | }
20 | trap error_handler ERR
21 | 
22 | pmax=4 # the number of compressing threads
23 | remote_cmd=/usr/local/bin/archive_remote_cmd_2
24 | local_lastlog="/var/lib/postgresql/LASTLOG" # local copy of the file with the name of the last successfully sent file
25 | 
26 | cmd_timeout="6" # timeout (for ssh/cat/compress)
27 | ssh_timeout_options="-o ConnectTimeout=10 -o ServerAliveInterval=6 -o ServerAliveCountMax=5"
28 | ssh_options="-o Compression=no -c aes128-gcm@openssh.com -o BatchMode=yes ${ssh_timeout_options}"
29 | ssh_master_socket='/tmp/ssh_%h_%p_%r'
30 | ssh_master_options='-o ControlMaster=auto -o ControlPersist=yes'
31 | use_ssh_persist=yes
32 | pwals=5
33 | ready_wals_for_parallel=10 # must be at least twice pwals
34 | 
35 | retry_count="6" # see below
36 | cooldown_time="600" # do not try to send files to $dst_host0 for '$cooldown_time' seconds after '$retry_count' failed attempts
37 | 
38 | unset LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES LC_TIME LC_NUMERIC
39 | export LC_ALL=C
40 | 
41 | dst_hosts="$1"
42 | dst_dir="$2"
43 | src_file="$3"
44 | fname="$4"
45 | cluster_name="$5"
46 | src_dir=${src_file%/$fname}
47 | arch_status_dir="$src_dir"/archive_status
48 | tmp_dir=/tmp
49 | 
50 | if [ -f /etc/default/archive_cmd ]; then
51 | . /etc/default/archive_cmd
52 | fi
53 | 
54 | if [ -n "$use_ssh_persist" ]; then
55 | ssh_options="$ssh_options -S $ssh_master_socket $ssh_master_options"
56 | fi
57 | 
58 | if [ -z "$dst_hosts" -o -z "$dst_dir" -o -z "$src_file" -o -z "$fname" -o -z "$cluster_name" ]; then
59 | echo -e "usage: archive_cmd 'DST-HOSTNAMES' DST-DIR SRC-WAL-FILENAME-WITH-PATH SRC-WAL-FILENAME CLUSTER-NAME\n" \
60 | "\n" \
61 | "DST-HOSTNAMES - two archive host names in single quotes\n" \
62 | "DST-DIR - archive directory for WALs (ssh path)\n" \
63 | "SRC-WAL-FILENAME-WITH-PATH - %p (file name with path)\n" \
64 | "SRC-WAL-FILENAME - %f (file name)\n" \
65 | "CLUSTER-NAME - unique cluster name\n"
66 | exit 1
67 | fi
68 | 
69 | dst_hosts_array=( $dst_hosts )
70 | dst_host0=${dst_hosts_array[0]} # first host from list
71 | dst_host1=${dst_hosts_array[1]} # second (reserve) host
72 | 
73 | # checks if a file name is a WAL name
74 | is_wal() {
75 | local fname="$1"
76 | if [[ "$fname" == ???????????????????????? ]] && [[ "$fname" != *.* ]]; then
77 | return 0 # zero is true
78 | else
79 | return 1 # non-zero is false
80 | fi
81 | }
82 | 
83 | # transfers a WAL to the archive via ssh
84 | send_wal() {
85 | local fname_with_path="$1"
86 | local file_name="$2"
87 | local dir=$dst_dir
88 | local ftime fsize
89 | IFS='|' read -r ftime fsize <<<"$(stat -c '%y|%s' "$fname_with_path")"
90 | 
91 | # LASTLOG is put in another directory
92 | if [ "$file_name" = "LASTLOG" ]; then
93 | # this part must not be executed in parallel, either here or in remote_cmd
94 | dir="${dst_dir}/.."
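# i.e. the LASTLOG marker lives one level above the WAL directory itself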
95 | fi
96 | 
97 | timeout ${cmd_timeout} cat "$fname_with_path" \
98 | | timeout ${cmd_timeout} ssh $ssh_options "$current_dst_host" "$remote_cmd" \
99 | "$dir" "$file_name" "'$ftime'" \
100 | "''" "$pmax" "$cluster_name" \
101 | "'$sync_host'" "'$fsize'"
102 | }
103 | 
104 | # increments the error counter for $dst_host0 in a file (ignores $dst_host1)
105 | increment_wal_errors_cnt() {
106 | local was_failed_cnt
107 | [ "$current_dst_host" = "$dst_host1" ] && return 0
108 | 
109 | if [[ -f "${tmp_dir}/send_wal_errors_cnt_${dst_host0}_" ]]; then
110 | was_failed_cnt=$(<"${tmp_dir}/send_wal_errors_cnt_${dst_host0}_")
111 | was_failed_cnt=$(( was_failed_cnt + 1 ))
112 | echo "$was_failed_cnt" > "${tmp_dir}/send_wal_errors_cnt_${dst_host0}_"
113 | else
114 | echo 1 > "${tmp_dir}/send_wal_errors_cnt_${dst_host0}_"
115 | fi
116 | }
117 | 
118 | # set $current_dst_host to the host the files should be sent to
119 | # first check if we should try to send the file to $dst_host0 or send it directly to $dst_host1
120 | if [ -f "${tmp_dir}/send_wal_errors_cnt_${dst_host0}_" ]; then
121 | was_failed_cnt=$(<"${tmp_dir}/send_wal_errors_cnt_${dst_host0}_")
122 | if [ "$was_failed_cnt" -lt "$retry_count" ]; then
123 | current_dst_host="$dst_host0"
124 | # having reached $retry_count, don't try to write WAL to $dst_host0 for $cooldown_time seconds;
125 | # instead, switch to $dst_host1 for $cooldown_time seconds
126 | else
127 | now_time=$(date +%s)
128 | last_err_time=$(stat -c%Y "${tmp_dir}/send_wal_errors_cnt_${dst_host0}_")
129 | if [[ $(( $now_time - $last_err_time)) -le $cooldown_time ]]; then
130 | # try to write to dst_host1
131 | current_dst_host="$dst_host1"
132 | # try to write to 1st host after $cooldown_time and reset error counter
133 | else
134 | rm "${tmp_dir}/send_wal_errors_cnt_${dst_host0}_"
135 | current_dst_host="$dst_host0"
136 | fi
137 | fi
138 | else
139 | # there is no error file, so send to 1st host
140 | current_dst_host="$dst_host0"
141 | fi
142 | 
143 | # set the sync_host parameter for archive_remote_cmd
144 | # if the 1st is down, don't sync to it
145 | if [ "$current_dst_host" = "$dst_host0" ]; then
146 | sync_host="$dst_host1"
147 | else
148 | sync_host=''
149 | fi
150 | 
151 | # check if this file has already been sent to the archive by reading the local LASTLOG
152 | if [[ -r "$local_lastlog" ]]; then
153 | prev_archived=$(< ${local_lastlog})
154 | else
155 | prev_archived=""
156 | fi
157 | # files like '000000010004EF04000000E1.00010DF0.backup' and '00000015.history' are always archived together
158 | # compare only WALs
159 | # if the previously archived file is not a WAL, run the archiving procedure in one thread so that the compare operation ('<') is possible
160 | if is_wal "$fname" && is_wal "$prev_archived"; then
161 | if [[ "$fname" < "$prev_archived" ]] || [[ "$fname" = "$prev_archived" ]]; then
162 | echo "File '$fname' was already sent to archive. Skipping..."
163 | exit 0
164 | fi
165 | fi
166 | 
167 | # should we turn on multi-threaded archiving
168 | ready_count=$(find ${arch_status_dir}/ -maxdepth 1 -type f -name "????????????????????????.ready" | wc -l)
169 | 
170 | # archive with one thread if:
171 | # - the number of ready WAL files is lower than the threshold ready_wals_for_parallel
172 | # - it is not a WAL file (.backup, .history)
173 | # - the previously archived file is not a WAL file
174 | if [[ $ready_count -le $ready_wals_for_parallel ]] || ! is_wal "$fname" || ! is_wal "$prev_archived"; then
175 | exit_code=0
176 | send_wal "$src_file" "$fname" || exit_code=$?
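# a non-zero status makes archive_cmd exit 1 below, so PostgreSQL keeps the
# WAL and retries archiving it later (the standard archive_command contract)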
177 | if [[ $exit_code -ne 0 ]] ; then
178 | echo "ERROR: can't send '$fname' to archive host '$current_dst_host'. Exit code: '$exit_code'"
179 | increment_wal_errors_cnt
180 | exit 1
181 | fi
182 | wal=$fname
183 | else
184 | # turn on parallel archiving
185 | 
186 | # take pwals WAL files
187 | ready_wals=$(find ${arch_status_dir}/ -maxdepth 1 -type f -name "????????????????????????.ready" -printf '%f\n'\
188 | | sort | grep -A "$(( pwals - 1 ))" -F ${fname})
189 | 
190 | # archive these WAL files in parallel
191 | declare -A send_pids
192 | for wal_ready in $ready_wals ; do
193 | wal=${wal_ready%.ready}
194 | send_wal "$src_dir"/"$wal" "$wal" & send_pids[$wal]=$!
195 | done
196 | 
197 | # check the exit code of each thread
198 | for wal_pid in ${!send_pids[@]}; do
199 | exit_code=0
200 | wait ${send_pids[$wal_pid]} || exit_code=$?
201 | if [[ $exit_code -ne 0 ]] ; then
202 | echo "ERROR: can't send '$wal_pid' to archive host '$current_dst_host'. Exit code: '$exit_code'"
203 | increment_wal_errors_cnt
204 | exit 1
205 | fi
206 | done
207 | fi
208 | 
209 | # save LASTLOG locally
210 | echo "$wal" > "$local_lastlog"
211 | # send LASTLOG to remote server
212 | send_wal "$local_lastlog" "LASTLOG"
213 | 
214 | exit 0
215 | 
216 | 
--------------------------------------------------------------------------------
/pg_archive2/restore_cmd_2:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # try to get a specific WAL file from src_hosts in a cycle
4 | set -e
5 | 
6 | error_handler() {
7 | scriptname=$(basename $0)
8 | hostname=$(hostname)
9 | echo "^^^ ERROR at [host: '${hostname}', file: '${scriptname}', line: '${BASH_LINENO[0]}']"
10 | }
11 | trap error_handler ERR
12 | 
13 | saved_args=( "$@" )
14 | saved_self_name=$0
15 | pmax=4
16 | local_dir=/var/local/WALs # for putting a wal manually
17 | only_local= # use WALs only from local_dir, do not read them from NFS
18 | 
19 | src_hosts="$1" # ssh hosts
20 | buffer_dir=/var/lib/postgresql/wals_buffer # the file is first fetched via ssh into here, not tmpfs
21 | 
22 | tmp_dir='/lib/init/rw/pg_recv_sb' # directory for the error counter files, created by puppet, tmpfs
23 | 
24 | cmd_timeout="6"
25 | ssh_timeout_options="-o ConnectTimeout=10 -o ServerAliveInterval=6 -o ServerAliveCountMax=5"
26 | ssh_options="-o Compression=no -c aes128-gcm@openssh.com -o BatchMode=yes ${ssh_timeout_options}"
27 | ssh_master_socket='/tmp/ssh_%h_%p_%r'
28 | ssh_master_options='-o ControlMaster=auto -o ControlPersist=yes'
29 | use_ssh_persist=yes
30 | 
31 | src_dir="$2"
32 | fname="$3"
33 | dst_file="$4" # always 'pg_xlog/RECOVERYXLOG'
34 | delay_time="$5" # wal apply delay
35 | 
36 | retry_count="6" # see below
37 | cooldown_time="600" # do not try to fetch files from a host for '$cooldown_time' seconds after $retry_count failed attempts
38 | 
39 | if [ -f /etc/default/restore_cmd ]; then
40 | . /etc/default/restore_cmd
41 | fi
42 | 
43 | if [ -n "$use_ssh_persist" ]; then
44 | ssh_options="$ssh_options -o ControlPath=$ssh_master_socket $ssh_master_options"
45 | fi
46 | 
47 | if [ -z "$src_hosts" -o -z "$src_dir" -o -z "$fname" -o -z "$dst_file" ]; then
48 | echo -e "usage: restore_cmd 'SRC-HOSTNAMES' SRC-DIR SRC-WAL-FILENAME DST-WAL-FILENAME-WITH-PATH [DELAY-SECONDS]\n" \
49 | "\n" \
50 | "SRC-HOSTNAMES - two archive host names in single quotes\n" \
51 | "SRC-DIR - archive directory with WALs\n" \
52 | "SRC-WAL-FILENAME - %f (file name)\n" \
53 | "DST-WAL-FILENAME-WITH-PATH - %p (file name with path)\n" \
54 | "DELAY-SECONDS - copy the WAL file only if it is older than this many seconds\n"
55 | exit 1
56 | fi
57 | 
58 | decompress() {
59 | local dst_file=$1
60 | mime=$(file -m /etc/postgresql-common/compress.mime.mgc -b --mime-type "$dst_file")
61 | if [ "$mime" = "application/x-bzip2" ]; then
62 | pbzip2 -p"$pmax" -d < "$dst_file" > "$dst_file".bz2-tmp || { decompress_error=$?; return 1; }
63 | mv "$dst_file".bz2-tmp "$dst_file"
64 | elif [ "$mime" = "application/x-gzip" ]; then
65 | gunzip < "$dst_file" > "$dst_file".gz-tmp || { decompress_error=$?; return 1; }
66 | mv "$dst_file".gz-tmp "$dst_file"
67 | fi
68 | }
69 | 
70 | # decides whether to apply the current wal from the buffer or not
71 | apply_wal() {
72 | local received_from=$1
73 | local src_dir=$2
74 | local fname=$3
75 | local dst_file=$4
76 | local decompress_error
77 | 
78 | if [ "$delay_time" ] && [ "$delay_time" -gt 0 ]; then
79 | ftime="$(stat -c %Y "$src_dir"/"$fname")"
80 | apply_time="$(date --date="now - $delay_time seconds" +%s)"
81 | if [ "$ftime" -gt "$apply_time" ]; then
82 | # the file is too new, skip it, replay it only after $delay_time seconds
83 | # show the message only sometimes
84 | if [ $(( (ftime - apply_time) % (10 * 60) )) -lt 3 ]; then
85 | echo "file '$fname' is too new ($((ftime - apply_time))s), skip it"
86 | fi
87 | exit 1
88 | fi
89 | fi
90 | 
91 | mv "$src_dir"/"$fname" "$dst_file"
92 | decompress_error=''
93 | if ! decompress "$dst_file"; then
94 | # dst_file can be "destroyed" during decompress,
95 | # that's why we ignore a possible error when deleting it
96 | rm "$dst_file" || true
97 | 
98 | # on error, try another host, but only if this was not a local operation
99 | # (not local_dir or buffer_dir)
100 | if (( decompress_error )) && [[ $received_from ]]; then
101 | if [[ $RESTORE_CMD_TRY_OTHER ]]; then
102 | echo "ERROR: cannot unpack wal from all hosts"
103 | exit 1
104 | fi
105 | 
106 | # re-exec and try the other server
107 | echo "WARNING: decompress error from host $received_from, trying other"
108 | RESTORE_CMD_TRY_OTHER=$received_from \
109 | exec "$saved_self_name" "${saved_args[@]}"
110 | else
111 | # mv error or something unknown
112 | echo "ERROR: decompress error"
113 | exit 1
114 | fi
115 | fi
116 | }
117 | 
118 | if [ -d "$local_dir" ] && [ -f "$local_dir"/"$fname" ]; then
119 | src_dir="$local_dir"
120 | fi
121 | 
122 | if ! [ -d "$buffer_dir" ]; then
123 | mkdir -p "$buffer_dir"
124 | fi
125 | 
126 | # check that the wal file exists and its size is not zero
127 | if [ -s "$buffer_dir"/"$fname" ]; then
128 | src_dir="$buffer_dir"
129 | fi
130 | 
131 | if [ "$only_local" ] && [ "$src_dir" != "$local_dir" ]; then
132 | # src_dir was set to local_dir above; if it was not, the requested WAL file does not exist in local_dir, exit now
133 | exit 1
134 | fi
135 | 
136 | # take the wal from the 'special local dir' or the 'ssh buffer dir'
137 | if [ "$src_dir" = "$local_dir" ] || [ "$src_dir" = "$buffer_dir" ]; then
138 | apply_wal '' "$src_dir" "$fname" "$dst_file"
139 | exit 0
140 | fi
141 | 
142 | src_hosts_count=$(wc -w <<< $src_hosts)
143 | 
144 | # fetch the wal via ssh into the buffer dir
145 | fetch_errors=0 # any wal receiving errors
146 | conn_errors=0 # only ssh transfer errors
147 | for host in $src_hosts ; do
148 | # if this is a retry and this is the host from which the corrupted wal was received, try another host
149 | if [[ $RESTORE_CMD_TRY_OTHER && $host = $RESTORE_CMD_TRY_OTHER ]]; then
150 | fetch_errors=$(($fetch_errors + 1))
151 | continue
152 | fi
153 | 
154 | # check whether a retry is needed for this host
155 | if [ -f "${tmp_dir}/ssh-errors_${host}_" ]; then
156 | # if the $retry_count limit has not been reached, continue to read from this host
157 | was_failed_cnt=$(<"${tmp_dir}/ssh-errors_${host}_")
158 | if [ "$was_failed_cnt" -lt "$retry_count" ]; then
159 | true
160 | # after reaching the $retry_count limit, stop trying to read wals from this host for $cooldown_time seconds
161 | else
162 | now_time=$(date +%s)
163 | last_err_time=$(stat -c%Y "${tmp_dir}/ssh-errors_${host}_")
164 | if [[ $(( $now_time - $last_err_time)) -le $cooldown_time ]]; then
165 | fetch_errors=$(($fetch_errors + 1))
166 | # try to read from the next host
167 | continue
168 | # try again after $cooldown_time, resetting the error counter to 0
169 | else
170 | rm "${tmp_dir}/ssh-errors_${host}_"
171 | fi
172 | fi
173 | fi
174 | 
175 | # read via ssh
176 | # in case of error, an empty file will be created (!)
177 | set +e
178 | timeout ${cmd_timeout} ssh ${ssh_options} ${host} \
179 | "test -f '$src_dir'/'$fname' && cat '$src_dir'/'$fname' || exit 148" \
180 | > "$buffer_dir"/"$fname".part
181 | ssh_code=$?
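# errexit was suspended only around the ssh transfer; re-enable it now that
# $ssh_code has been captured for the per-host error accounting below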
182 | set -e
183 | # if the file was fetched successfully, stop trying
184 | if [ "$ssh_code" -eq "0" ]; then
185 | # in case of success, set the error counter to 0 by removing the counter file
186 | [ -f "${tmp_dir}/ssh-errors_${host}_" ] && rm "${tmp_dir}/ssh-errors_${host}_"
187 | mv "$buffer_dir"/"$fname".part "$buffer_dir"/"$fname"
188 | break;
189 | # the file does not exist on the archive server yet
190 | elif [ "$ssh_code" -eq "148" ]; then
191 | echo "WARNING: can't find wal '$fname' at host '$host'"
192 | # zero the counter file, as the connection is restored
193 | [ -f "${tmp_dir}/ssh-errors_${host}_" ] && rm "${tmp_dir}/ssh-errors_${host}_"
194 | fetch_errors=$(($fetch_errors + 1))
195 | # delete the empty temp file
196 | test -f "$buffer_dir"/"$fname".part && rm "$buffer_dir"/"$fname".part
197 | # all other exit codes are considered to be a connection error
198 | else
199 | echo "WARNING: connection error, can't fetch wal '$fname' from host '$host'"
200 | fetch_errors=$(($fetch_errors + 1))
201 | conn_errors=$(($conn_errors + 1))
202 | 
203 | # increment the counter of connection errors
204 | if [ -f "${tmp_dir}/ssh-errors_${host}_" ]; then
205 | failed_cnt=$(<"${tmp_dir}/ssh-errors_${host}_")
206 | failed_cnt=$(( failed_cnt + 1 ))
207 | echo "$failed_cnt" > "${tmp_dir}/ssh-errors_${host}_"
208 | else
209 | echo 1 > "${tmp_dir}/ssh-errors_${host}_"
210 | fi
211 | # remove the empty temp file (size 0)
212 | test -f "$buffer_dir"/"$fname".part && rm "$buffer_dir"/"$fname".part
213 | fi
214 | done
215 | received_from=$host
216 | 
217 | # if we can't connect to either host, remove the error counter files,
218 | # so as not to "fall asleep" pointlessly for a long time
219 | if [ "$conn_errors" -ge "$src_hosts_count" ]; then
220 | for host in $src_hosts ; do
221 | if [ -f "${tmp_dir}/ssh-errors_${host}_" ]; then
222 | rm "${tmp_dir}/ssh-errors_${host}_"
223 | fi
224 | done
225 | fi
226 | 
227 | if [ "$fetch_errors" -ge "$src_hosts_count" ]; then
228 | echo "ERROR: can't fetch wal from all hosts: ${src_hosts}"
229 | exit 1
230 | fi
231 | 
232 | apply_wal "$received_from" "$buffer_dir" "$fname" "$dst_file"
233 | 
234 | exit 0
235 | 
236 | 
--------------------------------------------------------------------------------
/pg_archive2/backup_queue.sql:
--------------------------------------------------------------------------------
1 | create schema if not exists backups;
2 | 
3 | create table backups.hosts (
4 | id serial primary key,
5 | host text not null,
6 | port int default 5432,
7 | cluster_name text not null,
8 | archiver_name text[2] not null,
9 | bareos_on text default 'archive_1',
10 | keep_backups_cnt int not null,
11 | periodicity_days int not null,
12 | directory text not null,
13 | last_archiver text,
14 | last_backup_id int,
15 | last_backup_start_txtime timestamptz default 'epoch',
16 | constraint host unique(host),
17 | constraint cluster_name unique(cluster_name)
18 | );
19 | ALTER TABLE backups.hosts
20 | OWNER TO postgres;
21 | 
22 | ---
23 | 
24 | create table backups.tasks (
25 | backup_id serial primary key,
26 | host text not null,
27 | archiver_name text not null,
28 | start_txtime timestamptz,
29 | end_txtime timestamptz,
30 | is_failed boolean default false not null
31 | );
32 | create index ON backups.tasks (start_txtime);
33 | ALTER TABLE backups.tasks
34 | OWNER TO postgres;
35 | 
36 | -- insert into backups.hosts (host, cluster_name, archiver_name, bareos_on, keep_backups_cnt, periodicity_days, directory) values
37 | -- ('master7-sb', 'master7', '{"archive_1", "archive_2"}', 'archive_1', 2, 4, '/archive_path7/');
38 | -- insert into backups.hosts (host, cluster_name, archiver_name, bareos_on, keep_backups_cnt, periodicity_days, directory) values
39 | -- ('master2-sb', 'master2', '{"archive_1", "archive_2"}', 'archive_1', 2, 4, '/archive_path2/') ;
40 | -- insert into backups.hosts (host, cluster_name, archiver_name, bareos_on, keep_backups_cnt, periodicity_days, directory) values
41 | -- ('master1-sb', 'master1', '{"archive_1", "archive_2"}', 'archive_1', 2, 8, '/archive_path/') ;
42 | 
43 | ---
44 | 
45 | CREATE OR REPLACE FUNCTION backups.get_next(i_archiver text, OUT o_backip_id int, OUT o_pghost text, OUT o_pgport int, OUT o_remote_archiver text,
46 | OUT o_keep_backups_cnt int, OUT o_backups_dir text, OUT o_bareos_on text)
47 | RETURNS SETOF record
48 | LANGUAGE plpgsql
49 | ROWS 1
50 | AS $function$
51 | -- Backup queue for backing up postgres servers to two archive servers in turn.
52 | 
53 | -- The function returns which backup must be executed by base-backup and marks it in the tasks table as in progress (end_txtime = NULL).
54 | -- The backup script must mark a successful backup by executing select * from backups.stop(backup_id),
55 | -- where backup_id is the o_backip_id OUT parameter of this function.
56 | -- periodicity_days - periodicity (in days) of backups for a specific(!) archive
57 | 
58 | -- Returns 0 rows if there are no backup tasks for the specific archive.
59 | -- Backups alternate between the 2 archive servers.
60 | -- If the previous backup was made to the 1st archive server, then the next will be made by the 2nd archive server (and vice versa) if there are no failures.
61 | -- If a backup fails on a specific archive server, it will not be retried until
62 | -- the row with the failed status is deleted (manually or by 'select * from backups.stop(backup_id)') from backups.tasks.
63 | -- Meanwhile, backups will continue to be executed successfully on the second archive.
64 | 
65 | -- RESTRICTIONS:
66 | -- The first backup must be run on the 1st archive server (the second archive starts working only after the 1st one)
67 | 
68 | 
69 | -- EXAMPLE of adding a new cluster to the backup queue:
70 | -- insert into backups.hosts (host, cluster_name, archiver_name, keep_backups_cnt, periodicity_days, directory) values
71 | -- ('master7-sb', 'master7', '{"archive_1", "archive_2"}', 2, 4, '/archive_path7/');
72 | 
73 | -- host - standby from which backups will be taken
74 | -- cluster_name - cluster name (e.g. master7)
75 | -- archiver_name - array with two archive servers (destination of the backup)
76 | -- keep_backups_cnt - the number of backups to keep on one server (recommended value 2)
77 | -- periodicity_days - schedule for one(!) archive server (recommended value 4 or more, and it must be a multiple of 2)
78 | -- directory - destination
79 | 
80 | DECLARE
81 | BACKUP_START_TIME constant time := '03:07'; -- don't start backup before this time
82 | v_host_r record;
83 | v_chosen_archiver text;
84 | v_is_found boolean := false;
85 | v_days_delimiter integer;
86 | v_expected_backup_date timestamptz;
87 | begin
88 | -- Mark backup tasks as failed if there was no backups.stop() call between backups.get_next() calls for the specific archive server
89 | update
90 | backups.tasks t
91 | set
92 | is_failed = true
93 | from
94 | backups.hosts b
95 | where
96 | t.backup_id = b.last_backup_id
97 | and t.archiver_name = i_archiver
98 | and t.end_txtime is NULL
99 | and t.is_failed = false;
100 | if found then
101 | raise notice '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!';
102 | raise notice 'WARNING: one of backups marked as failed!';
103 | raise notice '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!';
104 | end if;
105 | 
106 | -- backup queue
107 | for v_host_r in select * from backups.hosts order by last_backup_start_txtime desc
108 | loop
109 | -- try to derive the next archive server name (take the opposite one)
110 | -- if there was a failure on the previous one, the backup still needs to be done
111 | -- check for an existing failed backup task on the opposite archive:
112 | perform * from
113 | backups.tasks t
114 | where
115 | t.archiver_name = coalesce((array_remove(v_host_r.archiver_name, v_host_r.last_archiver))[1], v_host_r.archiver_name[1])
116 | and t.host = v_host_r.host
117 | and t.is_failed = true
118 | order by
119 | start_txtime desc
120 | limit 1;
121 | -- if it failed, continue executing backup tasks on the previous archive server
122 | if found then
123 | v_chosen_archiver := v_host_r.last_archiver;
124 | v_days_delimiter := 1;
125 | else
126 | v_chosen_archiver := coalesce((array_remove(v_host_r.archiver_name, v_host_r.last_archiver))[1], v_host_r.archiver_name[1]);
127 | v_days_delimiter := 2;
128 | end if;
129 | 
130 | -- skip if the input parameter does not match the result of the function
131 | if i_archiver != v_chosen_archiver then
132 | continue;
133 | end if;
134 | 
135 | -- if there is no such backup_id in tasks, then choose that host
136 | if v_host_r.last_backup_id is NULL then
137 | v_is_found := true;
138 | exit;
139 | end if;
140 | perform * from backups.tasks t where t.backup_id = v_host_r.last_backup_id;
141 | if not found then
142 | -- TODO: cope with deletion of rows in tasks
143 | v_is_found := true;
144 | exit;
145 | end if;
146 | 
147 | -- check for errors in backup tasks for the chosen archive server
148 | perform *
149 | from
150 | backups.tasks t
151 | where
152 | t.archiver_name = v_chosen_archiver
153 | and t.host = v_host_r.host
154 | and t.is_failed = true
155 | order by
156 | start_txtime desc
157 | limit 1;
158 | -- skip backup execution until the error is fixed and the failed record is removed from tasks
159 | if found then
160 | continue;
161 | end if;
162 | 
163 | -- the dependency of v_days_delimiter on whether the last backup task failed is there to alternate the archives in the right way
164 | -- e.g.: if periodicity_days = 4, then a backup will be made every 2 days on different archive servers,
165 | -- while on each archive server a backup task is executed every 4 days
166 | 
167 | -- check if periodicity_days have passed since the last backup on that archive server
168 | v_expected_backup_date := v_host_r.last_backup_start_txtime::date + BACKUP_START_TIME + v_host_r.periodicity_days * '1 days'::interval / v_days_delimiter;
169 | -- raise notice 'DEBUG: v_expected_backup_date: %, v_days_delimiter: %', v_expected_backup_date, v_days_delimiter;
170 | perform *
171 | from
172 | backups.tasks t
173 | where
174 | t.backup_id = v_host_r.last_backup_id
175 | and v_expected_backup_date::timestamptz <= now()
176 | order by
177 | start_txtime desc
178 | limit 1;
179 | if found then
180 | v_is_found := true;
181 | exit;
182 | end if;
183 | end loop;
184 | 
185 | -- return no rows if no appropriate backup candidate was found
186 | if v_is_found = false then
187 | -- raise notice 'there is no tasks for ''%''', i_archiver;
188 | return;
189 | end if;
190 | 
191 | o_pghost := v_host_r.host;
192 | o_pgport := v_host_r.port;
193 | o_keep_backups_cnt := v_host_r.keep_backups_cnt;
194 | o_backups_dir := v_host_r.directory;
195 | o_bareos_on := v_host_r.bareos_on;
196 | 
197 | -- choose the host for syncing (wal-sync, wal-cleanup)
198 | select (array_remove(v_host_r.archiver_name, v_chosen_archiver))[1] into o_remote_archiver;
199 | 
200 | -- add a record with the backup task
201 | insert into backups.tasks
202 | (host, archiver_name, start_txtime, end_txtime)
203 | values
204 | (v_host_r.host, v_chosen_archiver, now(), NULL)
205 | returning backup_id into o_backip_id;
206 | 
207 | -- update hosts status
208 | update backups.hosts set last_archiver = v_chosen_archiver, last_backup_start_txtime = now(), last_backup_id = o_backip_id where host = v_host_r.host;
209 | 
210 | return next;
211 | 
212 | end;
213 | $function$;
214 | 
215 | alter function backups.get_next ( text) owner to postgres ;
216 | 
217 | ---
218 | 
219 | CREATE OR REPLACE FUNCTION backups.stop(i_backup_id integer, OUT o_info boolean)
220 | RETURNS boolean
221 | LANGUAGE plpgsql
222 | AS $function$
223 | -- mark backup_id (which was obtained from backups.get_next()) as successful
224 | DECLARE
225 | begin
226 | o_info := false;
227 | update
228 | backups.tasks
229 | set
230 | is_failed = false,
231 | end_txtime = now()
232 | where
233 | backup_id = i_backup_id;
234 | if found then
235 | o_info := true;
236 | end if;
237 | 
238 | end;
239 | $function$;
240 | 
241 | alter function backups.stop ( integer) owner to postgres ;
242 | 
243 | ---
244 | 
--------------------------------------------------------------------------------
/pg_archive2/tests/0002_test-wal-cleanup.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 | # -*- mode: sh; -*-
3 | 
4 | load vars
5 | load functions
6 | 
7 | _wals_list_before="000000010000000000000001
8 | 000000010000000000000002
9 | 000000010000000000000003
10 | 000000010000000000000004
11 | 000000010000000000000005
12 | 000000010000000000000006
13 | 000000010000000000000007"
14 | 
15 | _wals_list_after="000000010000000000000005
16 | 000000010000000000000006
17 | 000000010000000000000007"
18 | 
19 | _init_backup ()
20 | {
21 | source /app/base-backup_2
22 | 
23 | # overwrite vars
24 | load vars
25 | load functions
26 | 
27 | _init_backup_task
28 | 
29 | # create test backup, logs in /var/tmp/base-backup_test.log
30 | sub_main () {
31 | ( main ) > /dev/null
32 | }
33 | # subfunction for correct error reporting in bats
34 | sub_main
35 | 
36 | # create additional old files
37 | cp -u /tmp/pg_xlog/0000* /archive/logs.complete/
38 | }
39 | 
40 | _cleanup_backup ()
41 | {
42 | _cleanup_backup_task
43 | 
44 | rm -f /archive/SUCCESS
45 | rm -f /archive/logs.complete/*
46 | rm -rf /archive/data.master.0
47 | }
48 | 
49 | timeout_always_mock ()
50 | {
51 | local res
52 | local mock_timeout=$((4 * 60 * 60))
53 | 
54 | # echo "*** DBG 1: ${1}" >&2
55 | # for ((i = 20; i < 30; i++)); do
56 | # echo "*** DBG $i: ${!i}" >&2
57 | # done
58 | 
59 | if [[ $1 = ${mock_timeout} && ${26} == *"/pg_archivecleanup '/archive/logs.complete/' "* ]]; then
60 | echo "*** DBG timeout emulate long ssh call: $@" >&3
61 | command timeout 1 sleep 3
62 | res=$?
63 | else
64 | command timeout "$@"
65 | res=$?
66 | fi
67 | 
68 | return $res
69 | }
70 | 
71 | timeout_lock_mock ()
72 | {
73 | local res
74 | local mock_timeout=$((4 * 60 * 60))
75 | local lock_startup_timeout=10
76 | 
77 | if [[ $1 = ${mock_timeout} && ${26} == *"/pg_archivecleanup '/archive/logs.complete/' "* ]]; then
78 | if (( timeout_lock_mock_calls == 0 )); then
79 | echo "*** DBG timeout emulate long ssh call: $@" >&3
80 | command timeout 1 sleep 3
81 | res=$?
82 | elif (( timeout_lock_mock_calls == 1 )); then
83 | echo "*** DBG timeout emulate remote lock: $@" >&3
84 | 
85 | coproc ssh {
86 | ssh test-archive03 \
87 | flock -n /archive/wal-cleanup.lock \
88 | bash -c "'echo locked; read -rt ${lock_startup_timeout};'"
89 | }
90 | # wait for flock to start
91 | read -rt "${lock_startup_timeout}" -u "${ssh[0]}" line
92 | if [[ $line != 'locked' ]]; then
93 | echo "timeout_lock_mock: cannot create remote lock" >&3
94 | exit 255
95 | fi
96 | 
97 | shift
98 | command timeout 10 "$@"
99 | res=$?
100 | 
101 | echo done >&${ssh[1]} || true # ignore timeout
102 | wait $ssh_PID # no quotes, ignore if pid is empty
103 | jobs >&3
104 | fi
105 | (( timeout_lock_mock_calls++ ))
106 | else
107 | command timeout "$@"
108 | res=$?
109 | fi
110 | 
111 | return $res
112 | }
113 | 
114 | timeout_neterror_mock ()
115 | {
116 | local res
117 | local mock_timeout=$((4 * 60 * 60))
118 | 
119 | if [[ $1 = ${mock_timeout} && ${26} == *"/pg_archivecleanup '/archive/logs.complete/' "* ]]; then
120 | # timeout_neterror_mock is passed to wal-cleanup via export,
121 | # that's why the functions from functions.bash are not available here
122 | _argv_replace ()
123 | {
124 | local old=$1; shift
125 | local new=$1; shift
126 | local v
127 | 
128 | for v in "$@"; do
129 | if [[ $v = "$old" ]]; then
130 | v=$new
131 | fi
132 | # printf, because echo would eat args like -n
133 | printf "%s\n" "$v"
134 | done
135 | }
136 | 
137 | # remove timeout
138 | shift
139 | # replace the hostname
140 | set -- $(_argv_replace test-archive03 test-UNREACHABLE-archive03 "$@")
141 | echo "*** DBG timeout emulate ssh network error: $@" >&3
142 | 
143 | command timeout 10 "$@"
144 | res=$?
145 | else
146 | command timeout "$@"
147 | res=$?
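# $res is returned below unchanged, so wal-cleanup_2 still sees timeout's
# usual exit codes (124 on timeout) whether or not the call was mocked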
148 | fi
149 | 
150 | return $res
151 | }
152 | 
153 | setup ()
154 | {
155 | case "$BATS_TEST_DESCRIPTION" in
156 | "wal-cleanup")
157 | _init_backup
158 | 
159 | rsync -av /archive/logs.complete/ test-archive03:/archive/logs.complete/ >&3
160 | rsync -av /archive/data.master.0 test-archive03:/archive/ >&3
161 | 
162 | cat > /archive/data.master.0/backup_label < /archive/data.master.0/backup_label' <&2
243 | [[ $out = "$_wals_list_before" ]]
244 | out=$(ssh test-archive03 ls /archive/logs.complete/ | head -7); echo "$out" >&2
245 | [[ $out = "$_wals_list_before" ]]
246 | 
247 | cd /archive/
248 | run /app/wal-cleanup_2 data.master.0/ logs.complete/ test-archive03
249 | 
250 | echo "$output" >&2
251 | echo "*** DBG ${lines[-1]}" >&2
252 | 
253 | [[ $status -eq 0 ]]
254 | [[ ${lines[-1]} = *'done wal-cleanup: '\''data.master.0/'\'' '\''logs.complete/'\' ]]
255 | out=$(ls /archive/logs.complete/ | head -3); echo "$out" >&2
256 | [[ $out = "$_wals_list_after" ]]
257 | out=$(ssh test-archive03 ls /archive/logs.complete/ | head -3); echo "$out" >&2
258 | [[ $out = "$_wals_list_after" ]]
259 | }
260 | 
261 | @test "wal-cleanup, remote timeout" {
262 | cd /archive/
263 | run /app/wal-cleanup_2 data.master.0/ logs.complete/ test-archive03
264 | 
265 | echo "$output" >&2
266 | echo "*** DBG ${lines[-4]}" >&2
267 | echo "*** DBG ${lines[-2]}" >&2
268 | echo "*** DBG ${lines[-1]}" >&2
269 | 
270 | [[ $status -eq 0 ]]
271 | [[ ${lines[-4]} = *'REMOTE pg_archivecleanup timeout, retrying (1 of 1)' ]]
272 | [[ ${lines[-2]} = *'REMOTE pg_archivecleanup timeout, abort' ]]
273 | [[ ${lines[-1]} = *'done wal-cleanup: '\''data.master.0/'\'' '\''logs.complete/'\' ]]
274 | }
275 | 
276 | @test "wal-cleanup, remote timeout, lock" {
277 | cd /archive/
278 | run /app/wal-cleanup_2 data.master.0/ logs.complete/ test-archive03
279 | 
280 | echo "$output" >&2
281 | echo "*** DBG ${lines[-3]}" >&2
282 | echo "*** DBG ${lines[-1]}" >&2
283 | 
284 | [[ $status -eq 1 ]]
285 | [[ ${lines[-3]} = *'REMOTE pg_archivecleanup timeout, retrying (1 of 1)' ]]
286 | [[ ${lines[-1]} = *'REMOTE pg_archivecleanup still running, abort' ]]
287 | }
288 | 
289 | @test "wal-cleanup, remote network error" {
290 | cd /archive/
291 | run /app/wal-cleanup_2 data.master.0/ logs.complete/ test-archive03
292 | 
293 | echo "$output" >&2
294 | echo "*** DBG ${lines[-1]}" >&2
295 | 
296 | [[ $status -eq 1 ]]
297 | [[ ${lines[-1]} = *'error with REMOTE pg_archivecleanup, abort' ]]
298 | }
299 | 
300 | @test "wal-cleanup, local locked" {
301 | local lock_startup_timeout=10
302 | 
303 | coproc lock {
304 | flock -n /archive/wal-cleanup.lock \
305 | bash -c "echo locked; read -rt ${lock_startup_timeout};"
306 | }
307 | # wait for flock to start
308 | read -rt "${lock_startup_timeout}" -u "${lock[0]}" line
309 | if [[ $line != 'locked' ]]; then
310 | echo "'wal-cleanup, local locked': cannot start lock" >&2
311 | exit 255
312 | fi
313 | 
314 | cd /archive/
315 | run /app/wal-cleanup_2 data.master.0/ logs.complete/ test-archive03
316 | 
317 | echo done >&${lock[1]} || true # ignore timeout
318 | wait $lock_PID # no quotes, ignore if pid is empty
319 | jobs >&3
320 | 
321 | echo "$output" >&2
322 | echo "*** DBG ${lines[-1]}" >&2
323 | 
324 | [[ $status -eq 1 ]]
325 | [[ ${lines[-1]} = *'lock file '\''/archive/wal-cleanup.lock'\'' from '*' locked, abort' ]]
326 | }
327 | 
328 | @test "wal-cleanup, local lock access error" {
329 | cd /archive/
330 | run /app/wal-cleanup_2 data.master.0/ /tmp/ test-archive03
331 | 
332 | echo "$output" >&2
333 | echo "*** DBG ${lines[-1]}" >&2
334 | 
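# the derived lock file path is not writable here, so opening it must fail
# and wal-cleanup_2 aborts before doing any cleanup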
335 | [[ $status -eq 1 ]]
336 | [[ ${lines[-1]} = *' /wal-cleanup.lock: Permission denied' ]]
337 | }
--------------------------------------------------------------------------------
/pg_archive2/tests/0001_test-archive_cmd.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 | # -*- mode: sh; -*-
3 | 
4 | load vars
5 | load functions
6 | 
7 | _archive_cleanup ()
8 | {
9 | ssh test-archive02 rm -f /archive/wals/000000010000000000000001
10 | ssh test-archive03 rm -f /archive/wals/000000010000000000000001
11 | ssh test-archive02 rm -fr /lib/init/rw/pg_recv_sb/master5/
12 | ssh test-archive02 rm -f /archive/LASTLOG
13 | ssh test-archive03 rm -f /archive/LASTLOG
14 | rm -f /var/lib/postgresql/LASTLOG
15 | rm -f /tmp/send_wal_errors_cnt*
16 | }
17 | 
18 | stat_wrong_size_mock ()
19 | {
20 | if [[ $3 = '/tmp/pg_xlog/000000010000000000000001' ]]; then
21 | echo "*** DBG stat_wrong_size_mock: $@" >&3
22 | echo '2019-04-23 22:10:11.673849012 +0300|4096'
23 | return 0
24 | else
25 | command stat "$@"
26 | fi
27 | }
28 | 
29 | stat_size_lost_mock ()
30 | {
31 | if [[ $3 = '/tmp/pg_xlog/000000010000000000000001' ]]; then
32 | echo "*** DBG stat_size_lost_mock: $@" >&3
33 | echo '2019-04-23 22:10:11.673849012 +0300'
34 | return 0
35 | else
36 | command stat "$@"
37 | fi
38 | }
39 | 
40 | # use /archive/wals for these tests; do not destroy /archive/logs.complete/,
41 | # it will be used in other tests
42 | setup ()
43 | {
44 | case "$BATS_TEST_DESCRIPTION" in
45 | "archive_cmd")
46 | # refresh IPs in known_hosts; this removes future "Permanently added the ECDSA host key for IP address"
47 | # messages from the test output
48 | ssh test-archive02 true > /dev/null 2>&1
49 | ssh test-archive02 ssh test-archive03 true > /dev/null 2>&1
50 | ssh test-archive03 true > /dev/null 2>&1
51 | # clean up the test stand
52 | _archive_cleanup
53 | ;;
54 | "archive_cmd, second archive unreachable")
55 | ;;
56 | "archive_cmd, skip already sent")
57 | ;;
58 | "archive_cmd, duplicate, same md5")
59 | ;;
60 | "archive_cmd, duplicate, different md5")
61 | mkdir /tmp/archive_status/
62 | dd conv=swab status=none if=/tmp/pg_xlog/000000010000000000000001 of=/tmp/000000010000000000000001
63 | md5sum /tmp/pg_xlog/000000010000000000000001 /tmp/000000010000000000000001 \
64 | | sed \
65 | -e 's@/tmp/pg_xlog/000000010000000000000001@/tmp/unpacked@' \
66 | -e 's@/tmp/000000010000000000000001@/lib/init/rw/pg_recv_sb/master5/000000010000000000000001.bad_md5@' \
67 | > /tmp/md5
68 | ;;
69 | "archive_cmd, wrong size")
70 | _copy_function stat_wrong_size_mock stat
71 | export -f stat
72 | ;;
73 | "archive_cmd, size lost")
74 | _copy_function stat_size_lost_mock stat
75 | export -f stat
76 | ;;
77 | "archive_cmd, size zero")
78 | mkdir /tmp/archive_status/
79 | touch /tmp/000000010000000000000001
80 | ;;
81 | esac
82 | }
83 | 
84 | teardown ()
85 | {
86 | case "$BATS_TEST_DESCRIPTION" in
87 | "archive_cmd")
88 | _archive_cleanup
89 | ;;
90 | "archive_cmd, second archive unreachable")
91 | _archive_cleanup
92 | ;;
93 | "archive_cmd, skip already sent")
94 | _archive_cleanup
95 | ;;
96 | "archive_cmd, duplicate, same md5")
97 | _archive_cleanup
98 | ;;
99 | "archive_cmd, duplicate, different md5")
100 | _archive_cleanup
101 | rm -f /tmp/000000010000000000000001
102 | rmdir /tmp/archive_status/
103 | rm -f /tmp/md5
104 | ssh test-archive02 rm -f /tmp/unpacked
105 | ;;
106 | "archive_cmd, wrong size")
107 | _archive_cleanup
108 | unset -f stat
109 | ;;
110 | "archive_cmd, size lost")
111 | _archive_cleanup
112 | unset -f stat
113 | ;;
114 | "archive_cmd, size zero")
115 | _archive_cleanup
116 | rm -f /tmp/000000010000000000000001
117 | rmdir /tmp/archive_status/
118 | ;;
119 | esac
120 | }
121 | 
122 | 
123 | @test "archive_cmd" {
124 | # /tmp/pg_xlog saved in Dockerfile
125 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
126 | 
127 | local local=$(< /var/lib/postgresql/LASTLOG)
128 | local remote1=$(ssh test-archive02 cat /archive/LASTLOG)
129 | local remote2=$(ssh test-archive03 cat /archive/LASTLOG)
130 | 
131 | echo "$output" >&2
132 | 
133 | echo "local" >&2
134 | tree -Dspug /archive/ >&2
135 | 
136 | echo "test-archive02" >&2
137 | ssh test-archive02 tree -Dspug /archive/ >&2
138 | 
139 | echo "test-archive03" >&2
140 | ssh test-archive03 tree -Dspug /archive/ >&2
141 | 
142 | echo "local: $local" >&2
143 | echo "remote1: $remote1" >&2
144 | echo "remote2: $remote2" >&2
145 | 
146 | [[ $status -eq 0 ]]
147 | [[ ! $output ]]
148 | 
149 | ssh test-archive02 test -f /archive/wals/000000010000000000000001
150 | ssh test-archive03 test -f /archive/wals/000000010000000000000001
151 | 
152 | [[ $local = '000000010000000000000001' ]]
153 | [[ $remote1 = '000000010000000000000001' ]]
154 | [[ $remote2 = '000000010000000000000001' ]]
155 | }
156 | 
157 | @test "archive_cmd, second archive unreachable" {
158 | run /app/archive_cmd_2 'test-archive02 test-UNREACHABLE-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
159 | 
160 | echo "$output" >&2
161 | echo "*** DBG ${lines[-1]}" >&2
162 | 
163 | [[ $status -eq 0 ]]
164 | ssh test-archive02 test -f /archive/wals/000000010000000000000001
165 | [[ ${lines[-1]} = *' can'\''t sync file '\''000000010000000000000001'\'' to host '\''test-UNREACHABLE-archive03'\' ]]
166 | }
167 | 
168 | @test "archive_cmd, skip already sent" {
169 | run /app/archive_cmd_2 'test-archive02 ""' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
170 | run /app/archive_cmd_2 'test-archive02 ""' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
171 | 
172 | echo "$output" >&2
173 | echo "*** DBG ${lines[-1]}" >&2
174 | 
175 | [[ $status -eq 0 ]]
176 | ssh test-archive02 test -f /archive/wals/000000010000000000000001
177 | [[ ${lines[-1]} = 'File '\''000000010000000000000001'\'' was already sent to archive. Skipping...' ]]
178 | }
179 | 
180 | @test "archive_cmd, duplicate, same md5" {
181 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
182 | # force a duplicate send
183 | rm -f /var/lib/postgresql/LASTLOG
184 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
185 | 
186 | echo "$output" >&2
187 | echo "*** DBG ${lines[-1]}" >&2
188 | 
189 | [[ $status -eq 0 ]]
190 | ssh test-archive02 test -f /archive/wals/000000010000000000000001
191 | [[ ${lines[-1]} = 'WARN: /archive/wals/000000010000000000000001 already exist with same md5' ]]
192 | }
193 | 
194 | @test "archive_cmd, duplicate, different md5" {
195 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
196 | # force a duplicate send
197 | rm -f /var/lib/postgresql/LASTLOG
198 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/000000010000000000000001 000000010000000000000001 master5
199 | 
200 | ssh test-archive02 'pbzip2 -d < /archive/wals/000000010000000000000001 > /tmp/unpacked'
201 | 
202 | echo "$output" >&2
203 | echo "*** DBG ${lines[-2]}" >&2
204 | echo "*** DBG ${lines[-1]}" >&2
205 | 
206 | echo "WAL local:" >&2
207 | ls -l /tmp/pg_xlog/000000010000000000000001 >&2
208 | echo "WAL remote:" >&2
209 | ssh test-archive02 ls -l /archive/wals/000000010000000000000001 /tmp/unpacked >&2
210 | echo "md5 local:" >&2
211 | md5sum \
212 | /tmp/pg_xlog/000000010000000000000001 \
213 | /tmp/000000010000000000000001 >&2
214 | echo "md5 remote:" >&2
215 | ssh test-archive02 md5sum \
216 | /tmp/unpacked \
217 | /lib/init/rw/pg_recv_sb/master5/000000010000000000000001.bad_md5 \
218 | /archive/wals/000000010000000000000001 >&2
219 | 
220 | [[ $status -eq 1 ]]
221 | ssh test-archive02 test -f /archive/wals/000000010000000000000001
222 | ssh test-archive02 test -f /lib/init/rw/pg_recv_sb/master5/000000010000000000000001.bad_md5
223 | ssh test-archive02 md5sum -c < /tmp/md5 >&2
224 | [[ ${lines[-2]} = 'ERROR: /archive/wals/000000010000000000000001 already exist with different md5' ]]
225 | [[ ${lines[-1]} = "ERROR: can't send '000000010000000000000001' to archive host 'test-archive02'. Exit code: '1'" ]]
226 | }
227 | 
228 | @test "archive_cmd, wrong size" {
229 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
230 | 
231 | echo "$output" >&2
232 | echo "*** DBG ${lines[-2]}" >&2
233 | echo "*** DBG ${lines[-1]}" >&2
234 | 
235 | [[ $status -eq 1 ]]
236 | ssh test-archive02 test -f /lib/init/rw/pg_recv_sb/master5/000000010000000000000001.new.bad
237 | ssh test-archive02 test ! -e /archive/wals/000000010000000000000001
238 | [[ ${lines[-2]} = 'ERROR: /lib/init/rw/pg_recv_sb/master5/000000010000000000000001.new.bad size 16777216, expected 4096 bytes, cat exit 0' ]]
239 | [[ ${lines[-1]} = "ERROR: can't send '000000010000000000000001' to archive host 'test-archive02'. Exit code: '1'" ]]
240 | }
241 | 
242 | @test "archive_cmd, size lost" {
243 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/pg_xlog/000000010000000000000001 000000010000000000000001 master5
244 | 
245 | echo "$output" >&2
246 | echo "*** DBG ${lines[-1]}" >&2
247 | 
248 | [[ $status -eq 1 ]]
249 | ssh test-archive02 test ! -e /archive/wals/000000010000000000000001
250 | [[ ${lines[-1]} = "ERROR: can't send '000000010000000000000001' to archive host 'test-archive02'. Exit code: '1'" ]]
251 | }
252 | 
253 | @test "archive_cmd, size zero" {
254 | run /app/archive_cmd_2 'test-archive02 test-archive03' /archive/wals /tmp/000000010000000000000001 000000010000000000000001 master5
255 | 
256 | echo "$output" >&2
257 | echo "*** DBG ${lines[-2]}" >&2
258 | echo "*** DBG ${lines[-1]}" >&2
259 | 
260 | [[ $status -eq 1 ]]
261 | ssh test-archive02 test ! -e /archive/wals/000000010000000000000001
262 | [[ ${lines[-2]} = 'ERROR: /lib/init/rw/pg_recv_sb/master5/000000010000000000000001.new.bad size 0, expected 0 bytes, cat exit 0' ]]
263 | [[ ${lines[-1]} = "ERROR: can't send '000000010000000000000001' to archive host 'test-archive02'. Exit code: '1'" ]]
264 | }
265 | 
--------------------------------------------------------------------------------