├── .gitignore
├── LICENSE
├── README
├── bin
│   ├── cmd_archiver
│   ├── cmd_queue
│   ├── cmd_standby
│   ├── cmd_worker.py
│   └── threaded_rsync.py
├── doc
│   ├── cmd_archiver.README
│   ├── cmd_queue.README
│   └── cmd_standby.README
├── etc
│   ├── cmd_archiver.ini.sample
│   ├── cmd_standby.ini.sample
│   ├── pg_conf
│   │   └── pg_conf.README
│   └── pg_conf_failover
│       └── pg_conf_failover.README
└── scripts
    └── cmd_standby.92.sql

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Command Prompt, Inc.
2 |
3 | Permission to use, copy, modify, and distribute this software and its
4 | documentation for any purpose, without fee, and without a written agreement
5 | is hereby granted, provided that the above copyright notice and this
6 | paragraph and the following two paragraphs appear in all copies.
7 |
8 | IN NO EVENT SHALL THE COMMAND PROMPT, INC. BE LIABLE TO ANY PARTY FOR
9 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
10 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
11 | EVEN IF THE COMMAND PROMPT, INC. HAS BEEN ADVISED OF THE POSSIBILITY OF
12 | SUCH DAMAGE.
13 |
14 | THE COMMAND PROMPT, INC. SPECIFICALLY DISCLAIMS ANY WARRANTIES,
15 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 | FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN
17 | "AS IS" BASIS, AND THE COMMAND PROMPT, INC. HAS NO OBLIGATIONS TO
18 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
19 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Please see the other .README files and the website[1] for detailed
2 | documentation.
3 |
4 | pitrtools is a Python + PostgreSQL application that assists in the
5 | provisioning of Hot Standby, Warm Standby, Cold Standby, Streaming
6 | Replication clusters and multiple standbys (slaves). It supports all
7 | PostgreSQL versions starting from 9.2.
8 |
9 | Requirements:
10 |
11 | * Rsync
12 | * Python >= 2.6
13 | * SSH
14 | * PostgreSQL 9.2+
15 |
16 | Website:
17 |
18 | [1] https://public.commandprompt.com/projects/pitrtools/wiki
19 |
--------------------------------------------------------------------------------
/bin/cmd_archiver:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """ LICENSE
4 |
5 | Copyright Command Prompt, Inc.
6 |
7 | Permission to use, copy, modify, and distribute this software and its
8 | documentation for any purpose, without fee, and without a written agreement
9 | is hereby granted, provided that the above copyright notice and this
10 | paragraph and the following two paragraphs appear in all copies.
11 |
12 | IN NO EVENT SHALL THE COMMAND PROMPT, INC. BE LIABLE TO ANY PARTY FOR
13 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
14 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
15 | EVEN IF THE COMMAND PROMPT, INC. HAS BEEN ADVISED OF THE POSSIBILITY OF
16 | SUCH DAMAGE.
17 |
18 | THE COMMAND PROMPT, INC. SPECIFICALLY DISCLAIMS ANY WARRANTIES,
19 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20 | FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN
21 | "AS IS" BASIS, AND THE COMMAND PROMPT, INC. HAS NO OBLIGATIONS TO
22 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23 |
24 | """
25 |
26 | # $Id$
27 |
28 | import os
29 | import re
30 | import sys
31 | import subprocess
32 | from cmd_worker import CMDWorker
33 |
34 | argslist = (("-C", "--config",
35 |              dict(dest="configfilename",
36 |                   action="store",
37 |                   help="the name of the archiver config file",
38 |                   metavar="FILE",
39 |                   default='cmd_archiver.ini')),
40 |             ('-F', '--file',
41 |              dict(dest="archivefile",
42 |                   action="store",
43 |                   help="Archive file",
44 |                   metavar="FILE")),
45 |             ("-I", "--init",
46 |              dict(dest="init",
47 |                   action="store_true",
48 |                   help="Initialize master environment")))
49 |
50 | classdict = (('state', 's', None),
51 |              ('rsync_flags', 's', ""),
52 |              ('slaves', 's', None),
53 |              ('user', 's', None),
54 |              ('r_archivedir', 's', None),
55 |              ('l_archivedir', 's', None),
56 |              ('ssh_timeout', 'i', None),
57 |              ('notify_ok', 's', None),
58 |              ('notify_warning', 's', None),
59 |              ('notify_critical', 's', None),
60 |              ('debug', 'b', False),
61 |              ('pgdata', 's', None),
62 |              ('pgcontroldata', 's', ""),
63 |              ('rsync_version', 'i', None),
64 |              ('includepath', 's', None),
65 |              ('ssh_debug', 'b', False))
66 |
67 |
68 | class CMDArchiver(CMDWorker):
69 |
70 |     def init_env_func(self):
71 |         """
72 |         Initialize the local queues so we can check each directory for left
73 |         over files
74 |         """
75 |         self.debuglog("init_env_func(): entered")
76 |         # bail out if archivedir exists and is not empty or is inaccessible.
77 |         if os.access(self.l_archivedir, os.F_OK):
78 |             if not os.access(self.l_archivedir, os.R_OK | os.W_OK | os.X_OK):
79 |                 raise Exception("init_env_func(): l_archivedir %s must have r/w/x bits set for the current user" % self.l_archivedir)
80 |
81 |             elif any(os.listdir(self.l_archivedir)):
82 |                 raise Exception("init_env_func(): l_archivedir %s must be empty" % self.l_archivedir)
83 |
84 |             else:
85 |                 self.log("init_env_func(): l_archivedir %s already exists" % self.l_archivedir, "WARNING")
86 |
87 |         mode = 0700
88 |         if not os.path.exists(self.l_archivedir):
89 |             os.mkdir(self.l_archivedir, mode)
90 |         for slave in self.slaves_list:
91 |             os.mkdir(os.path.join(self.l_archivedir, slave), mode)
92 |
93 |     def check_config(self):
94 |         super(CMDArchiver, self).check_config()
95 |
96 |         self.cmd_queue = os.path.join(self.pitr_bin_path, "cmd_queue")
97 |         pathvars = [self.pgdata, self.cmd_queue]
98 |         if not self.options.init:
99 |             pathvars.append(self.l_archivedir)
100 |         self.check_paths(pathvars)
101 |
102 |         if not os.access(self.cmd_queue, os.X_OK):
103 |             raise Exception("The cmd_queue file must have executable flag set.")
104 |
105 |     def get_pgcontroldata_func(self):
106 |         """
107 |         get_pgcontroldata_func doesn't actually do anything yet.
This is more 108 | for archival purposes so we can remember the regex 109 | """ 110 | if not self.pgcontroldata: 111 | print 'WARNING: path to pg_controldata utility is not set, assuming it\'s in PATH' 112 | pgcontroldata = 'pg_controldata' 113 | else: 114 | pgcontroldata = self.pgcontroldata 115 | try: 116 | cmd = os.popen("%s %s" % (str(pgcontroldata), str(self.pgdata))) 117 | #return cmd.readlines 118 | for row in cmd: 119 | match = re.search('^Prior checkpoint location: *.{1,}', '%s' % (str(row))) 120 | if match != None: 121 | print match 122 | except OSError, e: 123 | self.log("get_pgcontroldata_func(): %s" % e, "ERROR") 124 | exit(1) 125 | 126 | def archive_func(self): 127 | self.log("Archiving %s" % self.options.archivefile) 128 | 129 | if self.state != "online": 130 | self.notify_external(log=True, warning=True, message="cmd_archiver offline, queuing archives") 131 | return False 132 | 133 | try: 134 | # It is tempting to just hardlink it, but we need to make 135 | # at least one full copy since postgres might want to 136 | # recycle the WAL file at the same inode. The cmd_queue 137 | # will establish hardlinks to individual slave subdirs, 138 | # thus no extra space is taken by the WAL queue. 139 | 140 | # Local rsync does a better job when the file is already 141 | # there (e.g. when cmd_queue is at fault). 142 | self.debuglog("local rsync %s to %s/" % (self.options.archivefile, self.l_archivedir)) 143 | rsync = [self.rsync, 144 | os.path.join(self.pgdata, self.options.archivefile), 145 | os.path.join(self.l_archivedir, "")] 146 | ret = subprocess.call(rsync) 147 | if ret != 0: 148 | self.notify_external(log=True, critical=True, message=("local rsync returned error status: %d" % ret)) 149 | return False 150 | 151 | # We call cmd_queue every time and let it figure itself if 152 | # there's a copy running already. In case there is none, 153 | # the daemon flag comes handy. 154 | cmd_queue = [self.cmd_queue, "-C", self.options.configfilename, 155 | "--daemon"] 156 | self.debuglog("running cmd_queue as: %s" % repr(cmd_queue)) 157 | ret = subprocess.call(cmd_queue) 158 | if ret != 0: 159 | self.notify_external(log=True, critical=True, message=("cmd_queue returned error status: %d" % ret)) 160 | return False 161 | return True 162 | except Exception, e: 163 | self.notify_external(log=True, critical=True, message=("Failed to archive file '%s': %s" % (self.options.archivefile, e))) 164 | return False 165 | 166 | def main(self): 167 | # before we do anything, let's just check who we are 168 | if os.geteuid() == 0: 169 | exit("\nBad Mojo... no root access for this script\n") 170 | 171 | retval = 0 172 | try: 173 | self.parse_commandline_arguments(argslist) 174 | self.load_configuration_file() 175 | 176 | if self.options.init: 177 | print "We are initializing queues, one moment." 178 | self.init_env_func() 179 | else: 180 | self.debuglog("Archiver running") 181 | if self.options.archivefile: 182 | self.archive_func() 183 | else: 184 | print "Config OK. Use -F FILE to actually archive one." 
185 | 186 | except Exception, e: 187 | self.log(e, "ERROR") 188 | retval = 1 189 | 190 | self.debuglog("Archiver exiting with status %d" % retval) 191 | return retval 192 | 193 | 194 | if __name__ == '__main__': 195 | archiver = CMDArchiver(classdict) 196 | retval = archiver.main() 197 | exit(retval) 198 | -------------------------------------------------------------------------------- /bin/cmd_queue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ LICENSE 4 | 5 | Copyright Command Prompt, Inc. 6 | 7 | Permission to use, copy, modify, and distribute this software and its 8 | documentation for any purpose, without fee, and without a written agreement 9 | is hereby granted, provided that the above copyright notice and this 10 | paragraph and the following two paragraphs appear in all copies. 11 | 12 | IN NO EVENT SHALL THE COMMAND PROMPT, INC. BE LIABLE TO ANY PARTY FOR 13 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 14 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, 15 | EVEN IF THE COMMAND PROMPT, INC. HAS BEEN ADVISED OF THE POSSIBILITY OF 16 | SUCH DAMAGE. 17 | 18 | THE COMMAND PROMPT, INC. SPECIFICALLY DISCLAIMS ANY WARRANTIES, 19 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 | FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN 21 | "AS IS" BASIS, AND THE COMMAND PROMPT, INC. HAS NO OBLIGATIONS TO 22 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 23 | 24 | """ 25 | 26 | import os 27 | import errno 28 | import subprocess 29 | from cmd_worker import CMDWorker 30 | 31 | argslist = (("-C", "--config", 32 | dict(dest="configfilename", 33 | action="store", 34 | help="the name of the archiver config file", 35 | metavar="FILE", 36 | default='cmd_archiver.ini')), 37 | ("-d", "--daemon", 38 | dict(dest="daemon", 39 | action="store_true", 40 | help="background daemon mode"))) 41 | 42 | classdict = (('rsync_flags', 's', ""), 43 | ('slaves', 's', None), 44 | ('user', 's', None), 45 | ('r_archivedir', 's', None), 46 | ('l_archivedir', 's', None), 47 | ('ssh_timeout', 'i', None), 48 | ('notify_ok', 's', None), 49 | ('notify_warning', 's', None), 50 | ('notify_critical', 's', None), 51 | ('debug', 'b', False), 52 | ('pgdata', 's', None), 53 | ('rsync_version', 'i', None), 54 | ('includepath', 's', None), 55 | ('ssh_debug', 'b', False), 56 | ('queue_user', 's', None), 57 | ('max_queue_workers', 'i', None), 58 | ('queue_wait', 'i', None)) 59 | 60 | pid_file_name = "cmd_queue.pid" 61 | 62 | class CMDQueue(CMDWorker): 63 | 64 | def check_config(self): 65 | super(CMDQueue, self).check_config() 66 | 67 | if self.max_queue_workers <= 0: 68 | raise Exception("The max_queue_workers setting should be greater than zero.") 69 | 70 | import pwd 71 | if os.geteuid() != pwd.getpwnam(self.queue_user).pw_uid: 72 | raise Exception("Only user '%s' is allowed to run cmd_queue according to the config file." 
                            % self.queue_user)
73 |
74 |         self.pid_file = os.path.join(self.l_archivedir, pid_file_name)
75 |
76 |         pathvars = [self.pgdata, self.l_archivedir]
77 |         for slave in self.slaves_list:
78 |             pathvars.append(os.path.join(self.l_archivedir, slave))
79 |         self.check_paths(pathvars)
80 |
81 |     def list_pending_slaves(self):
82 |         hosts = []
83 |         for slave in self.slaves_list:
84 |             try:
85 |                 if os.listdir(os.path.join(self.l_archivedir, slave)):
86 |                     hosts.append(slave)
87 |                     self.debuglog("slave `%s' queue not empty" % slave)
88 |             except OSError, e:
89 |                 self.log(e, "ERROR")
90 |         return hosts
91 |
92 |     def ship_logs_to_slave(self, slave):
93 |         cmd = [self.rsync]
94 |         cmd.append("--remove-sent-files" if self.rsync_version == 2 else "--remove-source-files")
95 |         archivepath = os.path.join(self.l_archivedir, slave, "")  # ensure trailing /
96 |         cmd.extend(['-e', '%s %s' % (self.ssh, self.ssh_flags),
97 |                     '-r', archivepath,
98 |                     '%s@%s:%s/' % (self.user, slave, self.r_archivedir)])
99 |         # extra flags should follow default ones to take effect
100 |         cmd.extend(self.rsync_flags.split())
101 |         if self.debug:
102 |             cmd.append("-v")
103 |         else:
104 |             cmd.append("-q")
105 |         self.debuglog("Shipping pending logs via: %s" % cmd)
106 |         return subprocess.Popen(cmd)
107 |
108 |     def wait_for_worker(self):
109 |         pid, ret = os.wait()
110 |         self.debuglog("a child rsync worker PID %d finished with status %d" % (pid, ret))
111 |         child = self.workers.pop(pid)
112 |         if ret != 0:
113 |             self.notify_external(log=True, critical=True, message="rsync error %d when shipping to %s" % (ret, child.slave_name))
114 |
115 |     def process_new_archive_files(self):
116 |         """
117 |         Hard-link any files found under the local archive dir to the slave
118 |         subdirs, removing the original link name. Skip our .pid file.
119 |
120 |         Occasionally, we might grab an rsync temp file that it is going
121 |         to rename to the final destination. Fortunately, these temp
122 |         file names start with a dot. Let's ignore any "hidden"
123 |         files anyway.
124 |         """
125 |         self.debuglog("checking for new archive files in %s" % self.l_archivedir)
126 |         files = os.listdir(self.l_archivedir)
127 |         for name in files:
128 |             file = os.path.join(self.l_archivedir, name)
129 |             # process regular files only, exclude our own .pid file and
130 |             # any "hidden" files
131 |             if not name.startswith('.') and name != pid_file_name and os.path.isfile(file):
132 |                 self.debuglog("Found new archive file: %s" % name)
133 |                 # count the number of links to the original name
134 |                 linked = 0
135 |                 for slave in self.slaves_list:
136 |                     target = os.path.join(self.l_archivedir, slave, name)
137 |                     try:
138 |                         os.link(file, target)
139 |                         linked += 1
140 |                     except OSError, e:
141 |                         if e.errno == errno.EEXIST:
142 |                             linked += 1
143 |                         else:
144 |                             self.notify_external(log=True, critical=True, message=("Failed to link archive file: %s" % e))
145 |                 # Only unlink the original name when every one of the
146 |                 # slaves has a link.
147 |                 self.debuglog("linked to %d slave dirs out of %d" % (linked, len(self.slaves_list)))
148 |                 if linked == len(self.slaves_list):
149 |                     os.unlink(file)
150 |
151 |     def update_slaves(self):
152 |         self.process_new_archive_files()
153 |
154 |         slaves = self.list_pending_slaves()
155 |         self.debuglog("list of slaves pending sync: %s" % repr(slaves))
156 |         for slave in slaves:
157 |             self.debuglog("going to ship logs to slave '%s'" % slave)
158 |             # Check if we're already running the maximum allowed
159 |             # number of worker rsync processes; if so, wait for one of
160 |             # them to finish.
161 |             if len(self.workers) >= self.max_queue_workers:
162 |                 self.debuglog("we're already running %d child workers, waiting..." % len(self.workers))
163 |                 self.wait_for_worker()
164 |
165 |             # there's at least one free worker slot, start a worker:
166 |             try:
167 |                 child = self.ship_logs_to_slave(slave)
168 |                 self.debuglog("started child rsync worker PID %d" % child.pid)
169 |                 child.slave_name = slave
170 |                 self.workers[child.pid] = child
171 |             except Exception, e:
172 |                 self.log(e, "ERROR")
173 |
174 |         if len(self.workers) > 0:
175 |             self.debuglog("waiting for any remaining workers to finish...")
176 |             while len(self.workers) > 0:
177 |                 self.wait_for_worker()
178 |             self.debuglog("all workers finished")
179 |
180 |     def check_postmaster_alive(self):
181 |         return os.path.exists(os.path.join(self.pgdata, "postmaster.pid"))
182 |
183 |     def run(self):
184 |         import time
185 |
186 |         self.workers = dict()  # a mapping of PIDs to Popen objects
187 |         while True:
188 |             self.update_slaves()
189 |             if not self.check_postmaster_alive():
190 |                 # this check is not in the loop condition, to make sure
191 |                 # we run at least once
192 |                 self.log("postmaster isn't running anymore, exiting", "NOTICE")
193 |                 break
194 |             self.debuglog("sleeping for %d seconds" % self.queue_wait)
195 |             time.sleep(self.queue_wait)
196 |
197 |     def check_pid_file(self):
198 |         file = None
199 |         try:
200 |             self.debuglog("trying to open pid file: %s" % self.pid_file)
201 |             file = open(self.pid_file)
202 |             other_pid = int(file.readline())
203 |             self.debuglog("checking PID %d" % other_pid)
204 |             os.kill(other_pid, 0)
205 |             return False
206 |         except IOError, e:
207 |             self.debuglog(repr(e))
208 |             if e.errno != errno.ENOENT:
209 |                 self.log("Failed to read the PID file: %s" % e, "ERROR")
210 |                 return False
211 |         except ValueError, e:
212 |             # assume no other instance running
213 |             self.debuglog(repr(e))
214 |         except OSError, e:
215 |             self.debuglog(repr(e))
216 |             if e.errno != errno.ESRCH:  # No such process
217 |                 return False
218 |         except Exception, e:
219 |             self.log(e, "ERROR")
220 |         finally:
221 |             if file:
222 |                 file.close()
223 |         return True  # this means no other pid is running
224 |
225 |     def make_pid_file(self):
226 |         file = None
227 |         try:
228 |             self.debuglog("writing pid file: %s" % self.pid_file)
229 |             file = open(self.pid_file, "w")
230 |             file.write("%s\n" % os.getpid())
231 |         finally:
232 |             if file:
233 |                 file.close()
234 |
235 |     def remove_pid_file(self):
236 |         if os.path.exists(self.pid_file):
237 |             os.unlink(self.pid_file)
238 |
239 |     def main(self):
240 |         # before we do anything, let's just check who we are
241 |         if os.geteuid() == 0:
242 |             exit("\nBad Mojo... no root access for this script\n")
243 |
244 |         retval = 0
245 |         try:
246 |             self.parse_commandline_arguments(argslist)
247 |             self.load_configuration_file()
248 |
249 |             if not self.check_pid_file():
250 |                 print "An instance of cmd_queue is already running?"
251 | # this should return success as cmd_archiver is 252 | # running us every time 253 | else: 254 | if self.options.daemon: 255 | self.debuglog("going into background daemon mode...") 256 | # set SIGHUP handler beforehand to avoid race 257 | # condition after fork() 258 | import signal 259 | signal.signal(signal.SIGHUP, signal.SIG_IGN) 260 | 261 | pid = os.fork() 262 | if pid != 0: 263 | self.debuglog("forked PID %d" % pid) 264 | exit(0) 265 | # the child process goes on 266 | 267 | try: 268 | self.make_pid_file() 269 | self.run() 270 | finally: 271 | self.remove_pid_file() 272 | 273 | except Exception, e: 274 | self.log(e, "ERROR") 275 | retval = 1 276 | 277 | self.debuglog("cmd_queue exiting with status %d" % retval) 278 | return retval 279 | 280 | 281 | if __name__ == '__main__': 282 | queue = CMDQueue(classdict) 283 | retval = queue.main() 284 | exit(retval) 285 | -------------------------------------------------------------------------------- /bin/cmd_standby: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # 4 | # cmd_standby copyright command prompt inc 5 | # 6 | # 7 | # $Id$ 8 | """ 9 | License 10 | 11 | Copyright Command Prompt, Inc. 12 | 13 | Permission to use, copy, modify, and distribute this software and its 14 | documentation for any purpose, without fee, and without a written agreement 15 | is hereby granted, provided that the above copyright notice and this 16 | paragraph and the following two paragraphs appear in all copies. 17 | 18 | IN NO EVENT SHALL COMMAND PROMPT, INC BE LIABLE TO ANY PARTY FOR 19 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 20 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, 21 | EVEN IF COMMAND PROMPT, INC HAS BEEN ADVISED OF THE POSSIBILITY OF 22 | SUCH DAMAGE. 23 | 24 | COMMAND PROMPT, INC SPECIFICALLY DISCLAIMS ANY WARRANTIES, 25 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 26 | FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN 27 | "AS IS" BASIS, AND COMMAND PROMPT, INC HAS NO OBLIGATIONS TO 28 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
29 | """ 30 | 31 | import os 32 | import shutil 33 | import sys 34 | import re 35 | import subprocess 36 | 37 | from sys import * 38 | from os import system 39 | from cmd_worker import CMDWorker 40 | from time import sleep 41 | 42 | 43 | argslist = (("-A", "--action", dict(dest="pgctl_action", action="store", help="Start/Stop PostgreSQL", metavar="start|stop|stop_basebackup")), 44 | ("-B", "--basebackup", dict(dest="base_backup", action="store_true", help="Start a base backup", metavar="FILE")), 45 | ("-C", "--config", dict(dest="configfilename", action="store", help="Name of the archiver config file", metavar="FILE", default='cmd_standby.ini')), 46 | ("-F", "--failover", dict(dest="failover", action="store", help="Value is a 3 digit number 999", metavar="VALUE")), 47 | ("-I", "--dbinit", dict(dest="dbinit", action="store_true", help="Use before -B", metavar="FILE")), 48 | ("-P", "--ping", dict(dest="ping_check", action="store_true", help="Is my master alive?", metavar="FILE")), 49 | ("-R", "--recovertotime", dict(dest="recovertotime", action="store", help="To restore to a specific point in time", metavar="TIMESTAMP")), 50 | ("-S", "--standby", dict(dest="standby", action="store_true", help="Enter standby mode", metavar="FILE"))) 51 | 52 | classdict = (('pgversion', 's', None), 53 | ('rsync_flags', 's', ""), 54 | ('rsync_threads', 'i', ""), 55 | ('master_public_ip', 's', None), 56 | ('master_local_ip', 's', ""), 57 | ('user', 's', None), 58 | ('debug', 'b', False), 59 | ('ssh_debug', 'b', False), 60 | ('port', 'i', None), 61 | ('ssh_timeout', 'i', None), 62 | ('archivedir', 's', None), 63 | ('includepath', 's', None), 64 | ('pgdata', 's', None), 65 | ('pg_conf', 's', ""), 66 | ('pg_hba_conf', 's', ""), 67 | ('pg_conf_failover', 's', ""), 68 | ('pg_hba_conf_failover', 's', ""), 69 | ('no_copy_conf', 'b', False), 70 | ('recovery_conf', 's', ""), 71 | ('logfile', 's', ""), 72 | ('action_failover', 's', ""), 73 | ('use_streaming_replication', 'b', False), 74 | ('trigger_file', 's', ""), 75 | ('repl_db_user', 's', 'postgres'), 76 | ('repl_db_password', 's', ""), 77 | ('sslmode', 's', 'prefer'), 78 | ('notify_ok', 's', None), 79 | ('notify_warning', 's', None), 80 | ('notify_critical', 's', None)) 81 | 82 | 83 | class RsyncFailure(Exception): 84 | """ Class to propagate rsync exceptions""" 85 | pass 86 | 87 | 88 | class SSHFailure(Exception): 89 | """ Class to propagate SSH errors, i.e. wrong password """ 90 | pass 91 | 92 | 93 | class RemoteNoData(Exception): 94 | """ Class to propagate errors when remote host returns empty data """ 95 | pass 96 | 97 | 98 | class CMDStandby(CMDWorker): 99 | 100 | # Validate the command line 101 | # XXX: replace those parser.error with exceptions in the future 102 | # for now, they seem to be fine, cause they only happen in the very 103 | # beginning, when to resources are allocated yet. 
104 |     @staticmethod
105 |     def command_line_check_func(parser, options):
106 |         if options.configfilename == None:
107 |             parser.error("option -C is required")
108 |
109 |         if options.recovertotime and not (options.failover == '999'):
110 |             parser.error("option -R requires option -F999")
111 |
112 |         if options.pgctl_action:
113 |             valid_action = ['start', 'stop', 'stop_basebackup', 'start_basebackup']
114 |             if options.pgctl_action not in valid_action:
115 |                 parser.error("option -A requires one of start, stop, stop_basebackup or start_basebackup")
116 |
117 |     def get_remote_psql(self):
118 |         """Get path to psql on master"""
119 |         if 'master_public_ip' in vars(self):
120 |             if 'includepath' not in vars(self):
121 |                 raise Exception("CONFIG: No path in config file, can't find executables")
122 |             path = " ".join(self.includepath.split(os.pathsep))
123 |             ssh = '%s -o PasswordAuthentication=no %s' % (self.ssh, self.master_public_ip)
124 |             getpaths = subprocess.check_output('%s "find %s -ignore_readdir_race -name psql"' % (ssh, path), shell=True)
125 |             getpaths = getpaths.strip().split("\n")
126 |             if getpaths:
127 |                 self.r_psql = getpaths[0]
128 |             else:
129 |                 raise Exception("Couldn't find remote psql")
130 |
131 |     def locate_binaries(self):
132 |         exes = ["pg_ctl"]
133 |         if not self.use_streaming_replication:
134 |             exes.append("pg_standby")
135 |         else:
136 |             if self.options.recovertotime:
137 |                 raise Exception('CONFIG: Unable to use recovery_target_time with streaming replication')
138 |             exes.append("pg_archivecleanup")
139 |
140 |         super(CMDStandby, self).locate_binaries(CMDWorker.COMMON_BIN_NAMES + exes)
141 |
142 |     # Let's make sure archive directory and config files can be reached
143 |     def check_config(self):
144 |         super(CMDStandby, self).check_config()
145 |
146 |         if float(self.pgversion) < 9.2:
147 |             raise Exception('CONFIG: PITRTools only supports PostgreSQL versions 9.2+')
148 |
149 |         pathvars = [self.pg_conf, self.pg_hba_conf]
150 |         self.check_paths(pathvars)
151 |
152 |         # make a new directory if archivedir doesn't exist.
153 |         if not os.access(self.archivedir, os.R_OK | os.W_OK | os.X_OK):
154 |             os.makedirs(self.archivedir, 0700)
155 |
156 |         if self.options.failover:
157 |             # check that the configuration files are all set.
158 |             if not self.pg_conf_failover or not self.pg_hba_conf_failover:
159 |                 raise Exception("CONFIG: failover is requested but pg_conf_failover and/or pg_hba_conf_failover are not set, exiting")
160 |             else:
161 |                 for f in (self.pg_conf_failover, self.pg_hba_conf_failover):
162 |                     if not os.access(f, os.F_OK | os.R_OK):
163 |                         raise Exception("CONFIG: unable to read the failover file %s" % str(f))
164 |
165 |     # Create pg_xlog dir if missing, follow symlink if present
166 |     # XXX: change os.stat to os.access
167 |     def check_pgxlog_path_func(self):
168 |         pg_xlog_dir = "%s/%s" % (str(self.pgdata), str('pg_xlog'))
169 |         success = False
170 |         try:
171 |             pg_xlog_dir = os.path.realpath(pg_xlog_dir)
172 |             # XXX: we might be able to stat the directory, but still get
173 |             # undesired permissions (i.e. no write access).
174 |             os.stat(pg_xlog_dir)
175 |             success = True
176 |         except:
177 |             try:
178 |                 self.debuglog("check_pgxlog_path_func(): pg_xlog_dir = %s" % pg_xlog_dir)
179 |                 os.makedirs(pg_xlog_dir, 0700)
180 |             except OSError, e:
181 |                 self.log("check_pgxlog_path_func(): %s" % e, "ERROR")
182 |                 self.log("You may have permission problems; Make sure user %s can create directory %s" % (self.user, pg_xlog_dir))
183 |             except Exception, e:
184 |                 self.log("check_pgxlog_path_func(): %s" % e, "ERROR")
185 |             else:
186 |                 success = True
187 |         return success
188 |
189 |     def set_rsync_options(self):
190 |         self.rsync_flags = "-al %s --delete " % (self.rsync_flags)
191 |         if self.debug:
192 |             self.rsync_flags += '-v --stats '
193 |
194 |     def set_pg_standby_options(self):
195 |         self.pg_standby_flags = '-s5 -w0 -c '
196 |         if self.debug:
197 |             self.pg_standby_flags += '-d '
198 |         self.pg_standby_args = "%f %p %r "
199 |
200 |     def set_connect_and_copy_options(self):
201 |         # Yes, the odd count of " is needed because of the way we have to quote within the command.
202 |         # There may be a better way to do this, but I got tired of fighting.
203 |         ssh_connect = """%s %s %s@%s """ % (self.ssh, self.ssh_flags, self.user, self.master_public_ip)
204 |         psql_connect = """ "%s -A -t -U%s -p%s -dpostgres """ % (self.r_psql, self.user, self.port)
205 |
206 |         if self.master_local_ip:
207 |             psql_connect += '-h%s ' % (self.master_local_ip,)
208 |         self.copy_dirs = [self.rsync]
209 |         self.copy_dirs.extend(self.rsync_flags.split())
210 |         self.copy_dirs.extend(["--exclude=pg_log/", "--exclude=pg_xlog/", "--exclude=postgresql.conf", "--exclude=pg_hba.conf", "--exclude=postmaster.pid", "-e",
211 |                                self.ssh, "%s@%s:" % (self.user, self.master_public_ip)])
212 |         self.ssh_psql = ssh_connect + psql_connect
213 |         self.pgctl_base = [self.pg_ctl, "-D", self.pgdata]
214 |         if self.no_copy_conf:
215 |             # we don't copy configuration files to the data directory; instead,
216 |             # set command-line options to get them from their pristine locations
217 |             if not self.options.failover:
218 |                 pg_conf = self.pg_conf
219 |                 pg_hba_conf = self.pg_hba_conf
220 |             else:
221 |                 pg_conf = self.pg_conf_failover
222 |                 pg_hba_conf = self.pg_hba_conf_failover
223 |             # XXX: pg_ctl is sensitive to the order of -c options:
224 |             # config_file always goes first, and additional '-o' options don't work,
225 |             # so both postgresql.conf and pg_hba.conf should be in a single -o.
226 |             config_file_opts = ""
227 |             if pg_conf and os.access(pg_conf, os.F_OK | os.R_OK):
228 |                 config_file_opts += "-c config_file=%s " % (pg_conf,)
229 |             if pg_hba_conf and os.access(pg_hba_conf, os.F_OK | os.R_OK):
230 |                 config_file_opts += "-c hba_file=%s " % (pg_hba_conf,)
231 |             if config_file_opts:
232 |                 self.pgctl_base += ["-o", config_file_opts]
233 |         if self.logfile:
234 |             self.pgctl_base += ["-l", self.logfile]
235 |
236 |     def set_recovery_options(self):
237 |         # set more sane values for the trigger and recovery files now that we
238 |         # know the pgdata value:
239 |         if self.trigger_file == "":
240 |             self.trigger_file = "%s/cmd_end_recovery" % (self.pgdata,)
241 |         if self.recovery_conf == "":
242 |             self.recovery_conf = """%s/recovery.conf""" % (self.pgdata,)
243 |         # Recovery string for recovery.conf
244 |         self.recovery_stopped_file = re.sub(r'recovery.conf$', 'recovery.stopped', self.recovery_conf)
245 |
246 |         # 3 different recovery_string options:
247 |         # no streaming replication and no failover - use pg_standby with restore_command
248 |         # SR - include both primary_conninfo and restore_command w/o pg_standby.
249 |         # failover - use only restore_command w/o pg_standby.
250 |
251 |         if self.use_streaming_replication and self.options.failover != '999':
252 |             restore_command = """'cp %s/%%f "%%p"' """ % (self.archivedir,)
253 |             cleanup_string = """'%s %s %%r'""" % (self.pg_archivecleanup, self.archivedir)
254 |             primary_conninfo_string = "host=%s port=%s user=%s sslmode=%s " % (self.master_public_ip, self.port, self.repl_db_user, self.sslmode)
255 |             # streaming replication and not failover; if the password is not supplied
256 |             # it's expected to be found in .pgpass or via the PGPASSWORD variable,
257 |             # see http://www.postgresql.org/docs/current/static/libpq-envars.html
258 |             if len(self.repl_db_password) > 0:
259 |                 primary_conninfo_string += "password=%s " % (self.repl_db_password,)
260 |             self.recovery_string = """standby_mode = 'on'\nprimary_conninfo = '%s'\ntrigger_file = '%s'\nrestore_command = %s\narchive_cleanup_command = %s """ % (primary_conninfo_string, self.trigger_file, restore_command, cleanup_string)
261 |         elif self.options.failover == '999':
262 |             self.recovery_string = """restore_command = 'cp %s/%%f "%%p"' """ % (self.archivedir,)
263 |             if self.options.recovertotime:
264 |                 self.recovery_string += """\nrecovery_target_time = '%s' """ % (self.options.recovertotime,)
265 |         else:
266 |             self.recovery_string = """restore_command = '%s %s %s %s' """ % (self.pg_standby, self.pg_standby_flags, self.archivedir, self.pg_standby_args)
267 |
268 |     def set_options(self):
269 |         self.set_rsync_options()
270 |         self.set_pg_standby_options()
271 |         self.set_connect_and_copy_options()
272 |         self.set_recovery_options()
273 |
274 |     # Check the master for being alive
275 |     def ping_check_func(self):
276 |         success = False
277 |         try:
278 |             output = self.exec_query_on_primary("""'SELECT 1'""", False)
279 |         except RemoteNoData, e:
280 |             self.log("ping_check_func(): remote host returned no data for query: %s" % e, "ERROR")
281 |         except SSHFailure:
282 |             self.log("ping_check_func(): Received error code 255 from ssh, likely due to the wrong password", "ERROR")
283 |         except Exception, e:
284 |             self.log("ping_check_func(): %s" % e, "ERROR")
285 |         else:
286 |             for row in output:
287 |                 row = row.rstrip('\n')
288 |                 self.debuglog("ping_check_func(): row = %s" % row)
289 |                 if str(row) != "1":
290 |                     self.notify_external(critical=True, message="no response from master")
291 |                 else:
292 |                     success = True
293 |         if success:
294 |             print "Got response from the master"
295 |         else:
296 |             print "ERROR: no response from the master"
297 |         return success
298 |
299 |     # TODO: replace popen with subprocess module calls.
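    # A possible replacement is sketched here (an untested assumption, not
    # the project's implementation); it keeps the existing self.ssh_psql
    # shell string and runs it through the shell, mirroring what os.popen
    # does in the method below:
    #
    #     p = subprocess.Popen("%s -c %s\"" % (self.ssh_psql, query),
    #                          shell=True, stdout=subprocess.PIPE)
    #     result = p.stdout.readlines()
    #     if p.wait() != 0:
    #         ...  # note: Popen.wait() returns the exit code directly,
    #              # so ssh failure would be status 255, not 255 << 8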
300 |     def exec_query_on_primary(self, query, emptyok=True):
301 |         """ Runs a database query on the primary node and returns the psql output"""
302 |         self.debuglog("exec_query_on_primary: executing query `%s' via: %s" % (query, self.ssh_psql))
303 |         p = os.popen("%s -c %s\"" % (self.ssh_psql, query))
304 |         result = p.readlines()
305 |         exitstatus = p.close()
306 |         if exitstatus:
307 |             # check whether ssh has terminated due to an error, likely an incorrect password
308 |             if (exitstatus >> 8) == 255:
309 |                 raise SSHFailure
310 |             raise Exception("command returned non-zero exit status")
311 |         if len(result) == 0 and not emptyok:
312 |             raise RemoteNoData(query)
313 |         return result
314 |
315 |     # This function gives us all the non-pgdata directories required
316 |     # for operation, such as tablespaces
317 |     def primary_get_tablespace_paths(self):
318 |         try:
319 |             paths = self.exec_query_on_primary("""'SELECT * FROM cmd_get_data_dirs()'""")
320 |         except:
321 |             self.log("primary_get_tablespace_paths(): Unable to get namespace paths; did you apply the helper scripts in cmd_standby.sql?", "ERROR")
322 |             raise
323 |         return paths
324 |
325 |     def primary_get_datadir_path(self):
326 |         try:
327 |             path = self.exec_query_on_primary("""'SELECT * FROM cmd_get_pgdata() LIMIT 1'""", False)
328 |         except RemoteNoData:
329 |             self.log("primary_get_datadir_path(): Unable to get namespace paths; did you apply the helper scripts in cmd_standby.sql?", "ERROR")
330 |             raise
331 |         return path[0]
332 |
333 |     # Start a base backup on the master.
334 |     # First we issue a checkpoint and then a start backup
335 |     def start_backup_func(self):
336 |         retval = os.system("rm -rf %s/*" % (self.archivedir,))
337 |         if retval:
338 |             self.log("Unable to rm old archives: return code %d" % retval, "WARNING")
339 |         output = self.exec_query_on_primary(""" 'checkpoint' """, False)
340 |         for row in output:
341 |             row = row.rstrip('\n')
342 |             self.debuglog("start_backup_func(): row = %s" % row)
343 |             if str(row) != "CHECKPOINT":
344 |                 self.notify_external(log=True, critical=True, message="Unable to execute CHECKPOINT")
345 |                 raise Exception("ERROR: Unable to execute CHECKPOINT")
346 |         output = self.exec_query_on_primary(""" 'SELECT cmd_pg_start_backup()' """, False)
347 |         for row in output:
348 |             row = row.rstrip('\n')
349 |             self.debuglog("start_backup_func(): cmd_pg_start_backup: row = %s" % row)
350 |             if str(row) != "1":
351 |                 raise Exception("ERROR: Unable to start base backup")
352 |
353 |     def stop_backup_func(self):
354 |         """
355 |         Stop the base backup. This function catches all exceptions internally and
356 |         indicates errors with its return status, unlike its other _backup
357 |         counterparts. The rationale is that since it's the last call in a
358 |         function we can avoid an extra try..catch block.
359 | """ 360 | success = False 361 | try: 362 | output = self.exec_query_on_primary(""" 'SELECT cmd_pg_stop_backup()' """) 363 | except RemoteNoData, e: 364 | self.log("stop_backup_func(): remote host returned no data for query: %s" % e, "ERROR") 365 | except SSHFailure: 366 | self.log("stop_backup_func(): received error code 255 from ssh, likely due to the wrong password", "ERROR") 367 | except Exception, e: 368 | self.log("stop_backup_func(): %s" % e, "ERROR") 369 | else: 370 | for row in output: 371 | row = row.rstrip('\n') 372 | self.debuglog("stop_backup_func(): cmd_pg_stop_backup: row = %s" % row) 373 | if str(row) == "1": 374 | success = True 375 | if not success: 376 | self.log("stop_backup_func(): Unable to stop base backup", "ERROR") 377 | return success 378 | 379 | # Simple function to help ensure we have all paths created for postgresql 380 | def dbinit_func(self): 381 | check = self.check_pgpid_func() 382 | if check == 0: 383 | self.log("dbinit_func(): Can not execute --dbinit with PG running locally", "ERROR") 384 | return False 385 | # check whether pgdata and tablespace paths exist, create if not. 386 | try: 387 | paths = self.primary_get_tablespace_paths() 388 | paths.insert(0, self.pgdata) 389 | for row in paths: 390 | if self.debug: 391 | print "DEBUG: " + row 392 | row = row.rstrip('\n') 393 | if not os.path.isdir(row): 394 | os.makedirs(row, 0700) 395 | except OSError, e: 396 | self.log("dbinit_func(): %s " % e, "ERROR") 397 | self.log("You may have permission problems; make sure user %s can create directory %s" % (self.user, self.pgdata)) 398 | except Exception, e: 399 | self.log("dbinit_func(): %s " % e, "ERROR") 400 | else: 401 | self.log("dbinit_func(): Standby filesystem is ready") 402 | return True 403 | self.log("dbinit_func(): failed", "ERROR") 404 | return False 405 | 406 | # Takes a base backup of master. This function is tricky because 407 | # there is a possibility of a non 0 exit status even when successful 408 | def base_backup_func(self): 409 | # first, copy tablespaces 410 | paths = self.primary_get_tablespace_paths() 411 | for row in paths: 412 | row = row.rstrip('\n') 413 | self.debuglog("base_backup_func(): row = %s" % row) 414 | retval = self.rsync_dir(row) 415 | if retval and retval != 23 and retval != 24: # 23, 24 incomplete or vanished source file. 416 | self.log("base_backup_func(): Couldn't rsync", "ERROR") 417 | raise RsyncFailure 418 | # finally, copy over pgdata 419 | # Before, doing the rsync, make sure we cleanup pg_xlog for streaming replication 420 | # XXX: does it makes sense at all, given that we don't copy pg_xlog from master? 
421 | if self.use_streaming_replication: 422 | self.debuglog("base_backup_func(): Cleaning up master pg_xlog directory before rsync") 423 | os.system("rm -rf " + self.pgdata + "/pg_xlog/*") 424 | master_pgdata = self.primary_get_datadir_path().strip('\n') 425 | retval = self.rsync_dir(master_pgdata, self.pgdata) 426 | if retval and retval != 23 and retval != 24: 427 | self.log("base_backup_func(): Couldn't rsync", "ERROR") 428 | raise RsyncFailure 429 | 430 | def rsync_dir(self, dirfrom, dirto): 431 | cmd_args = list(self.copy_dirs) # make a copy 432 | # last arg of copy_dirs is "user@master:", concat the remote dir 433 | cmd_args[-1] += dirfrom + "/" 434 | cmd_args += [dirto + "/"] 435 | if self.rsync_threads > 1: 436 | trsync_path = os.path.join(self.pitr_bin_path, "threaded_rsync.py") 437 | cmd_args = [trsync_path, "--num_threads", str(self.rsync_threads), " ".join(cmd_args)] 438 | self.debuglog("running rsync as: %s" % cmd_args) 439 | return subprocess.call(cmd_args) 440 | 441 | # Start postgresql 442 | def start_postgresql_func(self): 443 | pgctl = self.pgctl_base 444 | pgctl.insert(1, "start") 445 | retval = subprocess.call(pgctl) 446 | if retval: 447 | self.notify_external(log=True, warning=True, message="Unable to start PostgreSQL") 448 | return False 449 | # check whether PostgreSQL has already started. Sleep to give it time to write 450 | # the pid file. 451 | sleep(2) 452 | check = self.check_pgpid_func() 453 | if check != 0: 454 | self.log("start_postgresql_func(): PostgreSQL refused to start", "ERROR") 455 | return False 456 | return True 457 | 458 | # Stop postgresql 459 | def stop_postgresql_func(self): 460 | pgctl = self.pgctl_base + ["-m", "fast"] 461 | pgctl.insert(1, "stop") 462 | retval = subprocess.call(pgctl) 463 | if retval: 464 | self.notify_external(critical=True, message="Unable to stop PostgreSQL") 465 | return False 466 | return True 467 | 468 | # Writes recovery.conf file to pgdata 469 | def write_recovery_func(self): 470 | try: 471 | file = open(self.recovery_conf, 'w') 472 | file.write('%s' % (self.recovery_string)) 473 | file.close() 474 | except Exception, e: 475 | self.log("write_recovery_func(): Unable to write recovery file %s" % self.recovery_conf) 476 | self.notify_external(critical=True, message="Unable to write recovery file: %s" % (e,)) 477 | raise 478 | 479 | # Copies in production slave configurations from storage location to production pgdata location 480 | def copy_conf(self, failover=False): 481 | # return if we opt to set command-line options to read files from 482 | # their original location. 
483 | if self.no_copy_conf: 484 | return 485 | if failover: 486 | pg_conf = self.pg_conf_failover; 487 | pg_hba_conf = self.pg_hba_conf_failover; 488 | else: 489 | pg_conf = self.pg_conf; 490 | pg_hba_conf = self.pg_hba_conf; 491 | 492 | for f in (pg_conf, pg_hba_conf): 493 | try: 494 | if f: 495 | if os.access(f, os.F_OK | os.R_OK): 496 | shutil.copy(f, self.pgdata) 497 | else: 498 | self.log("copy_conf(): %s is inaccessible" % f, "WARNING") 499 | except Exception: 500 | self.log("Unable to copy configuration file %s" % f, "ERROR") 501 | raise 502 | 503 | # Standby function, we need to write the recovery configuration 504 | # and start postgresql 505 | def standby_func(self): 506 | result = False 507 | check = self.check_pgpid_func() 508 | if check == 0: 509 | self.log("standby_func(): Can not enter standby mode if PG is already running", "ERROR") 510 | return False 511 | try: 512 | self.write_recovery_func() 513 | except Exception, e: 514 | self.log("standby_func(): %s" % e, "ERROR") 515 | else: 516 | self.copy_conf() 517 | result = self.start_postgresql_func() 518 | if not result: 519 | self.log("standby_func(): Unable to enter standby mode", "ERROR") 520 | else: 521 | self.log("standby_func(): Successfully entered standby mode") 522 | return result 523 | 524 | # Function allows you to specify a script to execute on failover 525 | # The script must return 0 to be considered successful 526 | def failover_action_func(self): 527 | if self.action_failover: 528 | retval = system("%s" % (self.action_failover)) 529 | if retval: 530 | self.notify_external(critical=True, message="failover action returned non-zero exit code") 531 | return False 532 | else: 533 | self.log("Statistics are not replicated in warm standby mode. Try executing ANALYZE on your databases.") 534 | return True 535 | 536 | def run_base_backup(self): 537 | result = False 538 | check = self.check_pgpid_func() 539 | if check == 0: 540 | self.log("run_base_backup(): Cannot take base backup with PG running locally", "ERROR") 541 | else: 542 | try: 543 | self.start_backup_func() 544 | self.debuglog("start_backup_func() handled") 545 | self.base_backup_func() 546 | self.debuglog("base_backup_func() handled") 547 | if self.check_pgxlog_path_func(): 548 | result = True 549 | self.debuglog("check_pgxlog_path_func() handled") 550 | else: 551 | self.notify_external(critical=True, message="unable to access xlog path") 552 | except RsyncFailure: 553 | self.log("run_base_backup(): rsync returned non-zero exit status. Check logged errors as this may be harmless.", "WARNING") 554 | except RemoteNoData, e: 555 | self.log("run_base_backup(): remote host returned no data for query: %s" % e, "ERROR") 556 | except SSHFailure: 557 | self.log("run_base_backup(): received error code 255 from ssh, likely due to the wrong password", "ERROR") 558 | except Exception, e: 559 | self.log("run_base_backup(): %s" % e, "ERROR") 560 | finally: 561 | result = self.stop_backup_func() and result 562 | if result: 563 | self.log("run_base_backup(): Base backup finished successfully") 564 | else: 565 | self.log("run_base_backup(): Base backup is unsuccessful and should be restarted", "ERROR") 566 | return result 567 | 568 | def do_failover(self): 569 | success = True 570 | try: 571 | check = self.check_pgpid_func() 572 | # Note: we already check that use_streaming_replication is not issued 573 | # with recovertotime inside of check_config. 
574 |             if self.use_streaming_replication:
575 |                 if check == 0:
576 |                     # Postgres is running, just touch the trigger file and all should be good.
577 |                     try:
578 |                         file = open(self.trigger_file, "w")
579 |                         file.close()
580 |                     except:
581 |                         self.log("do_failover(): unable to write trigger file %s" % self.trigger_file, "ERROR")
582 |                         raise
583 |                 else:
584 |                     # pgsql is NOT running, and we are trying to fail over. Let's try to rename recovery.conf to recovery.stopped and try to start pgsql.
585 |                     try:
586 |                         os.rename(self.recovery_conf, self.recovery_stopped_file)
587 |                     except:
588 |                         self.log("do_failover(): unable to rename %s to %s" % (self.recovery_conf, self.recovery_stopped_file), "ERROR")
589 |                         raise
590 |             else:
591 |                 # XXX: trying to stop the server that does recovery may not work
592 |                 if check == 0:
593 |                     self.stop_postgresql_func()
594 |                 self.write_recovery_func()
595 |                 # XXX: if it's streaming replication and the server is already running - then it's running with an old set of configuration files.
596 |                 self.copy_conf(failover=True)
597 |         except Exception, e:
598 |             self.log("do_failover(): %s" % e, "ERROR")
599 |             success = False
600 |         else:
601 |             # try to start PostgreSQL if it's not running
602 |             if not self.use_streaming_replication or check != 0:
603 |                 success = success and self.start_postgresql_func()
604 |         if not success:
605 |             self.log("do_failover(): failover FAILED", "ERROR")
606 |         else:
607 |             self.log("do_failover(): failover finished")
608 |             # technically, failover has succeeded at this moment, but we still may fail
609 |             # at the failover action script. Distinguish that case from failure at failover
610 |             success = success and self.failover_action_func()
611 |         if success:
612 |             self.notify_external(ok=True, message="successful failover")
613 |         return success
614 |
615 |     def main(self):
616 |         # before we do anything, let's just check who we are
617 |         if os.geteuid() == 0:
618 |             sys.exit("\nBad Mojo... no root access for this script\n")
619 |
620 |         retval = 0
621 |         try:
622 |             self.parse_commandline_arguments(argslist, CMDStandby.command_line_check_func)
623 |             self.load_configuration_file()
624 |
625 |             # Locate executables on master
626 |             self.get_remote_psql()
627 |             # Configure ssh command-line string, rsync options,
628 |             # pg_standby arguments and recovery options.
629 | self.set_options() 630 | 631 | success = True 632 | # perform different actions depending on command-line switches 633 | if self.options.dbinit: 634 | success = self.dbinit_func() 635 | 636 | elif self.options.base_backup: 637 | success = self.run_base_backup() 638 | 639 | elif self.options.failover == '999': 640 | success = self.do_failover() 641 | 642 | elif self.options.standby: 643 | success = self.standby_func() 644 | 645 | elif self.options.ping_check: 646 | success = self.ping_check_func() 647 | 648 | elif self.options.pgctl_action == 'start': 649 | self.copy_conf() 650 | success = self.start_postgresql_func() 651 | 652 | elif self.options.pgctl_action == 'stop': 653 | success = self.stop_postgresql_func() 654 | 655 | elif self.options.pgctl_action == 'stop_basebackup': 656 | success = self.stop_backup_func() 657 | 658 | else: 659 | print "Config OK" 660 | 661 | if not success: 662 | retval = 1 663 | except Exception, e: 664 | self.log(e, "ERROR") 665 | retval = 2 666 | 667 | self.debuglog("Standby exiting with status %d" % retval) 668 | return retval 669 | 670 | if __name__ == '__main__': 671 | standby = CMDStandby(classdict) 672 | retval = standby.main() 673 | exit(retval) 674 | -------------------------------------------------------------------------------- /bin/cmd_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Base class for CMDStandby and CMDArchiver. 4 | 5 | import os 6 | import sys 7 | import time 8 | import traceback 9 | import subprocess 10 | from optparse import * 11 | from ConfigParser import * 12 | 13 | class CMDWorker(object): 14 | """ 15 | Base class for CMDArchiver and CMDStandby, 16 | containing common routines to read configuration options, 17 | notify external programs and do basic sanity checks. 
18 | """ 19 | 20 | def __init__(self, classdict): 21 | self.classdict = classdict 22 | self.pitr_bin_path = os.path.realpath( 23 | os.path.join(os.getcwd(), os.path.dirname(__file__)) 24 | ) 25 | 26 | def parse_commandline_arguments(self, argslist, options_check_cb=None, 27 | usage="usage: %prog [options] arg1 arg2", version="%prog (pitrtools) 1.3\n\nCopyright Command Prompt, Inc.\n\nFor licensing information see the LICENSE file.\n"): 28 | 29 | parser = OptionParser(usage=usage,version=version) 30 | 31 | for arg in argslist: 32 | parser.add_option(arg[0], arg[1], **arg[2]) 33 | 34 | self.options, self.args = parser.parse_args() 35 | if options_check_cb: 36 | options_check_cb(parser, self.options) 37 | 38 | def load_configuration_file(self, set_defaults_cb=None): 39 | result = dict() 40 | config = ConfigParser() 41 | files = config.read(self.options.configfilename) 42 | if not files: 43 | raise Exception('Configuration file %s is empty or not found' % (self.options.configfilename,)) 44 | for opt in self.classdict: 45 | key, typ, default = opt 46 | val = None 47 | try: 48 | if typ == 's': 49 | val = config.get('DEFAULT', key) 50 | elif typ == 'b': 51 | val = config.getboolean('DEFAULT', key) 52 | elif typ == 'i': 53 | val = config.getint('DEFAULT', key) 54 | elif typ == 'f': 55 | val = config.getfloat('DEFAULT', key) 56 | else: 57 | raise Exception('Invalid type for %s: %s' % (key, typ)) 58 | except NoOptionError: 59 | if default != None: 60 | val = default 61 | else: 62 | raise 63 | result[key] = val 64 | if set_defaults_cb: 65 | set_defaults_cb(result) 66 | self.__dict__.update(result) 67 | 68 | self.check_config() 69 | self.locate_binaries() 70 | 71 | # checks config values and sets some common defaults 72 | def check_config(self): 73 | # set up our ssh transfer timeout and debug options 74 | self.ssh_flags = "-o ConnectTimeout=%s -o StrictHostKeyChecking=no" % (self.ssh_timeout,) 75 | if self.ssh_debug: 76 | self.ssh_flags += " -vvv" 77 | 78 | if 'slaves' in self.__dict__: 79 | self.slaves_list = self.slaves.replace(" ", "").split(",") 80 | if not any(self.slaves_list): 81 | raise Exception("Refusing to run with empty or invalid slaves list.") 82 | 83 | @staticmethod 84 | def check_paths(pathvars): 85 | for element in pathvars: 86 | os.stat(element) 87 | 88 | COMMON_BIN_NAMES = ["rsync", "ssh"] 89 | 90 | #Get and set the required absolute paths for executables 91 | def locate_binaries(self, exes=COMMON_BIN_NAMES): 92 | found = [] 93 | exe_paths = [] 94 | final_paths = {} 95 | 96 | #Generator yielding joined paths of directories and filenames [used for searching] 97 | def search(dirs, names): 98 | for f in names: 99 | for directory in dirs: 100 | abspath = os.path.join(directory, f) 101 | yield f,abspath 102 | 103 | path = [] 104 | if "PATH" in os.environ: 105 | envpath = os.environ['PATH'].split(os.pathsep) 106 | path.extend(envpath) 107 | if 'includepath' in vars(self): 108 | includepath = self.includepath.split(os.pathsep) 109 | if path: 110 | unique = set(includepath).difference(set(envpath)) 111 | path.extend(unique) 112 | else: 113 | path.extend(includepath) 114 | if not path: 115 | raise Exception("CONFIG: No PATH in environment, and includepath not set in config. 
Can't find executables.")
116 |
117 |         #Start searching
118 |         for exe, abspath in search(path, exes):
119 |             if os.access(abspath, os.X_OK) and exe not in found:
120 |                 exe_paths.append(abspath)
121 |                 found.append(exe)
122 |
123 |         #Raise exception if we couldn't find all the executables
124 |         if len(exes) > len(found):
125 |             raise Exception("CONFIG: Couldn't find executables: %s" % (", ".join(set(exes).difference(set(found)))))
126 |
127 |         #Populate final dict of names to paths, assign to self
128 |         for i, exe in enumerate(found):
129 |             final_paths[exe] = exe_paths[i]
130 |         self.__dict__.update(final_paths)
131 |
132 |     def pull_exception(self):
133 |         exc = sys.exc_info()
134 |         return traceback.format_exc(exc[2])
135 |
136 |     def log(self, msg, level="NOTICE"):
137 |         """
138 |         Log a message to stdout in the format:
139 |         [YYYY-MM-DD HH:MM:SS TZ] level: message
140 |
141 |         Arguments:
142 |         | argument | type | default | description
143 |         * msg - string - - Message to log
144 |         * level - string - NOTICE - Log level to prepend to message
145 |         """
146 |
147 |         timestamp = time.strftime("%F %T %Z")
148 |         print "[%s] %s: %s" % (timestamp, level, msg)
149 |         sys.stdout.flush()  # in case we've been running under the logging collector
150 |
151 |     def debuglog(self, msg):
152 |         if self.debug:
153 |             self.log(msg, "DEBUG")
154 |
155 |     def notify_external(self, log=False, ok=False, warning=False, critical=False, message=None):
156 |         """
157 |         Notify some external program (i.e. monitoring plugin)
158 |         about an event occurring. The program itself can be set
159 |         via the notify_* configuration options.
160 |
161 |         Arguments:
162 |         | argument | type | default | description
163 |         * log - boolean - False - Log the message with self.log if true.
164 |         * ok, warning, critical - boolean - False - If none is set True, immediately return.
165 |         * message - string - None - Will be appended to the end of the command.
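        Example (illustrative; alert.sh is a hypothetical script): if the
        config file sets notify_critical to /usr/local/bin/alert.sh, then
        notify_external(critical=True, message="oops") runs the shell
        command: /usr/local/bin/alert.sh oops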
166 | """ 167 | 168 | #Return if we don't have an alert status 169 | if not any((ok, warning, critical)): 170 | return 171 | if log and message: 172 | self.log(message) 173 | #Return if none of the notify commands are set in the config, but not before logging message 174 | if not filter(len, [self.notify_ok, self.notify_warning, self.notify_critical]): 175 | return 176 | if ok: 177 | exec_str = "%s" % (self.notify_ok,) 178 | elif warning: 179 | exec_str = "%s" % (self.notify_warning,) 180 | elif critical: 181 | exec_str = "%s" % (self.notify_critical,) 182 | if message: 183 | exec_str += " %s" % (message,) 184 | 185 | self.debuglog("notify_external exec_str: %s" % exec_str) 186 | subprocess.call(exec_str, shell=True) 187 | 188 | def check_pgpid_func(self): 189 | """ 190 | Checks to see if postgresql is running 191 | """ 192 | if self.debug: 193 | print "NOTICE: check_pgpid_func()" 194 | pidfilename = '%s/postmaster.pid' % (self.pgdata,) 195 | try: 196 | os.stat(pidfilename) 197 | pidfile = open(pidfilename, 'r') 198 | line = int(pidfile.readline()) 199 | os.kill(line, 0) 200 | return 0 201 | except: 202 | return 1 203 | 204 | 205 | if __name__ == '__main__': 206 | argslist = [("-C", "--config", 207 | dict(dest="configfilename", 208 | action="store", 209 | help="the name of the archiver config file", 210 | metavar="FILE"))] 211 | 212 | # test common config parameters 213 | classdict = (('rsync_flags', 's', ""), 214 | ('user', 's', None), 215 | ('ssh_timeout', 'i', None), 216 | ('notify_ok', 's', None), 217 | ('notify_warning', 's', None), 218 | ('notify_critical', 's', None), 219 | ('debug', 'b', False), 220 | ('pgdata', 's', None), 221 | ('pgcontroldata', 's', ""), 222 | ('includepath', 's', None), 223 | ('ssh_debug', 'b', False)) 224 | 225 | worker = CMDWorker(classdict) 226 | worker.parse_commandline_arguments(argslist) 227 | if worker.options.configfilename: 228 | worker.load_configuration_file() 229 | print worker.__dict__ 230 | -------------------------------------------------------------------------------- /bin/threaded_rsync.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Uses rsync to sync files in parallel between a source and destination. 5 | 6 | This script has been tested with Python 2.7.4 and 2.6.8; it requires 2.5+. 7 | This script has also been tested with rsync versions 3.0.9 and 2.6.8. 
8 |
9 |
10 | BASIC USAGE:
11 |
12 | Using rsync's archive mode along with --delete:
13 | threaded_rsync.py "/usr/bin/rsync -a --delete /path/to/src/ user@host:/path/to/dest"
14 |
15 | Specifying the number of threads to spawn for file copies (2 is the default),
16 | and debug mode (default is off):
17 | threaded_rsync.py "/usr/bin/rsync -a /path/to/src/ user@host:/path/to/dest" --num_threads 4 --debug
18 |
19 | We are not restricted to archive mode; using other sets of rsync flags:
20 | threaded_rsync.py "/usr/bin/rsync -rltv /path/to/src/ user@host:/path/to/dest"
21 |
22 |
23 | The script always uses your full rsync command that is passed in (although it
24 | may add additional arguments for certain phases of processing), so adding
25 | --dry-run means no changes will be made to the source or destination when using threaded_rsync.py:
26 | threaded_rsync.py "/usr/bin/rsync -a --delete --dry-run -vv /path/to/src/ user@host:/path/to/dest"
27 |
28 |
29 | LIMITING CPU AND DISK USAGE:
30 |
31 | Local Host
32 | Use custom nice and ionice settings to limit the cpu and disk contention on the
33 | local host; an example of setting the priority very low:
34 | threaded_rsync.py "nice -n 19 ionice -c 2 -n 7 /usr/bin/rsync -a /path/to/src/ user@host:/path/to/dest"
35 |
36 | Remote Host
37 | Similarly, you can prepend nice and ionice calls to rsync's --rsync-path option
38 | to change the priority of the remote host's rsync processes:
39 | threaded_rsync.py 'nice /usr/bin/rsync --rsync-path="nice /usr/bin/rsync" -a /path/to/src/ user@host:/path/to/dest'
40 |
41 |
42 | BASIC ALGORITHMIC DESCRIPTION:
43 |
44 | This script generates an initial list of files to sync by using a user-supplied
45 | rsync command. In the first phase --dry-run and --itemize-changes are added to
46 | this rsync command to find a list of files for parallel syncing. In phase two
47 | we sync each of these files individually using a different thread (the caller
48 | chooses how many threads to use). In the final phase we use the original rsync
49 | command again, but we exclude the files that were already synced; directories,
50 | symlinks, etc. are synced in this final (non-threaded) call.
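
For illustration, in the first phase a --dry-run --itemize-changes listing
such as the following (the exact -i columns vary with the rsync version):

    cd+++++++++ data/
    >f+++++++++ data/a.txt
    >f.st...... data/b.txt

would queue data/a.txt and data/b.txt for per-file threaded transfers, while
the directory itself is handled by the final non-threaded rsync call.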
51 | """
52 | 
53 | import optparse, re, subprocess, sys, tempfile, time
54 | from threading import Thread
55 | from Queue import Queue
56 | 
57 | class rsync_in_parallel(object):
58 |     """Main class for managing parallel rsyncs"""
59 | 
60 |     def __init__(self, rsync_cmd, num_threads=2, debug=False):
61 |         """arguments are:
62 |         the user's rsync command,
63 |         the number of threads to spawn for file transfers (default=2),
64 |         and a flag to show debug information (default=False)"""
65 |         self.rsync_cmd = rsync_cmd
66 |         self.num_threads = num_threads
67 |         self.debug = debug
68 |         self._initialize_file_transfer_list()
69 | 
70 |         self.queue = Queue()
71 | 
72 |         for i in range(self.num_threads):
73 |             worker = Thread(target=self._launcher, args=(i,))
74 |             worker.setDaemon(True)
75 |             worker.start()
76 | 
77 | 
78 |     def _initialize_file_transfer_list(self):
79 |         """This method constructs a list of files for (later) parallel transfer"""
80 | 
81 |         # we run the user's rsync command, but we add two flags:
82 |         #   --dry-run --itemize-changes
83 |         # this allows us to find the files that need to be transferred
84 |         p = subprocess.Popen(self.rsync_cmd + " --dry-run --itemize-changes", shell=True, stdout=subprocess.PIPE)
85 |         out = p.stdout.readlines()
86 |         # see the rsync man page docs for a complete description of the --itemize-changes output
87 |         # to make sense of the regular expression below; we are looking to transfer regular files
88 |         # ('f' in the second column below). We will transfer dirs, symlinks, etc. later, and all at once.
89 |         # rsync 3.0.9 uses 11 characters for -i output: YXcstpoguax
90 |         # rsync 2.6.8 uses 9 characters for -i output:  YXcstpogz
91 |         re_obj = re.compile(r"^[<>ch.]f[c.+][s.+][tT.+][p.+][o.+][g.+][uz.+][a.+]?[x.+]?\s(?P<file_name>.+)$")
92 | 
93 |         # a list of all files for parallel/threaded sync
94 |         self.file_list = []
95 |         for line in out:
96 |             #print "LINE:" + line
97 |             match = re_obj.match(line.strip())
98 | 
99 |             if match:
100 |                 file_path = match.groupdict()['file_name']
101 |                 self.file_list.append('/' + file_path)
102 |                 #print "MATCH:" + file_path
103 | 
104 |         if len(self.file_list) == 0:
105 |             print "WARN: no files will be transferred in parallel; check the output of --dry-run --itemize-changes with your rsync command to verify"
106 | 
107 |     def _launcher(self, i):
108 |         """Spawns an rsync process to update/sync a single file"""
109 |         while True:
110 |             file_path = self.queue.get()
111 |             if self.debug:
112 |                 print "Thread %s: %s" % (i, file_path)
113 | 
114 |             # take the user's rsync command but use --files-from to send just one specific file
115 |             # (parent directories of the file will be created automatically if they are needed)
116 |             temp = tempfile.NamedTemporaryFile()
117 |             temp.write(file_path)
118 |             temp.flush()
119 | 
120 |             cmd = "%s --files-from=%s" % (self.rsync_cmd, temp.name)
121 |             if self.debug:
122 |                 print "CALLING:" + cmd
123 | 
124 |             ret = subprocess.call(cmd, shell=True)
125 |             if ret != 0:
126 |                 print "WARN: could not transfer %s, rsync failed with error code=%s; continuing..." % (file_path, ret)
127 | 
128 |             temp.close()
129 |             self.queue.task_done()
130 | 
131 | 
132 |     def sync_files(self):
133 |         """The main entry point to start the sync processes"""
134 | 
135 |         # create a (synchronized) queue for the threads to access
136 |         for file_path in self.file_list:
137 |             self.queue.put(file_path)
138 |         self.queue.join()
139 | 
140 |         # now we perform the final call to rsync to sync directories, symlinks,
141 |         # perform deletes (if --delete was in the original command), etc.
142 |         # i.e., everything that remains beyond the parallel file transfers
143 |         # that have already occurred.
144 | 
145 |         # we could just issue the original command, but it will be faster to
146 |         # explicitly --exclude-from the files we already transferred (especially
147 |         # when --checksum is used in the original command)
148 |         temp = tempfile.NamedTemporaryFile()
149 |         for file_path in self.file_list:
150 |             temp.write(file_path + "\n")
151 |         temp.flush()
152 |         cmd = "%s --exclude-from=%s" % (self.rsync_cmd, temp.name)
153 | 
154 |         if self.debug:
155 |             print "Calling final rsync:" + cmd
156 |         ret = subprocess.call(cmd, shell=True)
157 |         if ret != 0:
158 |             print "WARN: potential problem with final rsync call, rsync failed with error code=%s" % ret
159 | 
160 |         temp.close()
161 |         return ret
162 | 
163 | 
164 | if __name__ == "__main__":
165 |     p = optparse.OptionParser(description="Python threaded rsync",
166 |                               prog="threaded_rsync.py",
167 |                               version="0.1",
168 |                               usage='%prog "<full rsync command>"')
169 | 
170 |     p.add_option('--num_threads', '-n', type="int", help="the number of spawned rsync file copy threads", default=2)
171 |     p.add_option('--debug', '-d', help="enable debugging output", action="store_true", default=False)
172 | 
173 |     options, arguments = p.parse_args()
174 | 
175 |     if options.debug:
176 |         print arguments
177 |         print options
178 | 
179 |     if len(arguments) != 1:
180 |         #print __doc__
181 |         p.print_help()
182 |         sys.exit(1)
183 | 
184 |     #start = time.time()
185 | 
186 |     rsync_cmd = arguments[0]
187 |     r = rsync_in_parallel(rsync_cmd, options.num_threads, options.debug)
188 |     ret = r.sync_files()
189 | 
190 |     #end = time.time()
191 |     #print "rsyncs completed in %s seconds" % (end - start)
192 | 
193 |     sys.exit(ret)
194 | 
--------------------------------------------------------------------------------
/doc/cmd_archiver.README:
--------------------------------------------------------------------------------
1 | 
2 | pitrtools
3 | © COPYRIGHT Command Prompt, Inc.
4 | May 5, 2013
5 | 
6 | 
7 | The cmd_archiver script is used in conjunction with PostgreSQL's native
8 | archive mode functionality to ship WAL segments to the configured standby
9 | server(s), where they are replayed.
10 | 
11 | * Automatically ship WAL files to the configured standby server(s)
12 | * Enable/disable archiving without the need to restart PostgreSQL
13 | * Alert based on failures
14 | 
15 | 
16 | Usage:
17 | 
18 | cmd_archiver [options] arg1 arg2
19 | 
20 | Options:
21 |   -h, --help            Show this help message and exit
22 |   -F FILE, --file=FILE  Archive file
23 |   -C FILE, --config=FILE
24 |                         The name of the archiver config file
25 |   -I, --init            Initialize master environment
26 | 
27 | 
28 | Naming conventions:
29 | 
30 | The master server is also referred to as the archiver.
31 | A slave is often called a standby.
32 | 
33 | These names are used interchangeably.
34 | 
35 | 
36 | The process:
37 | 
38 | pitrtools is just a wrapper around standard tools. If you are familiar
39 | with the process of setting up a warm standby, there's not much new to
40 | learn about what pitrtools does. It simply attempts to make the process
41 | simpler, so the order of steps looks like this:
42 | 
43 | 
44 | On the master server
45 | 
46 |   * Turn on archiving
47 |   * Install the helper scripts
48 |   * cmd_archiver -C $CONFIG -I
49 | 
50 | On the standby server
51 | 
52 |   * cmd_standby -C $CONFIG -I
53 |   * cmd_standby -C $CONFIG -B
54 |   * cmd_standby -C $CONFIG -S
55 | 
56 | 
57 | Prerequisites:
58 | 
59 | SSH key-based authentication between the master and standby (both ways)
60 | for the `postgres' user must be configured. Note that pitrtools is designed
61 | to be run by the `postgres' system user (in fact, any user but root), so
62 | your SSH keys must be generated for the `postgres' shell user.
63 | 
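For example, a minimal key setup run on the master (the standby hostname is
illustrative; repeat in the opposite direction on the standby, since keys
are needed both ways):

    $ sudo -u postgres ssh-keygen -t rsa
    $ sudo -u postgres ssh-copy-id postgres@standby.example.com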
64 | 
65 | Where to put pitrtools files:
66 | 
67 | The `postgres' user's home directory is the default recommended location,
68 | but the files can be located anywhere, as long as the `postgres' user can
69 | access them.
70 | 
71 | Since the pitrtools utilities are run by the `postgres' system user, make
72 | sure that proper permissions and ownership are set.
73 | 
74 | For example
75 | 
76 | $ chown -R postgres.postgres /var/lib/postgresql/pitrtools
77 | 
78 | 
79 | Notes on configuration:
80 | 
81 | On the master server you need to enable archiving. To do that, edit
82 | postgresql.conf:
83 | 
84 | 
85 | archive_mode = on
86 | archive_command = '/var/lib/postgresql/pitrtools/bin/cmd_archiver -C /var/lib/postgresql/pitrtools/etc/cmd_archiver.ini -F %p'
87 | 
88 | -C takes the path to the config file
89 | -F takes the path (or file name) of the WAL segment to archive.
90 | 
91 | 
92 | Install the helper scripts:
93 | 
94 | $ psql -U postgres < /var/lib/postgresql/pitrtools/scripts/cmd_standby.92.sql
95 | 
96 | (cmd_standby.92.sql is the script for PostgreSQL 9.2 and later, which are
97 | the only versions this release of pitrtools supports.)
98 | 
99 | 
100 | After that, initialize the master server environment:
101 | 
102 | $ pitrtools/bin/cmd_archiver -C pitrtools/etc/cmd_archiver.ini -I
103 | 
104 | You are done setting up the master server.
105 | 
106 | 
107 | 
108 | More about options:
109 | 
110 | --init
111 | 
112 | Used to create the appropriate archive/queue file directories.
113 | 
114 | 
115 | 
116 | cmd_archiver.ini options:
117 | 
118 | 
119 | 
120 | [DEFAULT]
121 | 
122 | state: online
123 | 
124 | You can switch the archiver to offline live, without touching PostgreSQL.
125 | Taking the archiver offline causes a nonzero exit status to be returned
126 | to PostgreSQL's archive command on each archive attempt. This is o.k.
127 | because PostgreSQL will just queue up the WAL files while the archiver is
128 | offline. However, this can cause you to accidentally fill up your postgres
129 | partition, so use with care.
130 | 
131 | pgdata: /var/lib/postgresql/9.2/main
132 | 
133 | The base database directory. This is the PGDATA on the archiving machine.
134 | 
135 | 
136 | r_archivedir: /var/lib/postgresql/archive
137 | 
138 | Where to remotely copy archives. The remote archive directory on the standby
139 | server, i.e. the directory that will accept all WAL segment files to be
140 | restored by the standby server.
141 | 
142 | l_archivedir: /var/lib/postgresql/archive
143 | 
144 | The local WAL archive directory. For every standby server a subdirectory
145 | named after the hostname or IP address of the standby will be created. The
146 | archiver will queue any WAL files that have failed to transfer to a
147 | standby under the respective subdirectory.
148 | 
149 | This is also used when you want to archive all files locally. For
150 | example, if you are using an NFS mount for the standby server to get its
151 | files.
152 | 
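For illustration, with slaves: 192.168.3.1,192.168.4.1 and the default
l_archivedir, the queue directories created by cmd_archiver -I would look
like this:

    /var/lib/postgresql/archive/
        192.168.3.1/    <- WAL files still pending shipment to this slave
        192.168.4.1/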
153 | includepath: /usr/bin:/bin:/usr/local/bin
154 | 
155 | Directories to be included in the search path for executables, separated by colons.
156 | 
157 | rsync_flags: -z
158 | 
159 | Optional flags to pass to rsync (compression, etc.)
160 | 
161 | rsync_version = 2
162 | 
163 | The rsync version; run `rsync --version' to see what version you have.
164 | Expects 2 or 3. If running RHEL 5 or similar it is likely 2.
165 | If you are running something that ships reasonably modern software
166 | it will be 3.
167 | 
168 | slaves: 192.168.3.1 or slaves: 192.168.3.1,192.168.4.1
169 | 
170 | The IP address of the standby server. A hostname/FQDN works as well.
171 | Use a comma-separated list for multiple slaves.
172 | 
173 | user: postgres
174 | 
175 | The UNIX shell user that runs rsync. This is the user that must have
176 | SSH keys configured between the master and standby.
177 | 
178 | ssh_timeout: 10
179 | 
180 | The SSH timeout. If it takes longer than this to begin the transfer, the
181 | archiver will return a nonzero status and alert per notify_critical.
182 | 
183 | queue_user: postgres
184 | 
185 | The name of the Unix user that will be running cmd_archiver or
186 | cmd_queue. The cmd_queue script will refuse to start if run
187 | by a different user.
188 | 
189 | queue_wait: 5
190 | 
191 | Number of seconds the cmd_queue process will sleep between
192 | tries to ship the accumulated WAL files to the slaves.
193 | 
194 | max_queue_workers: 1
195 | 
196 | The maximum allowed number of WAL-shipping rsync processes
197 | that cmd_queue will run in parallel.
198 | 
199 | notify_ok: echo OK
200 | 
201 | Everything is o.k., so what do we do? This should be the full path to a
202 | script; for example, a nagios alert.
203 | 
204 | notify_warning: echo WARNING
205 | 
206 | The same as OK, but for warnings.
207 | 
208 | notify_critical: echo CRITICAL
209 | 
210 | The same as OK, but for critical, i.e. total failure conditions.
211 | 
212 | debug: off
213 | 
214 | This will send a lot of noisy info to wherever you push your
215 | postgresql logging.
216 | 
217 | ssh_debug: off
218 | 
219 | Shows more information about the SSH session, if set to on.
220 | 
221 | 
--------------------------------------------------------------------------------
/doc/cmd_queue.README:
--------------------------------------------------------------------------------
1 | 
2 | pitrtools
3 | © COPYRIGHT Command Prompt, Inc.
4 | Nov 13, 2014
5 | 
6 | 
7 | The cmd_queue script is used by cmd_archiver internally, but can be run
8 | manually by the user if needed.
9 | 
10 | 
11 | Usage:
12 | 
13 | cmd_queue [options]
14 | 
15 | Options:
16 |   -h, --help              Show this help message and exit
17 |   -C FILE, --config=FILE  The name of the archiver config file
18 |   -d, --daemon            Enter background daemon mode
19 | 
20 | 
21 | The cmd_queue script uses the same config file as cmd_archiver (see
22 | cmd_archiver.README for configuration details.)
23 | 
24 | The job of this script is to run indefinitely, monitoring
25 | l_archivedir for newly arriving WAL files, and to make sure they are
26 | shipped to the slave hosts.
27 | 
28 | The cmd_archiver will put a copy of every new WAL file in l_archivedir,
29 | then call cmd_queue to fulfill shipping to the slaves. The cmd_queue
30 | script checks for its PID file, cmd_queue.pid under l_archivedir, to avoid
31 | running multiple instances, and will bail out if it detects another
32 | instance already running. Running cmd_queue for every archived file
33 | makes sure that it is restarted even after a crash/failure or when killed
34 | by the user. On the other hand, a user-started cmd_queue will be
35 | "reused" by cmd_archiver, i.e. no concurrent cmd_queue process will be
36 | running, because of the PID file check.
37 | 
38 | When cmd_queue detects a new file under l_archivedir it will try to
39 | make a hard link to the file in every slave directory, then remove the
40 | original link (but only if the links to all slaves were made
41 | successfully). Later, when the file is shipped to the last slave
42 | host, the last reference will be removed and the disk space is
43 | reclaimed.
44 | 
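To make the linking scheme concrete, here is a minimal Python sketch of the
hard-link fan-out described above (an illustration only, not the actual
pitrtools code; enqueue_wal and its arguments are hypothetical names):

    import os

    def enqueue_wal(l_archivedir, slaves, filename):
        """Queue a newly archived WAL file for every slave (illustrative)."""
        src = os.path.join(l_archivedir, filename)
        all_linked = True
        for slave in slaves:
            dst = os.path.join(l_archivedir, slave, filename)
            try:
                os.link(src, dst)   # hard link: no extra disk space consumed
            except OSError:
                all_linked = False  # leave src in place so a retry can finish
        if all_linked:
            # The per-slave links remain; shipping the last one releases the
            # final reference and reclaims the disk space.
            os.unlink(src)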
45 | If you have a number of slave hosts to ship WAL files to, and enough
46 | bandwidth and CPU power, you might want to set max_queue_workers to
47 | something higher than the default of 1. This tells cmd_queue to run up
48 | to that many rsync processes to individual slave hosts in parallel.
49 | Setting this configuration parameter to a value higher than the total
50 | number of slaves is not useful, however.
51 | 
52 | After every round of attempts to ship the pending WAL files to the
53 | slaves, the cmd_queue process will sleep for the number of seconds
54 | specified in the queue_wait configuration parameter. Tuning this
55 | parameter might help when establishing the ssh connection by rsync to a
56 | slave host has relatively high latency. With a higher value, more WAL
57 | files will accumulate locally, to be shipped later in one go.
58 | 
59 | Finally, cmd_queue will check if the master server's postmaster is
60 | still running by checking the $PGDATA/postmaster.pid file. If it is no
61 | longer alive, cmd_queue will break out of its infinite loop and exit
62 | with a success status. However, if you run cmd_queue manually with the
63 | postmaster stopped, it will still attempt to ship any pending WAL files
64 | to the slave hosts.
65 | 
--------------------------------------------------------------------------------
/doc/cmd_standby.README:
--------------------------------------------------------------------------------
1 | 
2 | pitrtools
3 | © COPYRIGHT Command Prompt, Inc.
4 | May 5, 2013
5 | 
6 | The cmd_standby script is a utility that allows easy management of all types of
7 | standby PostgreSQL servers, from cold and warm standbys to streaming
8 | replication. It is run on the standby host.
9 | 
10 | It currently has the following capabilities:
11 | 
12 | * Easily take a base backup, including tablespaces
13 | * Restore archives
14 | * Automatically purge old archives
15 | * Alert based on failures
16 | * Stop and start the PostgreSQL standby
17 | * Failover to the latest restore point, or point-in-time recovery
18 | 
19 | 
20 | 
21 | Usage:
22 | 
23 | cmd_standby [options] arg1 arg2
24 | 
25 | Options:
26 |   -h, --help            show this help message and exit
27 |   -A start|stop, --action=start|stop
28 |                         Start or stop PostgreSQL
29 |   -B, --basebackup      Take a base backup. Run with -I before this.
30 |   -C FILE, --config=FILE
31 |                         the name of the standby config file
32 |   -F VALUE, --failover=VALUE
33 |                         Fail over; VALUE must be the literal 3-digit number 999
34 | 
35 |   -I, --dbinit          Create local filesystem directories like on the master.
36 |   -P, --ping            Is my master alive?
37 |   -R TIMESTAMP, --recovertotime=TIMESTAMP
38 |                         If you need to restore to a specific point in time
39 |   -S, --standby         Enter standby mode
40 | 
41 | 
42 | 
43 | The process:
44 | 
45 | You need to apply cmd_standby.92.sql to the master database before
46 | starting the standby process on the slave.
47 | 
48 | The process of configuring a standby slave boils down to the following:
49 | 
50 | $ pitrtools/bin/cmd_standby -C pitrtools/etc/cmd_standby.ini -I
51 | 
52 | Fix any problems it reports; these are likely to be path or permission
53 | issues.
54 | 
55 | $ pitrtools/bin/cmd_standby -C pitrtools/etc/cmd_standby.ini -B
56 | 
57 | This will take the base backup and fix up pg_xlog if it is symlinked.
58 | 
59 | $ pitrtools/bin/cmd_standby -C pitrtools/etc/cmd_standby.ini -S
60 | 
61 | If you want a warm standby or anything fancier, use -S.
62 | If you want a cold one, you are done.
63 | 
64 | 
65 | To bring a warm standby into production, run a failover action:
66 | 
67 | $ pitrtools/bin/cmd_standby -C pitrtools/etc/cmd_standby.ini -F999
68 | 
69 | To bring a standby into production at a certain point in time, do:
70 | 
71 | $ pitrtools/bin/cmd_standby -C pitrtools/etc/cmd_standby.ini -F999 -R '2008-05-28 11:00:38.059389'
72 | 
73 | where -R takes a valid timestamp.
74 | 
75 | 
76 | 
77 | Configuration:
78 | 
79 | [DEFAULT]
80 | 
81 | pgversion: 9.2
82 | 
83 | The major version of PostgreSQL we are running. NOTE: versions below 9.2 are not compatible.
84 | 
85 | use_streaming_replication: off
86 | 
87 | If set to on, the recovery.conf file generated on the standby server will
88 | use parameters for streaming replication. In that case the parameters
89 | trigger_file, repl_db_user and repl_db_password are also needed.
90 | 
91 | trigger_file: %(pgdata)s/cmd_end_recovery
92 | 
93 | The file that the "cmd_standby -C cmd_standby.ini -F999" command creates
94 | in order to bring the former standby online. If you change this
95 | parameter you will need to update the recovery.conf file, or possibly
96 | shut down the running PostgreSQL and re-run the -S command.
97 | 
98 | repl_db_user: postgres
99 | 
100 | The user that the standby server will use to connect to the master
101 | server for replication purposes.
102 | 
103 | repl_db_password: secretpassword
104 | 
105 | The password that the standby will use to connect to the master for replication
106 | purposes. It can be empty if you decide to store the password in a .pgpass file.
107 | 
108 | sslmode:
109 | 
110 | The sslmode to use while connecting to the master server for streaming replication.
111 | Defaults to 'prefer'; accepted values are:
112 | 
113 | disable, allow, prefer, require, verify-ca, verify-full
114 | 
115 | See more at http://www.postgresql.org/docs/current/static/libpq-ssl.html
116 | 
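For reference, with use_streaming_replication: on, the recovery.conf
generated from the parameters above would look roughly like this (an
illustration, not verbatim pitrtools output):

    standby_mode = 'on'
    primary_conninfo = 'host=192.168.3.254 port=5432 user=postgres password=secretpassword sslmode=prefer'
    trigger_file = '/var/lib/postgresql/9.2/main/cmd_end_recovery'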
117 | includepath: /usr/bin:/bin:/usr/local/bin
118 | 
119 | Directories to include when searching for executables, separated by colons. Also used to locate the remote psql executable on the master.
120 | 
121 | rsync_flags: -z
122 | 
123 | Optional flags to pass to rsync (compression, etc.)
124 | 
125 | rsync_threads: 2
126 | 
127 | Number of rsync threads to use when performing a base backup.
128 | 
129 | port: 5432
130 | 
131 | The port the master's PostgreSQL is listening on.
132 | 
133 | master_public_ip: 192.168.3.254
134 | 
135 | The externally available IP address, used for ssh.
136 | 
137 | master_local_ip: 127.0.0.1
138 | 
139 | The internal IP address psql should connect to on the master.
140 | 
141 | user: postgres
142 | 
143 | The user who performed initdb.
144 | 
145 | debug: on
146 | 
147 | Set to on if you want diagnostic info.
148 | 
149 | ssh_debug: on
150 | 
151 | Set to on if you want lots of ssh diagnostic info.
152 | 
153 | ssh_timeout: 30
154 | 
155 | If ssh cannot connect in this many seconds, we will throw an alarm and exit.
156 | 
157 | archivedir: /var/lib/postgresql/archive/
158 | 
159 | This is where cmd_archiver copies files from the master to the standby.
160 | 
161 | pgdata: /var/lib/postgresql/9.2/main
162 | 
163 | The absolute path to your cluster directory.
164 | 
165 | pg_conf_failover: /var/lib/postgresql/pitrtools/etc/pg_conf_failover/postgresql.conf
166 | 
167 | The absolute path to the postgresql.conf to use when we fail over to become a master.
168 | 
169 | pg_hba_conf_failover: /var/lib/postgresql/pitrtools/etc/pg_conf_failover/pg_hba.conf
170 | 
171 | The absolute path to the pg_hba.conf to use when we fail over to become a master.
172 | 
173 | 
174 | pg_conf: /var/lib/postgresql/pitrtools/etc/pg_conf/postgresql.conf
175 | 
176 | The absolute path to the postgresql.conf to use when we bring up the standby.
177 | 
178 | pg_hba_conf: /var/lib/postgresql/pitrtools/etc/pg_conf/pg_hba.conf
179 | 
180 | The absolute path to the pg_hba.conf to use when we bring up the standby.
181 | 
182 | no_copy_conf: off
183 | 
184 | By default postgresql.conf and pg_hba.conf will be copied from the
185 | locations specified above to the pgdata directory on failover. Set this
186 | to on to make postgres use the above conf files directly, without
187 | copying them to pgdata.
188 | 
189 | recovery_conf: /var/lib/postgresql/9.2/main/recovery.conf
190 | 
191 | The absolute path to the recovery.conf to create when we failover.
192 | This defaults to %(pgdata)s/recovery.conf
193 | 
194 | logfile: /var/log/postgresql/postgresql.log
195 | 
196 | The absolute path to the file where the server log is appended (optional).
197 | Useful when postgresql.conf doesn't specify a log destination. Will be
198 | passed with -l to pg_ctl when starting the server.
199 | 
200 | notify_critical:
201 | notify_warning:
202 | notify_ok:
203 | 
204 | The absolute paths to the scripts to fire at each alarm level.
205 | 
206 | action_failover: /var/lib/postgresql/pitrtools/failover.sh
207 | 
208 | The absolute path to the script to fire at the end of failover. This might,
209 | for example, change an IP address on the slave.
210 | 
211 | 
--------------------------------------------------------------------------------
/etc/cmd_archiver.ini.sample:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | 
3 | ; online or offline
4 | state: online
5 | 
6 | ; The base database directory
7 | pgdata: /var/lib/postgresql/9.2/main
8 | 
9 | ; where to remotely copy archives
10 | r_archivedir: /var/lib/postgresql/archive
11 | 
12 | ; where to locally copy archives
13 | l_archivedir: /var/lib/postgresql/archive
14 | 
15 | ; where to look for executables
16 | includepath: /usr/bin:/bin:/usr/local/bin
17 | 
18 | ; extra rsync flags
19 | rsync_flags: -z
20 | 
21 | ; either 2 or 3; if running RHEL 5 or similar it is likely 2,
22 | ; if you are running something that ships reasonably modern software
23 | ; it will be 3
24 | rsync_version = 3
25 | 
26 | ; IPs of the slaves
27 | slaves: 192.168.1.201,192.168.1.202
28 | 
29 | ; the user for the ssh connection to a standby
30 | user: postgres
31 | 
32 | ; error out if rsync can't connect within 10 seconds
33 | ssh_timeout: 10
34 | 
35 | ; Unix user name that is allowed to run the cmd_queue process
36 | queue_user: postgres
37 | 
38 | ; number of seconds to sleep between syncs to slaves
39 | queue_wait: 5
40 | 
41 | ; maximum allowed number of parallel WAL-shipping workers
42 | max_queue_workers: 1
43 | 
44 | ; command to run on OK
45 | notify_ok: echo OK
46 | 
47 | ; command to run on warning
48 | notify_warning: echo WARNING
49 | 
50 | ; command to run on critical
51 | notify_critical: echo CRITICAL
52 | 
53 | ; debug output, on/off only
54 | debug: off
55 | 
56 | ; ssh debug output (warning: noisy)
57 | ssh_debug: off
58 | 
--------------------------------------------------------------------------------
/etc/cmd_standby.ini.sample:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | 
3 | ; what major version are we using?
4 | pgversion: 9.2
5 | 
6 | ; Whether or not to use streaming replication. If this is set to "on"
7 | ; pitrtools will configure the standby server to replicate from the master
8 | ; using streaming replication.
9 | use_streaming_replication: off
10 | 
11 | ; File to touch to end replication when using streaming replication.
12 | trigger_file: /var/lib/postgresql/pitrtools/cmd_end_recovery
13 | 
14 | ; User to connect to the master DB with when using streaming replication,
15 | ; ignored if not using streaming replication.
16 | repl_db_user: replication
17 | 
18 | ; Password for the user repl_db_user.
19 | repl_db_password: secret
20 | 
21 | ; sslmode to use when connecting for streaming replication.
22 | ; Accepted values: the same as libpq: disable, allow, prefer, require, verify-ca and verify-full
23 | ; Default: sslmode: prefer
24 | sslmode: prefer
25 | 
26 | ; Paths to use when searching for the remote psql (also included when searching for local executables)
27 | includepath: /usr/bin:/usr/local/bin:/bin
28 | 
29 | ; Generalized information
30 | 
31 | ; the port postgresql runs on (master)
32 | port: 5432
33 | 
34 | ; IP or name of the master server
35 | master_public_ip: 192.168.3.254
36 | 
37 | ; the IP address psql should connect to on the master
38 | master_local_ip: 127.0.0.1
39 | 
40 | ; the user who performed initdb
41 | user: postgres
42 | 
43 | ; on or off
44 | debug: off
45 | 
46 | ; on or off
47 | ssh_debug: off
48 | 
49 | ; the timeout for ssh before we throw an alarm
50 | ssh_timeout: 30
51 | 
52 | ; Number of threads to use for rsync during base backups. 1 disables multi-threading entirely.
53 | rsync_threads: 2
54 | 
55 | ; should be the same as r_archivedir for the archiver
56 | archivedir: /var/lib/postgresql/archive
57 | 
58 | ; where you executed initdb -D to
59 | pgdata: /var/lib/postgresql/9.2/main
60 | 
61 | ; Confs
62 | 
63 | ; This is the postgresql.conf to be used for the failover
64 | pg_conf_failover: /var/lib/postgresql/pitrtools/etc/pg_conf_failover/postgresql.conf
65 | 
66 | ; This is the pg_hba.conf to be used for the failover
67 | pg_hba_conf_failover: /var/lib/postgresql/pitrtools/etc/pg_conf_failover/pg_hba.conf
68 | 
69 | ; This is the postgresql.conf to be used for Postgres when in standby mode
70 | pg_conf: /var/lib/postgresql/pitrtools/etc/pg_conf/postgresql.conf
71 | 
72 | ; This is the pg_hba.conf to be used for Postgres when in standby mode
73 | pg_hba_conf: /var/lib/postgresql/pitrtools/etc/pg_conf/pg_hba.conf
74 | 
75 | ; By default postgresql.conf and pg_hba.conf will be copied from the
76 | ; locations specified above to the pgdata directory on failover.
77 | ;
78 | ; Uncomment the following to make postgres use the above conf
79 | ; files directly, without copying them to pgdata.
80 | ;no_copy_conf: true
81 | 
82 | ; The recovery.conf file to create when starting up
83 | ; Defaults to %(pgdata)s/recovery.conf
84 | recovery_conf: /var/lib/postgresql/9.2/main/recovery.conf
85 | 
86 | ; Useful when postgresql.conf doesn't specify a log destination.
87 | ; Will be passed with -l to pg_ctl when starting the server.
88 | ;
89 | ; If you're worried about having complete logs, either make sure
90 | ; postgresql.conf points to a log file, or use the logfile: parameter.
91 | ;
92 | ; Otherwise postgresql will print to stdout and nothing
93 | ; will be recorded in the logs
94 | ;
95 | ;logfile: /var/log/postgresql/postgresql.log
96 | 
97 | ; Alarms
98 | 
99 | notify_critical: echo CRITICAL
100 | notify_warning: echo WARNING
101 | notify_ok: echo OK
102 | 
103 | ; On failover action
104 | 
105 | ; Whatever is placed here will be executed on failover (-F999); it must
106 | ; return 0 and have the execution bit set for at least the owner (chmod u+x)
107 | 
108 | action_failover: /var/lib/postgresql/pitrtools/failover/failover.sh
109 | 
--------------------------------------------------------------------------------
/etc/pg_conf/pg_conf.README:
--------------------------------------------------------------------------------
1 | cmd_standby expects two configuration files in this directory, namely
2 | 
3 | - postgresql.conf
4 | - pg_hba.conf
5 | 
6 | They will be used to start PostgreSQL in standby mode. By default these
7 | configuration files will be copied from pitrtools/etc/pg_conf/ to the
8 | PostgreSQL data folder. However, you can configure cmd_standby to use them
9 | directly from pg_conf/ without first copying them to the data folder.
10 | 
11 | Use the no_copy_conf: parameter in cmd_standby.ini to control how
12 | these files will be used.
13 | 
--------------------------------------------------------------------------------
/etc/pg_conf_failover/pg_conf_failover.README:
--------------------------------------------------------------------------------
1 | cmd_standby expects two configuration files in this directory, namely
2 | 
3 | - postgresql.conf
4 | - pg_hba.conf
5 | 
6 | They will be used to start PostgreSQL when you fail over. By default these
7 | configuration files will be copied from pitrtools/etc/pg_conf_failover/ to
8 | the PostgreSQL data folder. However, you can configure cmd_standby to use
9 | them directly from pg_conf_failover/ without first copying them to the data folder.
10 | 
11 | Use the no_copy_conf: parameter in cmd_standby.ini to control how
12 | these files will be used.
13 | 
--------------------------------------------------------------------------------
/scripts/cmd_standby.92.sql:
--------------------------------------------------------------------------------
1 | -- These are the helper functions for cmd_standby. Apply them to the master's "postgres" database.
2 | --
3 | CREATE OR REPLACE FUNCTION cmd_get_data_dirs() RETURNS SETOF TEXT AS $$
4 | SELECT DISTINCT pg_tablespace_location(oid) AS spclocation FROM pg_tablespace WHERE pg_tablespace_location(oid) <> '';
5 | $$ LANGUAGE SQL;
6 | 
7 | COMMENT ON FUNCTION cmd_get_data_dirs() IS 'Returns the paths of user-defined tablespaces';
8 | 
9 | CREATE OR REPLACE FUNCTION cmd_get_pgdata() RETURNS TEXT AS $$
10 | SELECT setting FROM pg_catalog.pg_settings WHERE name='data_directory';
11 | $$ LANGUAGE SQL IMMUTABLE;
12 | 
13 | CREATE OR REPLACE FUNCTION cmd_get_tablespaces() RETURNS SETOF TEXT AS $$
14 | SELECT DISTINCT pg_tablespace_location(oid) AS spclocation FROM pg_tablespace WHERE pg_tablespace_location(oid) <> ''; -- built-in tablespaces report '' (not NULL)
15 | $$ LANGUAGE SQL STABLE;
16 | 
17 | CREATE OR REPLACE FUNCTION cmd_pg_start_backup() RETURNS INT AS $$
18 | SELECT pg_start_backup('base_backup');
19 | SELECT 1;
20 | $$ LANGUAGE SQL;
21 | 
22 | COMMENT ON FUNCTION cmd_pg_start_backup() IS 'Slim wrapper around pg_start_backup for flexibility';
23 | 
24 | CREATE OR REPLACE FUNCTION cmd_pg_stop_backup() RETURNS INT AS $$
25 | SELECT pg_stop_backup();
26 | SELECT 1;
27 | $$ LANGUAGE SQL;
28 | 
29 | COMMENT ON FUNCTION cmd_pg_stop_backup() IS 'Slim wrapper around pg_stop_backup for flexibility';
--------------------------------------------------------------------------------
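Note: cmd_standby runs these helpers on the master through psql (located via
the includepath setting). A manual equivalent, with an illustrative master
host, would be:

    $ psql -h 192.168.3.254 -U postgres -d postgres -c "SELECT cmd_pg_start_backup();"
    $ psql -h 192.168.3.254 -U postgres -d postgres -c "SELECT cmd_pg_stop_backup();"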