├── sql ├── Makefile └── pg_paxos.sql ├── pg_paxos.control ├── updates └── pg_paxos--0.1--0.2.sql ├── include ├── table_metadata.h ├── paxos_api.h └── pg_paxos.h ├── LICENSE ├── META.json ├── Makefile ├── src ├── table_metadata.c ├── paxos_api.c └── pg_paxos.c └── README.md /sql/Makefile: -------------------------------------------------------------------------------- 1 | DATA_built += sql/$(EXTENSION)--$(EXTVERSION).sql 2 | 3 | # define build process for latest install file 4 | sql/$(EXTENSION)--$(EXTVERSION).sql: sql/$(EXTENSION).sql 5 | cp $< $@ 6 | -------------------------------------------------------------------------------- /pg_paxos.control: -------------------------------------------------------------------------------- 1 | # pg_paxos extension 2 | comment = 'functions that implement the Paxos distributed consensus algorithm' 3 | default_version = '0.2' 4 | module_pathname = '$libdir/pg_paxos' 5 | requires = 'dblink' 6 | -------------------------------------------------------------------------------- /updates/pg_paxos--0.1--0.2.sql: -------------------------------------------------------------------------------- 1 | /* 2 | * paxos_execute executes a query locally, by-passing the Paxos query logging logic. 3 | */ 4 | CREATE FUNCTION paxos_execute(INTERNAL) 5 | RETURNS void 6 | LANGUAGE C 7 | AS 'MODULE_PATHNAME', $$paxos_execute$$; 8 | -------------------------------------------------------------------------------- /include/table_metadata.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * include/table_metadata.h 4 | * 5 | * Declarations for public functions and types related to metadata handling. 6 | * 7 | * Copyright (c) 2014-2015, Citus Data, Inc. 8 | * 9 | *------------------------------------------------------------------------- 10 | */ 11 | 12 | #ifndef PG_PAXOS_TABLE_METADATA_H 13 | #define PG_PAXOS_TABLE_METADATA_H 14 | 15 | #include "postgres.h" 16 | #include "c.h" 17 | 18 | 19 | /* function declarations to access and manipulate the metadata */ 20 | extern char *PaxosTableGroup(Oid paxosTableOid); 21 | extern bool IsPaxosTable(Oid tableId); 22 | 23 | 24 | #endif /* PG_PAXOS_TABLE_METADATA_H */ 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Citus Data 2 | 3 | Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. 4 | 5 | IN NO EVENT SHALL CITUS DATA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF CITUS DATA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 6 | 7 | CITUS DATA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND CITUS DATA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 8 | -------------------------------------------------------------------------------- /include/paxos_api.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * include/paxos_api.h 4 | * 5 | * Declarations for public functions that implement Paxos. 6 | * 7 | * Copyright (c) 2014-2015, Citus Data, Inc. 8 | * 9 | *------------------------------------------------------------------------- 10 | */ 11 | 12 | #ifndef PG_PAXOS_API_H 13 | #define PG_PAXOS_API_H 14 | 15 | 16 | /* function declarations to run Paxos */ 17 | extern int64 PaxosAppend(char *groupId, char *proposerId, const char* value); 18 | extern int64 PaxosMaxAppliedRound(char *groupId); 19 | extern int64 PaxosMaxLocalConsensusRound(char *groupId); 20 | extern int64 PaxosMaxAcceptedRound(char *groupId); 21 | extern int64 PaxosApplyLog(char *groupId, char *proposerId, int64 maxRoundId); 22 | extern void PaxosSetApplied(char *groupId, int64 appliedRoundId); 23 | extern int64 PaxosMembershipVersion(char *groupId); 24 | 25 | #endif /* PG_PAXOS_API_H */ 26 | -------------------------------------------------------------------------------- /include/pg_paxos.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pg_paxos.h 4 | * 5 | * Declarations for public functions and types needed by the pg_paxos 6 | * extension. 7 | * 8 | * Copyright (c) 2014-2015, Citus Data, Inc. 9 | * 10 | *------------------------------------------------------------------------- 11 | */ 12 | 13 | #ifndef PG_PAXOS_H 14 | #define PG_PAXOS_H 15 | 16 | #include "lib/stringinfo.h" 17 | #include "nodes/parsenodes.h" 18 | 19 | /* extension name used to determine if extension has been created */ 20 | #define PG_PAXOS_EXTENSION_NAME "pg_paxos" 21 | #define PG_PAXOS_METADATA_SCHEMA_NAME "pgp_metadata" 22 | #define REPLICATED_TABLES_TABLE_NAME "replicated_tables" 23 | #define MAX_PAXOS_GROUP_ID_LENGTH 128 24 | 25 | 26 | /* configuration parameter that specifies a unique node ID */ 27 | extern char *PaxosNodeId; 28 | 29 | 30 | /* function declarations for extension loading and unloading */ 31 | extern void _PG_init(void); 32 | extern void _PG_fini(void); 33 | 34 | 35 | /* function declarations for pg_paxos utility functions */ 36 | extern char *GenerateProposerId(void); 37 | extern void deparse_query(Query *query, StringInfo buffer); 38 | 39 | #endif /* PG_PAXOS_H */ 40 | -------------------------------------------------------------------------------- /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pg_paxos", 3 | "abstract": "Paxos functions for PostgreSQL", 4 | "description": "Adds functions implementing the Paxos consensus algorithm to PostgreSQL", 5 | "version": "0.2", 6 | "maintainer": "\"Marco Slot\" ", 7 | "license": { 8 | "PostgreSQL": "http://www.postgresql.org/about/licence" 9 | }, 10 | "prereqs": { 11 | "runtime": { 12 | "requires": { 13 | "PostgreSQL": "9.3.4" 14 | } 15 | } 16 | }, 17 | "provides": { 18 | "pg_paxos": { 19 | "abstract": "Paxos functions for PostgreSQL", 20 | "file": "sql/pg_paxos--0.2.sql", 21 | "docfile": "README.md", 22 | "version": "0.2" 23 | } 24 | }, 25 | "release_status": "stable", 26 | "resources": { 27 | "homepage": "https://citusdata.com/", 28 | "bugtracker": { 29 | "web": "https://github.com/citusdata/pg_paxos/issues", 30 | "mailto": "support@citusdata.com" 31 | }, 32 | "repository": { 33 | "url": "git://github.com/citusdata/pg_paxos.git", 34 | "web": "https://github.com/citusdata/pg_paxos", 35 | "type": "git" 36 | } 37 | }, 38 | 39 | "generated_by": "\"Marco Slot\" ", 40 | 41 | "tags": [ 42 | "replication", 43 | "failover", 44 | "distribution", 45 | "scale", 46 | "cluster" 47 | ], 48 | 49 | "meta-spec": { 50 | "version": "1.0.0", 51 | "url": "http://pgxn.org/meta/spec.txt" 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # 3 | # Makefile for pg_paxos 4 | # 5 | # Copyright (c) 2014-2015, Citus Data, Inc. 6 | # 7 | #------------------------------------------------------------------------- 8 | 9 | # grab name and version from META.json file 10 | EXTENSION = $(shell grep -m 1 '"name":' META.json | sed -e 's/[[:space:]]*"name":[[:space:]]*"\([^"]*\)",/\1/') 11 | EXTVERSION = $(shell grep default_version $(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/") 12 | 13 | # installation scripts 14 | DATA = $(wildcard updates/*--*.sql) 15 | 16 | # compilation configuration 17 | MODULE_big = $(EXTENSION) 18 | OBJS = $(patsubst %.c,%.o,$(wildcard src/*.c)) 19 | PG_CPPFLAGS = -std=c99 -Wall -Wextra -Werror -Wno-unused-parameter -Iinclude -I$(libpq_srcdir) 20 | SHLIB_LINK = $(libpq) 21 | EXTRA_CLEAN += $(addprefix src/,*.gcno *.gcda) # clean up after profiling runs 22 | 23 | # add coverage flags if requested 24 | ifeq ($(enable_coverage),yes) 25 | PG_CPPFLAGS += --coverage 26 | SHLIB_LINK += --coverage 27 | endif 28 | 29 | # be explicit about the default target 30 | all: 31 | 32 | # delegate to subdirectory makefiles as needed 33 | include sql/Makefile 34 | 35 | # detect whether to build with pgxs or build in-tree 36 | ifndef NO_PGXS 37 | PG_CONFIG = pg_config 38 | PGXS := $(shell $(PG_CONFIG) --pgxs) 39 | include $(PGXS) 40 | else 41 | SHLIB_PREREQS = submake-libpq 42 | subdir = contrib/pg_paxos 43 | top_builddir = ../.. 44 | include $(top_builddir)/src/Makefile.global 45 | include $(top_srcdir)/contrib/contrib-global.mk 46 | endif 47 | 48 | -------------------------------------------------------------------------------- /src/table_metadata.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * src/table_metadata.c 4 | * 5 | * This file contains functions to access and manage the paxos table 6 | * metadata. 7 | * 8 | * Copyright (c) 2014-2015, Citus Data, Inc. 9 | * 10 | *------------------------------------------------------------------------- 11 | */ 12 | 13 | #include "postgres.h" 14 | #include "c.h" 15 | #include "fmgr.h" 16 | #include "miscadmin.h" 17 | 18 | #include "pg_paxos.h" 19 | #include "table_metadata.h" 20 | 21 | #include 22 | #include 23 | 24 | #include "access/attnum.h" 25 | #include "access/htup.h" 26 | #include "access/htup_details.h" 27 | #include "access/tupdesc.h" 28 | #include "executor/spi.h" 29 | #include "catalog/catalog.h" 30 | #include "catalog/namespace.h" 31 | #include "catalog/pg_type.h" 32 | #include "commands/extension.h" 33 | #include "nodes/makefuncs.h" 34 | #include "nodes/memnodes.h" 35 | #include "nodes/pg_list.h" 36 | #include "nodes/primnodes.h" 37 | #include "storage/lock.h" 38 | #include "storage/lmgr.h" 39 | #include "utils/builtins.h" 40 | #include "utils/elog.h" 41 | #include "utils/errcodes.h" 42 | #include "utils/fmgroids.h" 43 | #include "utils/lsyscache.h" 44 | #include "utils/memutils.h" 45 | #include "utils/palloc.h" 46 | #include "utils/tqual.h" 47 | 48 | 49 | /* human-readable names for addressing columns of partition table */ 50 | #define REPLICATED_TABLES_TABLE_ATTRIBUTE_COUNT 2 51 | #define ATTR_NUM_REPLICATED_TABLES_RELATION_ID 1 52 | #define ATTR_NUM_REPLICATED_TABLES_GROUP 2 53 | 54 | 55 | /* 56 | * PaxosTableGroup looks up the group ID of a paxos table 57 | */ 58 | char * 59 | PaxosTableGroup(Oid paxosTableOid) 60 | { 61 | char *groupId = NULL; 62 | RangeVar *heapRangeVar = NULL; 63 | Relation heapRelation = NULL; 64 | HeapScanDesc scanDesc = NULL; 65 | const int scanKeyCount = 1; 66 | ScanKeyData scanKey[scanKeyCount]; 67 | HeapTuple heapTuple = NULL; 68 | 69 | heapRangeVar = makeRangeVar(PG_PAXOS_METADATA_SCHEMA_NAME, 70 | REPLICATED_TABLES_TABLE_NAME, -1); 71 | heapRelation = relation_openrv(heapRangeVar, AccessShareLock); 72 | 73 | ScanKeyInit(&scanKey[0], ATTR_NUM_REPLICATED_TABLES_RELATION_ID, InvalidStrategy, 74 | F_OIDEQ, ObjectIdGetDatum(paxosTableOid)); 75 | 76 | scanDesc = heap_beginscan(heapRelation, SnapshotSelf, scanKeyCount, scanKey); 77 | 78 | heapTuple = heap_getnext(scanDesc, ForwardScanDirection); 79 | if (HeapTupleIsValid(heapTuple)) 80 | { 81 | TupleDesc tupleDescriptor = RelationGetDescr(heapRelation); 82 | bool isNull = false; 83 | 84 | Datum groupIdDatum = heap_getattr(heapTuple, 85 | ATTR_NUM_REPLICATED_TABLES_GROUP, 86 | tupleDescriptor, &isNull); 87 | groupId = TextDatumGetCString(groupIdDatum); 88 | } 89 | else 90 | { 91 | groupId = NULL; 92 | } 93 | 94 | heap_endscan(scanDesc); 95 | relation_close(heapRelation, AccessShareLock); 96 | 97 | return groupId; 98 | } 99 | 100 | 101 | /* 102 | * IsPaxosTable returns whether the specified table is paxos. It 103 | * returns false if the input is InvalidOid. 104 | */ 105 | bool 106 | IsPaxosTable(Oid tableOid) 107 | { 108 | bool isPaxosTable = false; 109 | RangeVar *heapRangeVar = NULL; 110 | Relation heapRelation = NULL; 111 | HeapScanDesc scanDesc = NULL; 112 | const int scanKeyCount = 1; 113 | ScanKeyData scanKey[scanKeyCount]; 114 | HeapTuple heapTuple = NULL; 115 | 116 | if (tableOid == InvalidOid) 117 | { 118 | return false; 119 | } 120 | 121 | heapRangeVar = makeRangeVar(PG_PAXOS_METADATA_SCHEMA_NAME, 122 | REPLICATED_TABLES_TABLE_NAME, -1); 123 | heapRelation = relation_openrv(heapRangeVar, AccessShareLock); 124 | 125 | ScanKeyInit(&scanKey[0], ATTR_NUM_REPLICATED_TABLES_RELATION_ID, InvalidStrategy, 126 | F_OIDEQ, ObjectIdGetDatum(tableOid)); 127 | 128 | scanDesc = heap_beginscan(heapRelation, SnapshotSelf, scanKeyCount, scanKey); 129 | 130 | heapTuple = heap_getnext(scanDesc, ForwardScanDirection); 131 | 132 | isPaxosTable = HeapTupleIsValid(heapTuple); 133 | 134 | heap_endscan(scanDesc); 135 | relation_close(heapRelation, AccessShareLock); 136 | 137 | return isPaxosTable; 138 | } 139 | -------------------------------------------------------------------------------- /src/paxos_api.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * src/paxos_api.c 4 | * 5 | * This file contains functions to run Paxos. 6 | * 7 | * Copyright (c) 2014-2015, Citus Data, Inc. 8 | * 9 | *------------------------------------------------------------------------- 10 | */ 11 | 12 | #include "postgres.h" 13 | #include "c.h" 14 | #include "fmgr.h" 15 | #include "miscadmin.h" 16 | 17 | #include "paxos_api.h" 18 | 19 | #include 20 | #include 21 | 22 | #include "catalog/pg_type.h" 23 | #include "executor/spi.h" 24 | #include "utils/builtins.h" 25 | 26 | 27 | int64 28 | PaxosAppend(char *groupId, char *proposerId, const char* value) 29 | { 30 | int roundId = -1; 31 | Datum roundIdDatum = 0; 32 | Oid argTypes[] = { 33 | TEXTOID, 34 | TEXTOID, 35 | TEXTOID 36 | }; 37 | Datum argValues[] = { 38 | CStringGetTextDatum(proposerId), 39 | CStringGetTextDatum(groupId), 40 | CStringGetTextDatum(value) 41 | }; 42 | bool isNull = false; 43 | 44 | SPI_connect(); 45 | 46 | SPI_execute_with_args("SELECT paxos_apply_and_append($1,$2,$3)", 47 | 3, argTypes, argValues, NULL, false, 1); 48 | 49 | roundIdDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, 50 | &isNull); 51 | roundId = DatumGetInt64(roundIdDatum); 52 | 53 | SPI_finish(); 54 | 55 | return roundId; 56 | } 57 | 58 | 59 | int64 60 | PaxosMaxAppliedRound(char *groupId) 61 | { 62 | int roundId = -1; 63 | Datum roundIdDatum = 0; 64 | Oid argTypes[] = { 65 | TEXTOID 66 | }; 67 | Datum argValues[] = { 68 | CStringGetTextDatum(groupId) 69 | }; 70 | bool isNull = false; 71 | 72 | SPI_connect(); 73 | 74 | SPI_execute_with_args("SELECT last_applied_round FROM pgp_metadata.group " 75 | "WHERE group_id = $1", 76 | 1, argTypes, argValues, NULL, false, 1); 77 | 78 | roundIdDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, 79 | &isNull); 80 | roundId = DatumGetInt64(roundIdDatum); 81 | 82 | SPI_finish(); 83 | 84 | return roundId; 85 | } 86 | 87 | 88 | int64 89 | PaxosMaxLocalConsensusRound(char *groupId) 90 | { 91 | int roundId = -1; 92 | Datum roundIdDatum = 0; 93 | Oid argTypes[] = { 94 | TEXTOID 95 | }; 96 | Datum argValues[] = { 97 | CStringGetTextDatum(groupId) 98 | }; 99 | bool isNull = false; 100 | 101 | SPI_connect(); 102 | 103 | SPI_execute_with_args("SELECT max(round_num) FROM pgp_metadata.round " 104 | "WHERE group_id = $1 AND consensus", 105 | 1, argTypes, argValues, NULL, false, 1); 106 | 107 | roundIdDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, 108 | &isNull); 109 | 110 | if (!isNull) 111 | { 112 | roundId = DatumGetInt64(roundIdDatum); 113 | } 114 | else 115 | { 116 | roundId = -1; 117 | } 118 | 119 | SPI_finish(); 120 | 121 | return roundId; 122 | } 123 | 124 | 125 | int64 126 | PaxosMaxAcceptedRound(char *groupId) 127 | { 128 | int roundId = -1; 129 | Datum roundIdDatum = 0; 130 | Oid argTypes[] = { 131 | TEXTOID 132 | }; 133 | Datum argValues[] = { 134 | CStringGetTextDatum(groupId) 135 | }; 136 | bool isNull = false; 137 | 138 | SPI_connect(); 139 | 140 | SPI_execute_with_args("SELECT paxos_max_group_round($1,true)", 141 | 1, argTypes, argValues, NULL, false, 1); 142 | 143 | roundIdDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, 144 | &isNull); 145 | roundId = DatumGetInt64(roundIdDatum); 146 | 147 | SPI_finish(); 148 | 149 | return roundId; 150 | } 151 | 152 | 153 | int64 154 | PaxosApplyLog(char *groupId, char *proposerId, int64 maxRoundId) 155 | { 156 | int roundId = -1; 157 | Datum roundIdDatum = 0; 158 | Oid argTypes[] = { 159 | TEXTOID, 160 | TEXTOID, 161 | INT8OID 162 | }; 163 | Datum argValues[] = { 164 | CStringGetTextDatum(proposerId), 165 | CStringGetTextDatum(groupId), 166 | Int64GetDatum(maxRoundId) 167 | }; 168 | bool isNull = false; 169 | 170 | SPI_connect(); 171 | 172 | SPI_execute_with_args("SELECT paxos_apply_log($1,$2,$3)", 173 | 3, argTypes, argValues, NULL, false, 1); 174 | 175 | roundIdDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, 176 | &isNull); 177 | roundId = DatumGetInt64(roundIdDatum); 178 | 179 | SPI_finish(); 180 | 181 | return roundId; 182 | } 183 | 184 | 185 | void 186 | PaxosSetApplied(char *groupId, int64 appliedRoundId) 187 | { 188 | Oid argTypes[] = { 189 | TEXTOID, 190 | INT8OID 191 | }; 192 | Datum argValues[] = { 193 | CStringGetTextDatum(groupId), 194 | Int64GetDatum(appliedRoundId) 195 | }; 196 | 197 | SPI_connect(); 198 | 199 | SPI_execute_with_args("UPDATE pgp_metadata.group SET last_applied_round = $2 WHERE group_id = $1 ", 200 | 2, argTypes, argValues, NULL, false, 1); 201 | 202 | SPI_finish(); 203 | } 204 | 205 | 206 | int64 207 | PaxosMembershipVersion(char *groupId) 208 | { 209 | int membershipVersion = -1; 210 | Datum membershipVersionDatum = 0; 211 | Oid argTypes[] = { 212 | TEXTOID 213 | }; 214 | Datum argValues[] = { 215 | CStringGetTextDatum(groupId) 216 | }; 217 | bool isNull = false; 218 | 219 | SPI_connect(); 220 | 221 | SPI_execute_with_args("SELECT max(xmin::text::bigint) FROM pgp_metadata.host " 222 | "WHERE group_id = $1", 223 | 1, argTypes, argValues, NULL, false, 1); 224 | 225 | membershipVersionDatum = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 226 | 1, &isNull); 227 | membershipVersion = DatumGetInt64(membershipVersionDatum); 228 | 229 | SPI_finish(); 230 | 231 | return membershipVersion; 232 | } 233 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_paxos 2 | 3 | This PostgreSQL extension provides a basic implementation of the [Paxos algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/paxos-simple.pdf) in PL/pgSQL and basic table replication through Paxos. 4 | 5 | Warning: pg_paxos is in an early stage, consider it experimental. 6 | 7 | pg_paxos can be used to replicate a table across multiple PostgreSQL servers. Every INSERT/UPDATE/DELETE on a replicated table is logged through Paxos. When a query is performed on the table, pg_paxos first ensures that all preceding queries in the Multi-Paxos log have been applied, providing strong consistency. By using the Paxos algorithm, pg_paxos is also robust to failure of a minority of nodes (read: servers), e.g. 2 out of 5. 8 | 9 | ## The Paxos Algorithm 10 | 11 | paxos(k,v) is a function that returns the same value on all nodes in a group for a certain key (k), and the value is one of the inputs (v). For example, a node might call paxos('leader','primary = node-a') to assign a permanent group leader. paxos(k,v) first picks a proposal number n (usually 0) and then tries to get a value accepted by a majority of nodes in 2 phases: 12 | 13 | 1. a) ask a majority of nodes to reject proposals for key k with a number smaller than n (or equal with lower node id), and return any previously accepted value and its proposal number
14 | b) if a value was already accepted by one or more nodes, then v takes on the value with the highest proposal number 15 | 2. ask responders from phase 1 to accept proposal n with value v for key k 16 | 17 | If in either phase the proposer cannot get confirmation from the majority, either due to failure or due to a rejection based on the proposal number, then it restarts with n = the highest proposal number in responses + 1, until a proposal succeeds in which case the function returns v. 18 | 19 | - 1a. ensures that the proposer has exclusive access among the majority, it's basically a lock with preemption 20 | - 1b. ensures that a value is never changed once the majority accepts a value, since the proposer cannot get a new majority without at least one node having the existing value 21 | - 2. achieves consensus if the majority lock has not been preempted, which implies that any other proposal still needs to complete 1a for at least one node in the majority and will thus see the value in 1b 22 | 23 | Any subsequent proposal will use the same value in phase 2, and therefore paxos always returns the same value on all nodes. 24 | 25 | ## Multi-Paxos 26 | 27 | Once a majority accepts a value, it can never be changed. However, Paxos can be used to implement a distributed log by using the index (round) in the log as a key. The log can be used to replicate a sequence of writes to a common, initial state. This technique is typically referred to as Multi-Paxos. 28 | 29 | To append a new write to the log, a node needs to reach consensus on its write in a given round and apply all preceding writes. This may require several attempts if other nodes are trying to reach consensus on the same round. Heavily simplified, writing to the Multi-Paxos log might look as follows: 30 | 31 | round = last_applied_round+1 32 | 33 | while((c = paxos(round,v)) != v) 34 | execute(c) 35 | last_applied_round = round 36 | round++ 37 | 38 | To perform a consistent read, a node needs to confirm the highest accepted round number in the group and execute all items in the log up to that round number. Some rounds may have been abandoned before consensus was reached, in which case the reader can force consensus by proposing a value that does not change the state. 39 | 40 | round = last_applied_round+1 41 | max_round_num = max_accepted_round() 42 | 43 | while(round <= max_round_num) 44 | execute(paxos(round,'')) 45 | last_applied_round = round 46 | round++ 47 | 48 | While the examples above are heavily simplified, pg_paxos follows the same general approach, using SQL queries as log items. 49 | 50 | ## Installation 51 | 52 | The easiest way to install pg_paxos is to build the sources from GitHub. 53 | 54 | git clone https://github.com/citusdata/pg_paxos.git 55 | cd pg_paxos 56 | PATH=/usr/local/pgsql/bin/:$PATH make 57 | sudo PATH=/usr/local/pgsql/bin/:$PATH make install 58 | 59 | pg_paxos requires the dblink extension that you'll find in the contrib directory of PostgreSQL to be installed. After installing both extensions run: 60 | 61 | -- run via psql on each node: 62 | CREATE EXTENSION dblink; 63 | CREATE EXTENSION pg_paxos; 64 | 65 | To do table replication, pg_paxos uses PostgreSQL's executor hooks to log SQL queries performed by the user in the Paxos log. To activate executor hooks, add pg_paxos to the shared_preload_libraries in postgresql.conf and restart postgres. It is also advisable to specify a unique node_id, which is needed to guarantee consistency in certain scenarios. 66 | 67 | # in postgresql.conf 68 | shared_preload_libraries = 'pg_paxos' 69 | pg_paxos.node_id = '' 70 | 71 | ## Setting up Table Replication 72 | 73 | pg_paxos allows you to replicate a table across a group of servers. When a table is marked as replicated, pg_paxos intercepts all SQL queries on that table via the executor hooks and appends them to the Multi-Paxos log. Before a query is performed, preceding SQL queries in the log are executed to bring the table up-to-date. From the perspective of the user, the table always appears consistent, even though the physical representation of the table on disk may be behind at the start of the read. 74 | 75 | To set up Paxos group with a replicated table, first create the table on all the nodes: 76 | 77 | CREATE TABLE coordinates ( 78 | x int, 79 | y int 80 | ); 81 | 82 | Then, on one of the nodes, call paxos_create_group to create a named Paxos group with the node itself (defined as a connection string using its external address) as its sole member, and call paxos_replicate_table to replicate a table within a group: 83 | 84 | SELECT paxos_create_group('mokka', 'host=10.0.0.1'); 85 | SELECT paxos_replicate_table('mokka', 'coordinates'); 86 | 87 | To add another node (e.g. 10.0.0.49), connect to it and call paxos_join_group using the name of the group, the connection string of an existing node, and the connection string of the node itself: 88 | 89 | SELECT paxos_join_group('mokka', 'host=10.0.0.1', 'host=10.0.0.49'); 90 | 91 | If you want to replicate an additional table after forming the group, then run paxos_replicate_table on all the nodes. 92 | 93 | An example of how pg_paxos replicates the metadata: 94 | 95 | [marco@marco-desktop pg_paxos]$ psql 96 | psql (9.5.1) 97 | Type "help" for help. 98 | 99 | postgres=# INSERT INTO coordinates VALUES (1,1); 100 | INSERT 0 1 101 | postgres=# INSERT INTO coordinates VALUES (2,2); 102 | INSERT 0 1 103 | postgres=# SELECT * FROM coordinates ; 104 | x | y 105 | ---+--- 106 | 1 | 1 107 | 2 | 2 108 | (2 rows) 109 | 110 | postgres=# \q 111 | [marco@marco-desktop pg_paxos]$ psql -p 9700 112 | psql (9.4.4) 113 | Type "help" for help. 114 | 115 | postgres=# SELECT * FROM coordinates ; 116 | DEBUG: Executing: INSERT INTO coordinates VALUES (1,1); 117 | CONTEXT: SQL statement "SELECT paxos_apply_log($1,$2,$3)" 118 | DEBUG: Executing: INSERT INTO coordinates VALUES (2,2); 119 | CONTEXT: SQL statement "SELECT paxos_apply_log($1,$2,$3)" 120 | x | y 121 | ---+--- 122 | 1 | 1 123 | 2 | 2 124 | (2 rows) 125 | 126 | postgres=# UPDATE coordinates SET x = x * 10; 127 | UPDATE 2 128 | postgres=# \q 129 | [marco@marco-desktop pg_paxos]$ psql -p 9701 130 | psql (9.4.4) 131 | Type "help" for help. 132 | 133 | postgres=# SELECT * FROM coordinates ; 134 | DEBUG: Executing: INSERT INTO coordinates VALUES (1,1); 135 | CONTEXT: SQL statement "SELECT paxos_apply_log($1,$2,$3)" 136 | DEBUG: Executing: INSERT INTO coordinates VALUES (2,2); 137 | CONTEXT: SQL statement "SELECT paxos_apply_log($1,$2,$3)" 138 | DEBUG: Executing: UPDATE coordinates SET x = x * 10; 139 | CONTEXT: SQL statement "SELECT paxos_apply_log($1,$2,$3)" 140 | x | y 141 | ----+--- 142 | 10 | 1 143 | 20 | 2 144 | (2 rows) 145 | 146 | By default, pg_paxos asks other nodes for the highest accepted round number in the log before every read. It then applies the SQL queries in the log up to and including the highest accepted round number, which ensures strong consistency. In some cases, low read latencies may be preferable to strong consistency. The pg_paxos.consistency_model setting can be changed to 'optimistic', in which case the node assumes it has already learned about preceding writes. The optimistic consistency model provides read-your-writes consistency in the absence of failure, but may return older results when failures occur. 147 | 148 | The consistency model can be changed in the session: 149 | 150 | SET pg_paxos.consistency_model TO 'optimistic'; 151 | 152 | To switch back to strong consistency: 153 | 154 | SET pg_paxos.consistency_model TO 'strong'; 155 | 156 | ## Internal Table Replication functions 157 | 158 | The following functions are called automatically when using table replications when a query is performed. We show how to call them explicitly to clarify the internals of pg_paxos. 159 | 160 | The paxos_apply_and_append function (called on writes) appends a SQL query to the log after ensuring that all queries that will preceed it in the log have been executed. 161 | 162 | SELECT * FROM paxos_apply_and_append( 163 | current_proposer_id := 'node-a/1251', 164 | current_group_id := 'ha_postgres', 165 | proposed_value := 'INSERT INTO coordinates VALUES (3,3)'); 166 | 167 | The paxos_apply_log function (called on SELECT) executes all SQL queries in the log for a given group that have not yet been executed up to and including round number max_round_num. 168 | 169 | SELECT * FROM paxos_apply_log( 170 | current_proposer_id := 'node-a/1252', 171 | current_group_id := 'ha_postgres', 172 | max_round_num := 3); 173 | 174 | The paxos_max_group_round function queries a majority of hosts for their highest accepted round number. The round number returned by paxos_max_group_round will be greater or equal to any round on which there is consensus (a majority has accepted) at the start of the call to paxos_max_group_round. Therefore, a node is guaranteed to see any preceding write if it applies the log up to that round number. 175 | 176 | SELECT * FROM paxos_max_group_round( 177 | current_group_id := 'ha_postgres'); 178 | 179 | ## Using Paxos functions directly 180 | 181 | You can also implement an arbitrary distributed log using pg_paxos by calling the paxos functions directly. The following query appends value 'primary = node-a' to the Multi-Paxos log for the group ha_postgres: 182 | 183 | SELECT * FROM paxos_append( 184 | current_proposer_id := 'node-a/1253', 185 | current_group_id := 'ha_postgres', 186 | proposed_value := 'primary = node-a'); 187 | 188 | current_proposer_id is a value that should be unique across the cluster for the given group and round. This is mainly used to determine which proposal was accepted when two proposers propose the same value. 189 | 190 | The latest value in the Multi-Paxos log can be retrieved using: 191 | 192 | SELECT * FROM paxos( 193 | current_proposer_id := 'node-a/1254', 194 | current_group_id := 'ha_postgres', 195 | current_round_num := paxos_max_group_round('ha_postgres')); 196 | 197 | Copyright © 2016 Citus Data, Inc. 198 | -------------------------------------------------------------------------------- /src/pg_paxos.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * src/pg_paxos.c 4 | * 5 | * This file contains executor hooks that use the Paxos API to do 6 | * replicated writes and consistent reads on PostgreSQL tables that are 7 | * replicated across multiple nodes. 8 | * 9 | * Copyright (c) 2014-2015, Citus Data, Inc. 10 | * 11 | *------------------------------------------------------------------------- 12 | */ 13 | 14 | #include "postgres.h" 15 | #include "c.h" 16 | #include "fmgr.h" 17 | #include "funcapi.h" 18 | #include "libpq-fe.h" 19 | #include "miscadmin.h" 20 | #include "plpgsql.h" 21 | 22 | #include "paxos_api.h" 23 | #include "pg_paxos.h" 24 | #include "table_metadata.h" 25 | 26 | #include 27 | #include 28 | #include 29 | #include "executor/execdesc.h" 30 | #include "executor/executor.h" 31 | #include "executor/spi.h" 32 | #include "catalog/namespace.h" 33 | #include "catalog/pg_type.h" 34 | #include "commands/extension.h" 35 | #include "lib/stringinfo.h" 36 | #include "nodes/memnodes.h" 37 | #include "nodes/nodeFuncs.h" 38 | #include "nodes/nodes.h" 39 | #include "nodes/params.h" 40 | #include "nodes/parsenodes.h" 41 | #include "nodes/plannodes.h" 42 | #include "nodes/pg_list.h" 43 | #include "nodes/print.h" 44 | #include "optimizer/clauses.h" 45 | #include "optimizer/planner.h" 46 | #include "parser/parse_func.h" 47 | #include "tcop/dest.h" 48 | #include "tcop/utility.h" 49 | #include "utils/builtins.h" 50 | #include "utils/lsyscache.h" 51 | #include "utils/memutils.h" 52 | #include "utils/snapmgr.h" 53 | 54 | 55 | /* declarations for dynamic loading */ 56 | PG_MODULE_MAGIC; 57 | 58 | /* exports for SQL callable functions */ 59 | PG_FUNCTION_INFO_V1(paxos_execute); 60 | 61 | 62 | /* Paxos planner function declarations */ 63 | static PlannedStmt * PgPaxosPlanner(Query *parse, int cursorOptions, 64 | ParamListInfo boundParams); 65 | static PlannedStmt * NextPlannerHook(Query *query, int cursorOptions, 66 | ParamListInfo boundParams); 67 | static bool IsPgPaxosActive(void); 68 | static bool ExtractRangeTableEntryWalker(Node *node, List **rangeTableList); 69 | static bool HasPaxosTable(List *rangeTableList); 70 | static bool FindModificationQueryWalker(Node *node, bool *isModificationQuery); 71 | static void ErrorIfQueryNotSupported(Query *queryTree); 72 | static Plan * CreatePaxosExecutePlan(char *queryString); 73 | static Oid PaxosExecuteFuncId(void); 74 | static char * GetPaxosQueryString(PlannedStmt *plan); 75 | 76 | /* Paxos executor function declarations */ 77 | static void PgPaxosExecutorStart(QueryDesc *queryDesc, int eflags); 78 | static void NextExecutorStartHook(QueryDesc *queryDesc, int eflags); 79 | static char *DeterminePaxosGroup(List *rangeTableList); 80 | static Oid ExtractTableOid(Node *node); 81 | static void PrepareConsistentWrite(char *groupId, const char *sqlQuery); 82 | static void PrepareConsistentRead(char *groupId); 83 | static void PgPaxosProcessUtility(Node *parsetree, const char *queryString, 84 | ProcessUtilityContext context, ParamListInfo params, 85 | DestReceiver *dest, char *completionTag); 86 | 87 | 88 | /* Enumeration that represents the consistency model to use */ 89 | typedef enum 90 | { 91 | STRONG_CONSISTENCY = 0, 92 | OPTIMISTIC_CONSISTENCY = 1 93 | 94 | } ConsistencyModel; 95 | 96 | 97 | /* configuration options */ 98 | static const struct config_enum_entry consistency_model_options[] = { 99 | {"strong", STRONG_CONSISTENCY, false}, 100 | {"optimistic", OPTIMISTIC_CONSISTENCY, false}, 101 | {NULL, 0, false} 102 | }; 103 | 104 | /* whether writes go through Paxos */ 105 | static bool PaxosEnabled = true; 106 | 107 | /* unique node ID to use in Paxos */ 108 | char *PaxosNodeId = NULL; 109 | 110 | /* consistency model for reads */ 111 | static int ReadConsistencyModel = STRONG_CONSISTENCY; 112 | 113 | /* whether to allow mutable functions */ 114 | static bool AllowMutableFunctions = false; 115 | 116 | /* saved hook values in case of unload */ 117 | static planner_hook_type PreviousPlannerHook = NULL; 118 | static ExecutorStart_hook_type PreviousExecutorStartHook = NULL; 119 | static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; 120 | 121 | 122 | 123 | /* 124 | * _PG_init is called when the module is loaded. In this function we save the 125 | * previous utility hook, and then install our hook to pre-intercept calls to 126 | * the copy command. 127 | */ 128 | void 129 | _PG_init(void) 130 | { 131 | PreviousPlannerHook = planner_hook; 132 | planner_hook = PgPaxosPlanner; 133 | 134 | PreviousExecutorStartHook = ExecutorStart_hook; 135 | ExecutorStart_hook = PgPaxosExecutorStart; 136 | 137 | PreviousProcessUtilityHook = ProcessUtility_hook; 138 | ProcessUtility_hook = PgPaxosProcessUtility; 139 | 140 | DefineCustomBoolVariable("pg_paxos.enabled", 141 | "If enabled, pg_paxos handles queries on Paxos tables", 142 | NULL, &PaxosEnabled, true, PGC_USERSET, 143 | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); 144 | 145 | DefineCustomStringVariable("pg_paxos.node_id", 146 | "Unique node ID to use in Paxos interactions", NULL, 147 | &PaxosNodeId, NULL, PGC_USERSET, 0, NULL, 148 | NULL, NULL); 149 | 150 | DefineCustomEnumVariable("pg_paxos.consistency_model", 151 | "Consistency model to use for reads (strong, optimistic)", 152 | NULL, &ReadConsistencyModel, STRONG_CONSISTENCY, 153 | consistency_model_options, PGC_USERSET, 0, NULL, NULL, 154 | NULL); 155 | 156 | DefineCustomBoolVariable("pg_paxos.allow_mutable_functions", 157 | "If enabled, mutable functions in queries are allowed", 158 | NULL, &AllowMutableFunctions, false, PGC_USERSET, 159 | 0, NULL, NULL, NULL); 160 | } 161 | 162 | 163 | /* 164 | * _PG_fini is called when the module is unloaded. This function uninstalls the 165 | * extension's hooks. 166 | */ 167 | void 168 | _PG_fini(void) 169 | { 170 | ProcessUtility_hook = PreviousProcessUtilityHook; 171 | ExecutorStart_hook = PreviousExecutorStartHook; 172 | planner_hook = PreviousPlannerHook; 173 | } 174 | 175 | 176 | /* 177 | * PgPaxosPlanner implements custom planning logic for queries involving 178 | * replicated tables. 179 | */ 180 | static PlannedStmt * 181 | PgPaxosPlanner(Query *query, int cursorOptions, ParamListInfo boundParams) 182 | { 183 | bool isModificationQuery = false; 184 | List *rangeTableList = NIL; 185 | 186 | if (!IsPgPaxosActive()) 187 | { 188 | return NextPlannerHook(query, cursorOptions, boundParams); 189 | } 190 | 191 | FindModificationQueryWalker((Node *) query, &isModificationQuery); 192 | 193 | if (!isModificationQuery) 194 | { 195 | return NextPlannerHook(query, cursorOptions, boundParams); 196 | } 197 | 198 | ExtractRangeTableEntryWalker((Node *) query, &rangeTableList); 199 | 200 | if (HasPaxosTable(rangeTableList)) 201 | { 202 | PlannedStmt *plannedStatement = NULL; 203 | Plan *paxosExecutePlan = NULL; 204 | 205 | /* Build the DML query to log */ 206 | Query *paxosQuery = copyObject(query); 207 | StringInfo queryString = makeStringInfo(); 208 | 209 | /* call standard planner first to have Query transformations performed */ 210 | plannedStatement = standard_planner(paxosQuery, cursorOptions, boundParams); 211 | 212 | ErrorIfQueryNotSupported(paxosQuery); 213 | 214 | /* get the transformed query string */ 215 | deparse_query(paxosQuery, queryString); 216 | 217 | /* define the plan as a call to paxos_execute(queryString) */ 218 | paxosExecutePlan = CreatePaxosExecutePlan(queryString->data); 219 | 220 | /* PortalStart will use the targetlist from the plan */ 221 | paxosExecutePlan->targetlist = plannedStatement->planTree->targetlist; 222 | 223 | plannedStatement->planTree = paxosExecutePlan; 224 | 225 | return plannedStatement; 226 | } 227 | else 228 | { 229 | return NextPlannerHook(query, cursorOptions, boundParams); 230 | } 231 | } 232 | 233 | /* 234 | * NextPlannerHook simply encapsulates the common logic of calling the next 235 | * planner hook in the chain or the standard planner start hook if no other 236 | * hooks are present. 237 | */ 238 | static PlannedStmt * 239 | NextPlannerHook(Query *query, int cursorOptions, ParamListInfo boundParams) 240 | { 241 | if (PreviousPlannerHook != NULL) 242 | { 243 | return PreviousPlannerHook(query, cursorOptions, boundParams); 244 | } 245 | else 246 | { 247 | return standard_planner(query, cursorOptions, boundParams); 248 | } 249 | } 250 | 251 | 252 | /* 253 | * IsPgPaxosActive returns whether pg_paxos should intercept queries. 254 | */ 255 | static bool 256 | IsPgPaxosActive(void) 257 | { 258 | bool missingOK = true; 259 | Oid extensionOid = InvalidOid; 260 | Oid metadataNamespaceOid = InvalidOid; 261 | Oid tableMetadataTableOid = InvalidOid; 262 | 263 | if (!PaxosEnabled) 264 | { 265 | return false; 266 | } 267 | 268 | if (!IsTransactionState()) 269 | { 270 | return false; 271 | } 272 | 273 | extensionOid = get_extension_oid(PG_PAXOS_EXTENSION_NAME, missingOK); 274 | if (extensionOid == InvalidOid) 275 | { 276 | return false; 277 | } 278 | 279 | metadataNamespaceOid = get_namespace_oid("pgp_metadata", true); 280 | if (metadataNamespaceOid == InvalidOid) 281 | { 282 | return false; 283 | } 284 | 285 | tableMetadataTableOid = get_relname_relid("replicated_tables", metadataNamespaceOid); 286 | if (tableMetadataTableOid == InvalidOid) 287 | { 288 | return false; 289 | } 290 | 291 | return true; 292 | } 293 | 294 | 295 | /* 296 | * ExtractRangeTableEntryWalker walks over a query tree, and finds all range 297 | * table entries. For recursing into the query tree, this function uses the 298 | * query tree walker since the expression tree walker doesn't recurse into 299 | * sub-queries. 300 | */ 301 | static bool 302 | ExtractRangeTableEntryWalker(Node *node, List **rangeTableList) 303 | { 304 | bool walkIsComplete = false; 305 | if (node == NULL) 306 | { 307 | return false; 308 | } 309 | 310 | if (IsA(node, RangeTblEntry)) 311 | { 312 | RangeTblEntry *rangeTable = (RangeTblEntry *) node; 313 | 314 | (*rangeTableList) = lappend(*rangeTableList, rangeTable); 315 | } 316 | else if (IsA(node, Query)) 317 | { 318 | walkIsComplete = query_tree_walker((Query *) node, ExtractRangeTableEntryWalker, 319 | rangeTableList, QTW_EXAMINE_RTES); 320 | } 321 | else 322 | { 323 | walkIsComplete = expression_tree_walker(node, ExtractRangeTableEntryWalker, 324 | rangeTableList); 325 | } 326 | 327 | return walkIsComplete; 328 | } 329 | 330 | 331 | /* 332 | * FidModificationQuery walks a query tree to find a modification query. 333 | */ 334 | static bool 335 | FindModificationQueryWalker(Node *node, bool *isModificationQuery) 336 | { 337 | bool walkIsComplete = false; 338 | if (node == NULL) 339 | { 340 | return false; 341 | } 342 | 343 | if (IsA(node, Query)) 344 | { 345 | Query *query = (Query *) node; 346 | CmdType commandType = query->commandType; 347 | 348 | if (commandType == CMD_INSERT || commandType == CMD_UPDATE || 349 | commandType == CMD_DELETE) 350 | { 351 | *isModificationQuery = true; 352 | walkIsComplete = true; 353 | } 354 | else 355 | { 356 | walkIsComplete = query_tree_walker(query, 357 | FindModificationQueryWalker, 358 | isModificationQuery, 0); 359 | } 360 | } 361 | else 362 | { 363 | walkIsComplete = expression_tree_walker(node, FindModificationQueryWalker, 364 | isModificationQuery); 365 | } 366 | 367 | return walkIsComplete; 368 | } 369 | 370 | 371 | /* 372 | * HasPaxosTable returns whether the given list of range tables contains 373 | * a Paxos table. 374 | */ 375 | static bool 376 | HasPaxosTable(List *rangeTableList) 377 | { 378 | ListCell *rangeTableCell = NULL; 379 | 380 | foreach(rangeTableCell, rangeTableList) 381 | { 382 | Oid rangeTableOid = ExtractTableOid((Node *) lfirst(rangeTableCell)); 383 | if (IsPaxosTable(rangeTableOid)) 384 | { 385 | return true; 386 | } 387 | } 388 | 389 | return false; 390 | } 391 | 392 | 393 | /* 394 | * ErrorIfQueryNotSupported checks if the query contains unsupported features, 395 | * and errors out if it does. 396 | */ 397 | static void 398 | ErrorIfQueryNotSupported(Query *queryTree) 399 | { 400 | if (!AllowMutableFunctions && contain_mutable_functions((Node *) queryTree)) 401 | { 402 | ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 403 | errmsg("queries containing mutable functions may " 404 | "load to inconsistencies"), 405 | errhint("To allow mutable functions, set " 406 | "pg_paxos.allow_mutable_functions to on"))); 407 | } 408 | } 409 | 410 | 411 | /* 412 | * CreatePaxosExecutePlan creates a plan to call paxos_execute(queryString), 413 | * which is intercepted by the executor hook, which logs the query and executes 414 | * it using the regular planner. 415 | */ 416 | static Plan * 417 | CreatePaxosExecutePlan(char *queryString) 418 | { 419 | FunctionScan *execFunctionScan = NULL; 420 | RangeTblFunction *execFunction = NULL; 421 | FuncExpr *execFuncExpr = NULL; 422 | Const *queryData = NULL; 423 | 424 | /* store the query string as a cstring */ 425 | queryData = makeNode(Const); 426 | queryData->consttype = CSTRINGOID; 427 | queryData->constlen = -2; 428 | queryData->constvalue = CStringGetDatum(queryString); 429 | queryData->constbyval = false; 430 | queryData->constisnull = queryString == NULL; 431 | queryData->location = -1; 432 | 433 | execFuncExpr = makeNode(FuncExpr); 434 | execFuncExpr->funcid = PaxosExecuteFuncId(); 435 | execFuncExpr->funcretset = true; 436 | execFuncExpr->funcresulttype = VOIDOID; 437 | execFuncExpr->location = -1; 438 | execFuncExpr->args = list_make1(queryData); 439 | 440 | execFunction = makeNode(RangeTblFunction); 441 | execFunction->funcexpr = (Node *) execFuncExpr; 442 | 443 | execFunctionScan = makeNode(FunctionScan); 444 | execFunctionScan->functions = lappend(execFunctionScan->functions, execFunction); 445 | 446 | return (Plan *) execFunctionScan; 447 | } 448 | 449 | 450 | /* 451 | * PaxosExecuteFuncId returns the OID of the paxos_execute function. 452 | */ 453 | static Oid 454 | PaxosExecuteFuncId(void) 455 | { 456 | static Oid cachedOid = 0; 457 | List *nameList = NIL; 458 | Oid paramOids[1] = { INTERNALOID }; 459 | 460 | if (cachedOid == InvalidOid) 461 | { 462 | nameList = list_make2(makeString("public"), 463 | makeString("paxos_execute")); 464 | cachedOid = LookupFuncName(nameList, 1, paramOids, false); 465 | } 466 | 467 | return cachedOid; 468 | } 469 | 470 | 471 | /* 472 | * GetPaxosQueryString returns NULL if the plan is not a Paxos execute plan, 473 | * or the original query string. 474 | */ 475 | static char * 476 | GetPaxosQueryString(PlannedStmt *plan) 477 | { 478 | FunctionScan *execFunctionScan = NULL; 479 | RangeTblFunction *execFunction = NULL; 480 | FuncExpr *execFuncExpr = NULL; 481 | Const *queryData = NULL; 482 | char *queryString = NULL; 483 | 484 | if (!IsA(plan->planTree, FunctionScan)) 485 | { 486 | return NULL; 487 | } 488 | 489 | execFunctionScan = (FunctionScan *) plan->planTree; 490 | 491 | if (list_length(execFunctionScan->functions) != 1) 492 | { 493 | return NULL; 494 | } 495 | 496 | execFunction = linitial(execFunctionScan->functions); 497 | 498 | if (!IsA(execFunction->funcexpr, FuncExpr)) 499 | { 500 | return NULL; 501 | } 502 | 503 | execFuncExpr = (FuncExpr *) execFunction->funcexpr; 504 | 505 | if (execFuncExpr->funcid != PaxosExecuteFuncId()) 506 | { 507 | return NULL; 508 | } 509 | 510 | if (list_length(execFuncExpr->args) != 1) 511 | { 512 | ereport(ERROR, (errmsg("unexpected number of function arguments to " 513 | "paxos_execute"))); 514 | } 515 | 516 | queryData = (Const *) linitial(execFuncExpr->args); 517 | Assert(IsA(queryData, Const)); 518 | Assert(queryData->consttype == CSTRINGOID); 519 | 520 | queryString = DatumGetCString(queryData->constvalue); 521 | 522 | return queryString; 523 | } 524 | 525 | 526 | /* 527 | * PgPaxosExecutorStart blocks until the table is ready to read. 528 | */ 529 | static void 530 | PgPaxosExecutorStart(QueryDesc *queryDesc, int eflags) 531 | { 532 | PlannedStmt *plannedStmt = queryDesc->plannedstmt; 533 | List *rangeTableList = plannedStmt->rtable; 534 | char *paxosQueryString = NULL; 535 | 536 | if (!IsPgPaxosActive() || (eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) 537 | { 538 | NextExecutorStartHook(queryDesc, eflags); 539 | return; 540 | } 541 | 542 | paxosQueryString = GetPaxosQueryString(plannedStmt); 543 | if (paxosQueryString != NULL) 544 | { 545 | /* paxos write query */ 546 | char *groupId = NULL; 547 | bool isTopLevel = true; 548 | List *parseTreeList = NIL; 549 | Node *parseTreeNode = NULL; 550 | List *queryTreeList = NIL; 551 | Query *query = NULL; 552 | 553 | /* disallow transactions during paxos commands */ 554 | PreventTransactionChain(isTopLevel, "paxos commands"); 555 | 556 | groupId = DeterminePaxosGroup(rangeTableList); 557 | PrepareConsistentWrite(groupId, paxosQueryString); 558 | 559 | queryDesc->snapshot->curcid = GetCurrentCommandId(false); 560 | 561 | elog(DEBUG1, "Executing: %s %d", paxosQueryString, plannedStmt->hasReturning); 562 | 563 | /* replan the query */ 564 | parseTreeList = pg_parse_query(paxosQueryString); 565 | 566 | if (list_length(parseTreeList) != 1) 567 | { 568 | ereport(ERROR, (errmsg("can only execute single-statement queries on " 569 | "replicated tables"))); 570 | } 571 | 572 | parseTreeNode = (Node *) linitial(parseTreeList); 573 | queryTreeList = pg_analyze_and_rewrite(parseTreeNode, paxosQueryString, NULL, 0); 574 | query = (Query *) linitial(queryTreeList); 575 | queryDesc->plannedstmt = pg_plan_query(query, 0, queryDesc->params); 576 | } 577 | else if (HasPaxosTable(rangeTableList)) 578 | { 579 | /* paxos read query */ 580 | char *groupId = NULL; 581 | 582 | groupId = DeterminePaxosGroup(rangeTableList); 583 | PrepareConsistentRead(groupId); 584 | 585 | queryDesc->snapshot->curcid = GetCurrentCommandId(false); 586 | } 587 | 588 | NextExecutorStartHook(queryDesc, eflags); 589 | } 590 | 591 | 592 | /* 593 | * NextExecutorStartHook simply encapsulates the common logic of calling the 594 | * next executor start hook in the chain or the standard executor start hook 595 | * if no other hooks are present. 596 | */ 597 | static void 598 | NextExecutorStartHook(QueryDesc *queryDesc, int eflags) 599 | { 600 | /* call into the standard executor start, or hook if set */ 601 | if (PreviousExecutorStartHook != NULL) 602 | { 603 | PreviousExecutorStartHook(queryDesc, eflags); 604 | } 605 | else 606 | { 607 | standard_ExecutorStart(queryDesc, eflags); 608 | } 609 | } 610 | 611 | 612 | /* 613 | * DeterminePaxosGroup determines the paxos group for the given list of relations. 614 | * If more than one Paxos group is used, this function errors out. 615 | */ 616 | static char * 617 | DeterminePaxosGroup(List *rangeTableList) 618 | { 619 | ListCell *rangeTableCell = NULL; 620 | char *queryGroupId = NULL; 621 | 622 | foreach(rangeTableCell, rangeTableList) 623 | { 624 | char *tableGroupId = NULL; 625 | 626 | Oid rangeTableOid = ExtractTableOid((Node *) lfirst(rangeTableCell)); 627 | if (rangeTableOid == InvalidOid) 628 | { 629 | /* range table entry for something other than a relation */ 630 | continue; 631 | } 632 | 633 | tableGroupId = PaxosTableGroup(rangeTableOid); 634 | if (tableGroupId == NULL) 635 | { 636 | char *relationName = get_rel_name(rangeTableOid); 637 | ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), 638 | errmsg("relation \"%s\" is not managed by pg_paxos", 639 | relationName))); 640 | } 641 | else if (queryGroupId == NULL) 642 | { 643 | queryGroupId = tableGroupId; 644 | } 645 | else 646 | { 647 | int compareResult = strncmp(tableGroupId, queryGroupId, 648 | MAX_PAXOS_GROUP_ID_LENGTH); 649 | if (compareResult != 0) 650 | { 651 | ereport(ERROR, (errmsg("cannot run queries spanning more than a single " 652 | "Paxos group."))); 653 | } 654 | } 655 | } 656 | 657 | return queryGroupId; 658 | } 659 | 660 | 661 | /* 662 | * ExtractTableOid attempts to extract a table OID from a node. 663 | */ 664 | static Oid 665 | ExtractTableOid(Node *node) 666 | { 667 | Oid tableOid = InvalidOid; 668 | 669 | NodeTag nodeType = nodeTag(node); 670 | if (nodeType == T_RangeTblEntry) 671 | { 672 | RangeTblEntry *rangeTableEntry = (RangeTblEntry *) node; 673 | tableOid = rangeTableEntry->relid; 674 | } 675 | else if(nodeType == T_RangeVar) 676 | { 677 | RangeVar *rangeVar = (RangeVar *) node; 678 | bool failOK = true; 679 | tableOid = RangeVarGetRelid(rangeVar, NoLock, failOK); 680 | } 681 | 682 | return tableOid; 683 | } 684 | 685 | 686 | /* 687 | * GenerateProposerId attempts to generate a globally unique proposer ID. 688 | * It mainly relies on the pg_paxos.node_id setting to distinguish hosts, 689 | * and appends the process ID and transaction ID to ensure local uniqueness. 690 | */ 691 | char * 692 | GenerateProposerId(void) 693 | { 694 | StringInfo proposerId = makeStringInfo(); 695 | MyProcPid = getpid(); 696 | 697 | if (PaxosNodeId != NULL) 698 | { 699 | appendStringInfo(proposerId, "%s/", PaxosNodeId); 700 | } 701 | 702 | appendStringInfo(proposerId, "%d/%d", MyProcPid, GetTopTransactionId()); 703 | 704 | return proposerId->data; 705 | } 706 | 707 | 708 | /* 709 | * PrepareConsistentWrite prepares a write for execution. After 710 | * calling this function the write can be executed. 711 | */ 712 | static void 713 | PrepareConsistentWrite(char *groupId, const char *sqlQuery) 714 | { 715 | int64 loggedRoundId = 0; 716 | char *proposerId = GenerateProposerId(); 717 | 718 | /* 719 | * Log the current query through Paxos. 720 | */ 721 | loggedRoundId = PaxosAppend(groupId, proposerId, sqlQuery); 722 | CommandCounterIncrement(); 723 | 724 | /* 725 | * Mark the current query as applied and let the regular executor handle 726 | * it. This change is rolled back if the current query fails. 727 | */ 728 | PaxosSetApplied(groupId, loggedRoundId); 729 | CommandCounterIncrement(); 730 | } 731 | 732 | 733 | /* 734 | * PrepareConsistentRead prepares the replicated tables in a Paxos group 735 | * for a consistent read based on the configured consistency model. 736 | */ 737 | static void 738 | PrepareConsistentRead(char *groupId) 739 | { 740 | int64 maxRoundId = -1; 741 | int64 membershipVersion = PaxosMembershipVersion(groupId); 742 | int64 maxAppliedRoundId = PaxosMaxAppliedRound(groupId); 743 | char *proposerId = GenerateProposerId(); 744 | 745 | if (ReadConsistencyModel == STRONG_CONSISTENCY) 746 | { 747 | maxRoundId = PaxosMaxAcceptedRound(groupId); 748 | } 749 | else /* ReadConsistencyModel == OPTIMISTIC_CONSISTENCY */ 750 | { 751 | maxRoundId = PaxosMaxLocalConsensusRound(groupId); 752 | } 753 | 754 | while (maxAppliedRoundId < maxRoundId) 755 | { 756 | maxAppliedRoundId = PaxosApplyLog(groupId, proposerId, maxRoundId); 757 | CommandCounterIncrement(); 758 | 759 | if (ReadConsistencyModel == STRONG_CONSISTENCY) 760 | { 761 | int64 newMembershipVersion = PaxosMembershipVersion(groupId); 762 | if(newMembershipVersion > membershipVersion) 763 | { 764 | /* 765 | * If membership changed a lot, then the original maxRoundId may 766 | * not have been accurate. Refresh it to make sure. 767 | */ 768 | maxRoundId = PaxosMaxAcceptedRound(groupId); 769 | membershipVersion = newMembershipVersion; 770 | } 771 | } 772 | } 773 | } 774 | 775 | 776 | /* 777 | * PgPaxosProcessUtility intercepts utility statements and errors out for 778 | * unsupported utility statements on paxos tables. 779 | */ 780 | static void 781 | PgPaxosProcessUtility(Node *parsetree, const char *queryString, 782 | ProcessUtilityContext context, ParamListInfo params, 783 | DestReceiver *dest, char *completionTag) 784 | { 785 | if (IsPgPaxosActive()) 786 | { 787 | NodeTag statementType = nodeTag(parsetree); 788 | if (statementType == T_TruncateStmt) 789 | { 790 | TruncateStmt *truncateStatement = (TruncateStmt *) parsetree; 791 | List *relations = truncateStatement->relations; 792 | 793 | if (HasPaxosTable(relations)) 794 | { 795 | char *groupId = DeterminePaxosGroup(relations); 796 | 797 | PrepareConsistentWrite(groupId, queryString); 798 | } 799 | } 800 | else if (statementType == T_IndexStmt) 801 | { 802 | IndexStmt *indexStatement = (IndexStmt *) parsetree; 803 | Oid tableOid = ExtractTableOid((Node *) indexStatement->relation); 804 | if (IsPaxosTable(tableOid)) 805 | { 806 | char *groupId = PaxosTableGroup(tableOid); 807 | 808 | PrepareConsistentWrite(groupId, queryString); 809 | } 810 | } 811 | else if (statementType == T_AlterTableStmt) 812 | { 813 | AlterTableStmt *alterStatement = (AlterTableStmt *) parsetree; 814 | Oid tableOid = ExtractTableOid((Node *) alterStatement->relation); 815 | if (IsPaxosTable(tableOid)) 816 | { 817 | ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 818 | errmsg("ALTER TABLE commands on paxos tables " 819 | "are unsupported"))); 820 | } 821 | } 822 | else if (statementType == T_CopyStmt) 823 | { 824 | CopyStmt *copyStatement = (CopyStmt *) parsetree; 825 | RangeVar *relation = copyStatement->relation; 826 | Node *rawQuery = copyObject(copyStatement->query); 827 | 828 | if (relation != NULL) 829 | { 830 | Oid tableOid = ExtractTableOid((Node *) relation); 831 | if (IsPaxosTable(tableOid)) 832 | { 833 | ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 834 | errmsg("COPY commands on paxos tables " 835 | "are unsupported"))); 836 | } 837 | } 838 | else if (rawQuery != NULL) 839 | { 840 | Query *parsedQuery = NULL; 841 | List *queryList = pg_analyze_and_rewrite(rawQuery, queryString, 842 | NULL, 0); 843 | 844 | if (list_length(queryList) != 1) 845 | { 846 | ereport(ERROR, (errmsg("unexpected rewrite result"))); 847 | } 848 | 849 | parsedQuery = (Query *) linitial(queryList); 850 | 851 | /* determine if the query runs on a paxos table */ 852 | if (HasPaxosTable(parsedQuery->rtable)) 853 | { 854 | char *groupId = DeterminePaxosGroup(parsedQuery->rtable); 855 | 856 | PrepareConsistentRead(groupId); 857 | } 858 | } 859 | } 860 | } 861 | 862 | if (PreviousProcessUtilityHook != NULL) 863 | { 864 | PreviousProcessUtilityHook(parsetree, queryString, context, 865 | params, dest, completionTag); 866 | } 867 | else 868 | { 869 | standard_ProcessUtility(parsetree, queryString, context, 870 | params, dest, completionTag); 871 | } 872 | } 873 | 874 | 875 | /* 876 | * paxos_execute is a placeholder function to store a query string in 877 | * in plain postgres node trees. 878 | */ 879 | Datum 880 | paxos_execute(PG_FUNCTION_ARGS) 881 | { 882 | PG_RETURN_NULL(); 883 | } 884 | -------------------------------------------------------------------------------- /sql/pg_paxos.sql: -------------------------------------------------------------------------------- 1 | /* 2 | * Metadata on Paxos groups and replicated tables 3 | */ 4 | CREATE SCHEMA pgp_metadata 5 | 6 | /* 7 | * The "group" table keeps track of which items from the log have been 8 | * consumed/applied for a particular group. 9 | */ 10 | CREATE TABLE "group" ( 11 | group_id text not null, 12 | last_applied_round bigint not null default -1, 13 | own_connection_string text not null, 14 | PRIMARY KEY (group_id) 15 | ) 16 | 17 | /* 18 | * The "host" table stores which members are part of a Paxos group during a given 19 | * round. A host should be included if the current round lies between min_round_num 20 | * and max_round_num. 21 | */ 22 | CREATE TABLE "host" ( 23 | group_id text not null, 24 | connection_string text not null, 25 | min_round_num bigint not null, 26 | max_round_num bigint, 27 | PRIMARY KEY (group_id, connection_string, min_round_num), 28 | FOREIGN KEY (group_id) REFERENCES pgp_metadata.group (group_id) 29 | ) 30 | 31 | /* 32 | * The "round" table contains a the Multi-Paxos log for a Paxos group. 33 | * 34 | * group_id - the name of the group 35 | * round_num - the round number 36 | * min_proposal_num - do not accept proposals with a lower proposal number than this 37 | * proposer_id - the node that made the proposal (to ensure ordering) 38 | * value_id - an identifier for the value that is unique for the round 39 | * value - a value that was accepted for this round 40 | * accepted_proposal_num - the proposal number that was accepted 41 | * consensus - whether consensus was confirmed for the value 42 | * error - error messages that was generated when applying the value 43 | */ 44 | CREATE TABLE "round" ( 45 | group_id text not null, 46 | round_num bigint not null, 47 | min_proposal_num bigint not null, 48 | proposer_id text not null, 49 | value_id text, 50 | value text, 51 | accepted_proposal_num bigint, 52 | consensus bool not null default false, 53 | error text, 54 | PRIMARY KEY (group_id, round_num), 55 | FOREIGN KEY (group_id) REFERENCES pgp_metadata.group (group_id) 56 | ) 57 | 58 | /* 59 | * The "replicated_tables" table stores which tables are managed by pg_paxos and 60 | * in which Paxos group they are replicated. A table can be replicated within 61 | * at most 1 Paxos group. 62 | */ 63 | CREATE TABLE "replicated_tables" ( 64 | table_oid regclass not null, 65 | group_id text not null, 66 | PRIMARY KEY (table_oid), 67 | FOREIGN KEY (group_id) REFERENCES pgp_metadata.group (group_id) 68 | ); 69 | 70 | 71 | /* 72 | * Response from acceptor to prepare requests. 73 | * 74 | * promised - true if the acceptor participates, false otherwise 75 | * proposer_id - if a higher proposal was received, the proposer id 76 | * min_proposal_num - if a higher proposal was received, its number 77 | * value_id - if a proposal was previously accepted, its value identifier 78 | * value - if a proposal was previously accepted, its value 79 | * accepted_proposal_num - if a proposal was previously accepted, its number 80 | */ 81 | CREATE TYPE prepare_response AS ( 82 | promised boolean, 83 | proposer_id text, 84 | min_proposal_num bigint, 85 | value_id text, 86 | value text, 87 | accepted_proposal_num bigint 88 | ); 89 | 90 | /* 91 | * Response from acceptor to accept requests. 92 | * 93 | * accepted - false if a higher proposal was received, true otherwise 94 | * min_proposal_num - if a higher proposal was received, its number 95 | */ 96 | CREATE TYPE accept_response AS ( 97 | accepted boolean, 98 | min_proposal_num bigint 99 | ); 100 | 101 | /* 102 | * The result of a call to the paxos function. 103 | * 104 | * accepted_value - the value on which consensus was reached in the given round 105 | * value_changed - true if this was the value specified by the user, false otherwise 106 | */ 107 | CREATE TYPE paxos_result AS ( 108 | accepted_value text, 109 | value_changed boolean 110 | ); 111 | 112 | 113 | /* 114 | * paxos_request_prepare is a remote procedure that request participation of an 115 | * acceptor in a proposal. Effectively it grabs a preemptable lock. 116 | */ 117 | CREATE FUNCTION paxos_request_prepare( 118 | current_proposer_id text, 119 | current_group_id text, 120 | current_round_num bigint, 121 | current_proposal_num bigint) 122 | RETURNS prepare_response 123 | AS $BODY$ 124 | DECLARE 125 | response prepare_response; 126 | round record; 127 | BEGIN 128 | /* 129 | * My response depends on any preceding prepare and accept requests for the same 130 | * group. Ensure prepare and accept requests are serialized through a lock. 131 | */ 132 | PERFORM pg_advisory_xact_lock(29020, hashtext(current_group_id)); 133 | 134 | /* Get state of the current round */ 135 | SELECT * INTO round 136 | FROM pgp_metadata.round 137 | WHERE group_id = current_group_id AND round_num = current_round_num; 138 | 139 | IF NOT FOUND THEN 140 | 141 | /* I have not seen a prepare request for this round */ 142 | 143 | INSERT INTO pgp_metadata.round ( 144 | group_id, 145 | round_num, 146 | min_proposal_num, 147 | proposer_id) 148 | VALUES (current_group_id, 149 | current_round_num, 150 | current_proposal_num, 151 | current_proposer_id); 152 | 153 | SELECT true, current_proposer_id, current_proposal_num, NULL, NULL, NULL 154 | INTO response; 155 | 156 | ELSIF current_proposal_num > round.min_proposal_num OR 157 | (current_proposal_num = round.min_proposal_num AND 158 | (current_proposer_id > round.proposer_id)) THEN 159 | 160 | /* I have seen a prepare request with a lower proposal number */ 161 | 162 | UPDATE pgp_metadata.round 163 | SET min_proposal_num = current_proposal_num, 164 | proposer_id = current_proposer_id 165 | WHERE group_id = current_group_id AND round_num = current_round_num; 166 | 167 | SELECT true, current_proposer_id, current_proposal_num, 168 | round.value_id, round.value, round.accepted_proposal_num 169 | INTO response; 170 | 171 | ELSE 172 | /* I have seen a prepare request with a higher proposal number (or same) */ 173 | 174 | SELECT false, round.proposer_id, round.min_proposal_num, 175 | round.value_id, round.value, round.accepted_proposal_num 176 | INTO response; 177 | END IF; 178 | 179 | RETURN response; 180 | END; 181 | $BODY$ LANGUAGE 'plpgsql'; 182 | 183 | 184 | /* 185 | * paxos_request_accept is a remote procedure that requests an acceptor to accept 186 | * a value, if the lock is still held by the proposer. 187 | */ 188 | CREATE FUNCTION paxos_request_accept( 189 | current_proposer_id text, 190 | current_group_id text, 191 | current_round_num bigint, 192 | current_proposal_num bigint, 193 | proposed_value_id text, 194 | proposed_value text) 195 | RETURNS accept_response 196 | AS $BODY$ 197 | DECLARE 198 | response accept_response; 199 | round record; 200 | BEGIN 201 | /* 202 | * My response depends on any preceding prepare and accept requests for the same 203 | * group. Ensure prepare and accept requests are serialized through a lock. 204 | */ 205 | PERFORM pg_advisory_xact_lock(29020, hashtext(current_group_id)); 206 | 207 | /* Get the state of the current round */ 208 | SELECT * INTO round 209 | FROM pgp_metadata.round 210 | WHERE group_id = current_group_id AND round_num = current_round_num; 211 | 212 | IF NOT FOUND THEN 213 | /* I have not seen a prepare request for this proposal */ 214 | 215 | RAISE EXCEPTION 'Unknown round'; 216 | ELSIF current_proposal_num = round.min_proposal_num AND 217 | current_proposer_id = round.proposer_id THEN 218 | 219 | /* I have indeed promised to participate in this proposal and accept it */ 220 | 221 | UPDATE pgp_metadata.round 222 | SET value_id = proposed_value_id, 223 | value = proposed_value, 224 | accepted_proposal_num = current_proposal_num 225 | WHERE group_id = current_group_id AND round_num = current_round_num; 226 | 227 | SELECT true, current_proposal_num INTO response; 228 | ELSE 229 | /* I have promised to participate in a different proposal */ 230 | 231 | SELECT false, round.min_proposal_num INTO response; 232 | END IF; 233 | 234 | RETURN response; 235 | END; 236 | $BODY$ LANGUAGE 'plpgsql'; 237 | 238 | 239 | /* 240 | * paxos_confirm_consensus is a remote procedure that informs a learner that consensus 241 | * has been reached on a given value. 242 | */ 243 | CREATE FUNCTION paxos_confirm_consensus( 244 | current_proposer_id text, 245 | current_group_id text, 246 | current_round_num bigint, 247 | current_proposal_num bigint, 248 | accepted_value_id text, 249 | accepted_value text) 250 | RETURNS boolean 251 | AS $BODY$ 252 | DECLARE 253 | BEGIN 254 | /* No longer accept any new values and confirm consensus */ 255 | UPDATE pgp_metadata.round 256 | SET consensus = true, 257 | proposer_id = current_proposer_id, 258 | min_proposal_num = current_proposal_num, 259 | value = accepted_value, 260 | value_id = accepted_value_id, 261 | accepted_proposal_num = current_proposal_num 262 | WHERE group_id = current_group_id AND round_num = current_round_num; 263 | 264 | RETURN true; 265 | END; 266 | $BODY$ LANGUAGE 'plpgsql'; 267 | 268 | 269 | /* 270 | * Propose a value in a Paxos group for the given round. The current_proposer_id 271 | * has to be unique within the round (it can be reused in other rounds). 272 | */ 273 | CREATE FUNCTION paxos( 274 | current_proposer_id text, 275 | current_group_id text, 276 | current_round_num bigint, 277 | proposed_value text DEFAULT NULL) 278 | RETURNS paxos_result 279 | AS $BODY$ 280 | DECLARE 281 | num_hosts int; 282 | num_open_connections int; 283 | majority_size int; 284 | current_proposal_num bigint := 0; 285 | num_prepare_responses int; 286 | num_promises int; 287 | max_prepare_response prepare_response; 288 | num_accept_responses int; 289 | num_accepted int; 290 | initial_value text := proposed_value; 291 | initial_value_id text := current_proposer_id; 292 | proposed_value_id text := initial_value_id; 293 | accepted_value_id text; 294 | accepted_value text; 295 | value_changed boolean := false; 296 | start_time double precision := extract(EPOCH FROM clock_timestamp()); 297 | done boolean := false; 298 | inform_learners boolean := true; 299 | result paxos_result; 300 | BEGIN 301 | /* Find the hosts to use for the current round */ 302 | SELECT paxos_find_hosts(current_group_id, current_round_num) INTO num_hosts; 303 | 304 | majority_size = num_hosts / 2 + 1; 305 | 306 | CREATE TEMPORARY TABLE IF NOT EXISTS prepare_responses ( 307 | promised boolean, 308 | proposer_id text, 309 | min_proposal_num bigint, 310 | value_id text, 311 | value text, 312 | accepted_proposal_num bigint 313 | ); 314 | 315 | CREATE TEMPORARY TABLE IF NOT EXISTS accept_responses ( 316 | accepted boolean, 317 | min_proposal_num bigint 318 | ); 319 | 320 | WHILE NOT done LOOP 321 | DELETE FROM prepare_responses; 322 | DELETE FROM accept_responses; 323 | 324 | /* Try to open connections to all hosts in the group */ 325 | SELECT paxos_open_connections(num_hosts) INTO num_open_connections; 326 | 327 | IF num_open_connections < majority_size THEN 328 | PERFORM paxos_close_connections(); 329 | RAISE 'could only open % out of % connections', num_open_connections, majority_size; 330 | END IF; 331 | 332 | /* Phase 1 of Paxos: prepare */ 333 | INSERT INTO prepare_responses SELECT * FROM paxos_prepare( 334 | current_proposer_id, 335 | current_group_id, 336 | current_round_num, 337 | current_proposal_num); 338 | 339 | /* Check whether majority responded */ 340 | SELECT count(*) INTO num_prepare_responses FROM prepare_responses; 341 | 342 | IF num_prepare_responses < majority_size THEN 343 | RAISE WARNING 'could only get % out of % prepare responses, retrying after 1 sec', 344 | num_prepare_responses, majority_size; 345 | PERFORM pg_sleep(1); 346 | 347 | /* Make sure current_proposal_num is higher than any other known proposal */ 348 | SELECT greatest(max(min_proposal_num), current_proposal_num) + 1 349 | INTO current_proposal_num 350 | FROM prepare_responses; 351 | 352 | CONTINUE; 353 | END IF; 354 | 355 | /* Find whether consensus was already reached */ 356 | SELECT value_id, value INTO accepted_value_id, accepted_value 357 | FROM prepare_responses 358 | WHERE value_id IS NOT NULL 359 | GROUP BY value_id, value 360 | HAVING count(*) >= majority_size; 361 | 362 | IF FOUND THEN 363 | IF accepted_value_id <> initial_value_id OR accepted_value <> initial_value THEN 364 | /* There is consensus on someone else's value */ 365 | value_changed := true; 366 | ELSE 367 | /* It's actually my value, apparently I already had consensus on it */ 368 | value_changed := false; 369 | END IF; 370 | 371 | proposed_value := accepted_value; 372 | proposed_value_id := accepted_value_id; 373 | inform_learners := false; 374 | 375 | EXIT; 376 | END IF; 377 | 378 | /* Check whether majority promised to participate */ 379 | SELECT count(*) INTO num_promises FROM prepare_responses WHERE promised; 380 | 381 | IF num_promises < majority_size THEN 382 | 383 | /* Check whether I'm competing with someone */ 384 | SELECT * INTO max_prepare_response 385 | FROM prepare_responses 386 | ORDER BY min_proposal_num DESC, proposer_id DESC LIMIT 1; 387 | 388 | IF max_prepare_response.min_proposal_num = current_proposal_num THEN 389 | RAISE DEBUG 'competing with %, retrying after random back-off', 390 | max_prepare_response.proposer_id; 391 | PERFORM pg_sleep(trunc(random() * (EXTRACT(EPOCH FROM clock_timestamp())-start_time))); 392 | END IF; 393 | 394 | /* Make sure current_proposal_num is higher than any other known proposal */ 395 | SELECT greatest(max(min_proposal_num), current_proposal_num) + 1 396 | INTO current_proposal_num 397 | FROM prepare_responses; 398 | 399 | CONTINUE; 400 | END IF; 401 | 402 | /* Find highest accepted proposal */ 403 | SELECT * INTO max_prepare_response 404 | FROM prepare_responses 405 | WHERE value_id IS NOT NULL 406 | ORDER BY accepted_proposal_num DESC, value_id DESC LIMIT 1; 407 | 408 | /* If value was already accepted, change my proposal to the most recent value */ 409 | IF FOUND THEN 410 | IF max_prepare_response.value_id <> initial_value_id 411 | OR max_prepare_response.value <> initial_value THEN 412 | /* I will use a value from a different proposer */ 413 | value_changed := true; 414 | ELSE 415 | /* I will revert to my own value, which was previously accepted by someone else */ 416 | value_changed := false; 417 | END IF; 418 | 419 | proposed_value := max_prepare_response.value; 420 | proposed_value_id := max_prepare_response.value_id; 421 | END IF; 422 | 423 | /* Phase 2 of Paxos: accept */ 424 | INSERT INTO accept_responses SELECT * FROM paxos_accept( 425 | current_proposer_id, 426 | current_group_id, 427 | current_round_num, 428 | current_proposal_num, 429 | proposed_value_id, 430 | proposed_value); 431 | 432 | /* Check whether majority responded */ 433 | SELECT count(*) INTO num_accept_responses FROM accept_responses; 434 | 435 | IF num_accept_responses < majority_size THEN 436 | RAISE WARNING 'could not get accept responses from majority, retrying after 1 sec'; 437 | PERFORM pg_sleep(1); 438 | 439 | /* Make sure current_proposal_num is higher than any other known proposal */ 440 | SELECT greatest(max(min_proposal_num), current_proposal_num) + 1 441 | INTO current_proposal_num 442 | FROM accept_responses; 443 | 444 | CONTINUE; 445 | END IF; 446 | 447 | /* Check whether majority accepted */ 448 | SELECT count(*) INTO num_accepted FROM accept_responses WHERE accepted; 449 | 450 | IF num_accepted < majority_size THEN 451 | RAISE DEBUG 'could not get accepted by majority, retrying after 1 sec'; 452 | PERFORM pg_sleep(1); 453 | 454 | /* Make sure current_proposal_num is higher than any other known proposal */ 455 | SELECT greatest(max(min_proposal_num), current_proposal_num) + 1 456 | INTO current_proposal_num 457 | FROM accept_responses; 458 | 459 | CONTINUE; 460 | END IF; 461 | 462 | done := true; 463 | END LOOP; 464 | 465 | /* I now know consensus was reached on proposed_value */ 466 | result.accepted_value := proposed_value; 467 | result.value_changed := value_changed; 468 | 469 | IF inform_learners THEN 470 | /* Inform acceptors of the consensus */ 471 | PERFORM paxos_inform_learners( 472 | current_proposer_id, 473 | current_group_id, 474 | current_round_num, 475 | current_proposal_num, 476 | proposed_value_id, 477 | proposed_value); 478 | END IF; 479 | 480 | DROP TABLE prepare_responses; 481 | DROP TABLE accept_responses; 482 | 483 | RETURN result; 484 | END; 485 | $BODY$ LANGUAGE 'plpgsql'; 486 | 487 | 488 | /* 489 | * paxos_prepare performs a paxos_request_prepare RPC on all connected hosts. 490 | */ 491 | CREATE FUNCTION paxos_prepare( 492 | current_proposer_id text, 493 | current_group_id text, 494 | current_round_num bigint, 495 | current_proposal_num bigint) 496 | RETURNS SETOF prepare_response 497 | AS $BODY$ 498 | DECLARE 499 | prepare_query text; 500 | host record; 501 | num_conn int; 502 | BEGIN 503 | prepare_query := format('SELECT paxos_request_prepare(%s,%s,%s,%s)', 504 | quote_literal(current_proposer_id), 505 | quote_literal(current_group_id), 506 | current_round_num, 507 | current_proposal_num); 508 | 509 | PERFORM paxos_broadcast_query(prepare_query); 510 | 511 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 512 | RETURN QUERY 513 | SELECT (resp).* FROM dblink_get_result(host.connection_name, false) 514 | AS (resp prepare_response); 515 | END LOOP; 516 | 517 | PERFORM paxos_clear_connections(); 518 | END; 519 | $BODY$ LANGUAGE 'plpgsql'; 520 | 521 | 522 | /* 523 | * paxos_accept performs a paxos_request_accept RPC on all connected hosts. 524 | */ 525 | CREATE FUNCTION paxos_accept( 526 | current_proposer_id text, 527 | current_group_id text, 528 | current_round_num bigint, 529 | current_proposal_num bigint, 530 | proposed_value_id text, 531 | proposed_value text) 532 | RETURNS SETOF accept_response 533 | AS $BODY$ 534 | DECLARE 535 | accept_query text; 536 | host record; 537 | BEGIN 538 | accept_query := format('SELECT paxos_request_accept(%s,%s,%s,%s,%s,%s)', 539 | quote_literal(current_proposer_id), 540 | quote_literal(current_group_id), 541 | current_round_num, 542 | current_proposal_num, 543 | quote_literal(proposed_value_id), 544 | quote_literal(proposed_value)); 545 | 546 | PERFORM paxos_broadcast_query(accept_query); 547 | 548 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 549 | RETURN QUERY 550 | SELECT (resp).* FROM dblink_get_result(host.connection_name, false) 551 | AS (resp accept_response); 552 | END LOOP; 553 | 554 | PERFORM paxos_clear_connections(); 555 | END; 556 | $BODY$ LANGUAGE 'plpgsql'; 557 | 558 | 559 | /* 560 | * paxos_inform_learners performs a paxos_confirm_consensus RPC on all connected hosts. 561 | */ 562 | CREATE FUNCTION paxos_inform_learners( 563 | current_proposer_id text, 564 | current_group_id text, 565 | current_round_num bigint, 566 | current_proposal_num bigint, 567 | proposed_value_id text, 568 | proposed_value text) 569 | RETURNS void 570 | AS $BODY$ 571 | DECLARE 572 | confirm_query text; 573 | host record; 574 | BEGIN 575 | confirm_query := format('SELECT paxos_confirm_consensus(%s,%s,%s,%s,%s,%s)', 576 | quote_literal(current_proposer_id), 577 | quote_literal(current_group_id), 578 | current_round_num, 579 | current_proposal_num, 580 | quote_literal(proposed_value_id), 581 | quote_literal(proposed_value)); 582 | 583 | PERFORM paxos_broadcast_query(confirm_query); 584 | 585 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 586 | PERFORM * FROM dblink_get_result(host.connection_name, false) AS (resp boolean); 587 | END LOOP; 588 | 589 | PERFORM paxos_clear_connections(); 590 | END; 591 | $BODY$ LANGUAGE 'plpgsql'; 592 | 593 | 594 | /* 595 | * paxos_append appends proposed_value to the log of the group whose name is given by 596 | * current_group_id. 597 | */ 598 | CREATE FUNCTION paxos_append( 599 | current_proposer_id text, 600 | current_group_id text, 601 | proposed_value text) 602 | RETURNS bigint 603 | AS $BODY$ 604 | DECLARE 605 | current_round_num bigint; 606 | accepted_value text; 607 | value_not_logged boolean := true; 608 | BEGIN 609 | 610 | /* 611 | * We optimistically assume we that we know about all rounds and start from the 612 | * highest. Another node could be using the same or higher round ID, but if that 613 | * node reaches consensus on its value for that round we will retry paxos with 614 | * round ID + 1, until we succeed. 615 | */ 616 | SELECT Coalesce(max(round_num), -1) INTO current_round_num 617 | FROM pgp_metadata.round WHERE group_id = current_group_id; 618 | 619 | WHILE value_not_logged LOOP 620 | current_round_num := current_round_num + 1; 621 | 622 | SELECT * FROM paxos( 623 | current_proposer_id, 624 | current_group_id, 625 | current_round_num, 626 | proposed_value) INTO accepted_value, value_not_logged; 627 | END LOOP; 628 | 629 | RETURN current_round_num; 630 | END; 631 | $BODY$ LANGUAGE 'plpgsql'; 632 | 633 | 634 | /* 635 | * paxos_max_group_round returns the highest round number used among a majority of 636 | * nodes. If a majority cannot be reached, the function fails. If accepted_only is 637 | * true, the function returns the highest accepted round number among the nodes. 638 | */ 639 | CREATE FUNCTION paxos_max_group_round( 640 | current_group_id text, 641 | accepted_only boolean DEFAULT true) 642 | RETURNS bigint 643 | AS $BODY$ 644 | DECLARE 645 | num_hosts int; 646 | majority_size int; 647 | round_query text; 648 | max_round_num bigint := -1; 649 | num_responses int := 0; 650 | remote_round_num bigint; 651 | host record; 652 | BEGIN 653 | /* Set up connections */ 654 | SELECT paxos_init_group(current_group_id) INTO num_hosts; 655 | 656 | majority_size = num_hosts / 2 + 1; 657 | 658 | /* Ask nodes for highest round number in their log */ 659 | round_query := format('SELECT max(round_num) '|| 660 | 'FROM pgp_metadata.round '|| 661 | 'WHERE group_id = %s', 662 | quote_literal(current_group_id)); 663 | 664 | IF accepted_only THEN 665 | round_query := round_query || ' AND value_id IS NOT NULL'; 666 | END IF; 667 | 668 | PERFORM paxos_broadcast_query(round_query); 669 | 670 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 671 | SELECT resp INTO remote_round_num 672 | FROM dblink_get_result(host.connection_name, false) AS (resp bigint); 673 | 674 | IF remote_round_num IS NULL THEN 675 | CONTINUE; 676 | END IF; 677 | 678 | IF remote_round_num > max_round_num THEN 679 | max_round_num := remote_round_num; 680 | END IF; 681 | 682 | num_responses := num_responses + 1; 683 | END LOOP; 684 | 685 | PERFORM paxos_clear_connections(); 686 | 687 | IF num_responses < majority_size THEN 688 | RAISE 'could only get % out of % responses', num_responses, majority_size; 689 | END IF; 690 | 691 | RETURN max_round_num; 692 | END; 693 | $BODY$ LANGUAGE 'plpgsql'; 694 | 695 | 696 | /* 697 | * paxos_apply_log implements table replication through Paxos. It interprets the log of 698 | * the Paxos group whose name is in current_group_id as a sequence of SQL commands and 699 | * executes all commands up to and including the round with number max_round_num. 700 | */ 701 | CREATE FUNCTION paxos_apply_log( 702 | current_proposer_id text, 703 | current_group_id text, 704 | max_round_num bigint) 705 | RETURNS bigint 706 | AS $BODY$ 707 | DECLARE 708 | last_applied_round_num bigint; 709 | current_round_num bigint; 710 | current_membership_view bigint; 711 | post_membership_view bigint; 712 | query text; 713 | noop_written boolean; 714 | BEGIN 715 | /* Prevent other processes from applying the same log */ 716 | PERFORM pg_advisory_xact_lock(29030, hashtext(current_group_id)); 717 | 718 | SELECT last_applied_round INTO current_round_num 719 | FROM pgp_metadata.group 720 | WHERE group_id = current_group_id; 721 | 722 | SET LOCAL pg_paxos.enabled TO false; 723 | 724 | WHILE current_round_num < max_round_num LOOP 725 | 726 | current_round_num := current_round_num + 1; 727 | 728 | SELECT value INTO query 729 | FROM pgp_metadata.round 730 | WHERE group_id = current_group_id AND round_num = current_round_num AND consensus; 731 | 732 | IF NOT FOUND THEN 733 | SELECT * FROM paxos( 734 | current_proposer_id, 735 | current_group_id, 736 | current_round_num, 737 | '') INTO query, noop_written; 738 | END IF; 739 | 740 | RAISE DEBUG 'Executing: %', query; 741 | 742 | BEGIN 743 | EXECUTE query; 744 | EXCEPTION WHEN others THEN 745 | UPDATE pgp_metadata.round 746 | SET error = SQLERRM 747 | WHERE group_id = current_group_id AND round_num = current_round_num; 748 | END; 749 | 750 | END LOOP; 751 | 752 | UPDATE pgp_metadata.group 753 | SET last_applied_round = max_round_num 754 | WHERE group_id = current_group_id; 755 | 756 | RETURN max_round_num; 757 | END; 758 | $BODY$ LANGUAGE 'plpgsql'; 759 | 760 | 761 | /* 762 | * paxos_apply_and_append appends a new command to the log and applies all preceding 763 | * commands in lockstep fashion. 764 | */ 765 | CREATE FUNCTION paxos_apply_and_append( 766 | current_proposer_id text, 767 | current_group_id text, 768 | proposed_value text) 769 | RETURNS bigint 770 | AS $BODY$ 771 | DECLARE 772 | current_round_num bigint; 773 | value_not_logged boolean := true; 774 | query text; 775 | BEGIN 776 | SELECT Coalesce(max(round_num), -1) INTO current_round_num 777 | FROM pgp_metadata.round WHERE group_id = current_group_id; 778 | 779 | WHILE value_not_logged LOOP 780 | PERFORM paxos_apply_log( 781 | current_proposer_id, 782 | current_group_id, 783 | current_round_num); 784 | 785 | current_round_num := current_round_num + 1; 786 | 787 | SELECT * FROM paxos( 788 | current_proposer_id, 789 | current_group_id, 790 | current_round_num, 791 | proposed_value) INTO query, value_not_logged; 792 | END LOOP; 793 | 794 | RETURN current_round_num; 795 | END; 796 | $BODY$ LANGUAGE 'plpgsql'; 797 | 798 | 799 | /* 800 | * paxos_add_host appends a command that adds the given host to the Paxos group 801 | * starting from the round in which this command is logged + 1. 802 | */ 803 | CREATE FUNCTION paxos_add_host( 804 | current_proposer_id text, 805 | current_group_id text, 806 | connection_string text) 807 | RETURNS bigint 808 | AS $BODY$ 809 | DECLARE 810 | current_round_num bigint; 811 | proposed_value text; 812 | accepted_value text; 813 | value_not_logged boolean := true; 814 | BEGIN 815 | SELECT Coalesce(max(round_num), -1) INTO current_round_num 816 | FROM pgp_metadata.round WHERE group_id = current_group_id; 817 | 818 | WHILE value_not_logged LOOP 819 | PERFORM paxos_apply_log( 820 | current_proposer_id, 821 | current_group_id, 822 | current_round_num); 823 | 824 | current_round_num := current_round_num + 1; 825 | 826 | proposed_value := format('INSERT INTO pgp_metadata.host VALUES (%s,%s,%s)', 827 | quote_literal(current_group_id), 828 | quote_literal(connection_string), 829 | current_round_num+1); 830 | SELECT * FROM paxos( 831 | current_proposer_id, 832 | current_group_id, 833 | current_round_num, 834 | proposed_value) INTO accepted_value, value_not_logged; 835 | END LOOP; 836 | 837 | RETURN current_round_num; 838 | END; 839 | $BODY$ LANGUAGE 'plpgsql'; 840 | 841 | 842 | /* 843 | * paxos_remove_host appends a command that removes the given host from the Paxos group 844 | * starting from the round after the one in which this command is logged. 845 | */ 846 | CREATE FUNCTION paxos_remove_host( 847 | current_proposer_id text, 848 | current_group_id text, 849 | connection_string text) 850 | RETURNS bigint 851 | AS $BODY$ 852 | DECLARE 853 | current_round_num bigint; 854 | proposed_value text; 855 | accepted_value text; 856 | value_not_logged boolean := true; 857 | BEGIN 858 | SELECT Coalesce(max(round_num), -1) INTO current_round_num 859 | FROM pgp_metadata.round WHERE group_id = current_group_id; 860 | 861 | WHILE value_not_logged LOOP 862 | PERFORM paxos_apply_log( 863 | current_proposer_id, 864 | current_group_id, 865 | current_round_num); 866 | 867 | current_round_num := current_round_num + 1; 868 | 869 | proposed_value := format('UPDATE pgp_metadata.host '|| 870 | 'SET max_round_num = %s '|| 871 | 'WHERE group_id = %s '|| 872 | 'AND connection_string = %s ', 873 | current_round_num, 874 | quote_literal(current_group_id), 875 | quote_literal(connection_string)); 876 | 877 | SELECT * FROM paxos( 878 | current_proposer_id, 879 | current_group_id, 880 | current_round_num, 881 | proposed_value) INTO accepted_value, value_not_logged; 882 | END LOOP; 883 | 884 | RETURN current_round_num; 885 | END; 886 | $BODY$ LANGUAGE 'plpgsql'; 887 | 888 | 889 | /* 890 | * paxos_init_group finds a current set of hosts for a given Paxos group opens 891 | * connections to them. 892 | */ 893 | CREATE FUNCTION paxos_init_group( 894 | current_group_id text) 895 | RETURNS int 896 | AS $BODY$ 897 | DECLARE 898 | max_local_round bigint; 899 | num_hosts int; 900 | majority_size int; 901 | num_open_connections int; 902 | round_query text; 903 | host record; 904 | BEGIN 905 | /* Always use the most recent membership view (maximum bigint) */ 906 | SELECT paxos_find_hosts(current_group_id, 9223372036854775807) INTO num_hosts; 907 | 908 | majority_size = num_hosts / 2 + 1; 909 | 910 | /* Try to open connections to a majority of hosts */ 911 | SELECT paxos_open_connections(majority_size) INTO num_open_connections; 912 | 913 | return num_hosts; 914 | END; 915 | $BODY$ LANGUAGE 'plpgsql'; 916 | 917 | 918 | /* 919 | * paxos_find_hosts stores a snapshot of the hosts for a given round in a temporary 920 | * table named "hosts". 921 | */ 922 | CREATE FUNCTION paxos_find_hosts( 923 | current_group_id text, 924 | current_round_num bigint) 925 | RETURNS int 926 | AS $BODY$ 927 | DECLARE 928 | num_hosts int; 929 | BEGIN 930 | 931 | IF NOT EXISTS ( 932 | SELECT relname 933 | FROM pg_class 934 | WHERE relnamespace = pg_my_temp_schema() AND relname = 'pg_paxos_hosts') THEN 935 | 936 | CREATE TEMPORARY TABLE IF NOT EXISTS pg_paxos_hosts ( 937 | connection_name text, 938 | connection_string text, 939 | connected boolean 940 | ); 941 | 942 | END IF; 943 | 944 | DELETE FROM pg_paxos_hosts; 945 | 946 | INSERT INTO pg_paxos_hosts 947 | SELECT connection_string AS connection_name, 948 | connection_string, 949 | false AS connected 950 | FROM pgp_metadata.host 951 | WHERE group_id = current_group_id 952 | AND min_round_num <= current_round_num 953 | AND (max_round_num IS NULL OR current_round_num <= max_round_num) 954 | GROUP BY connection_string; 955 | 956 | SELECT count(*) INTO num_hosts FROM pg_paxos_hosts; 957 | 958 | RETURN num_hosts; 959 | END; 960 | $BODY$ LANGUAGE 'plpgsql'; 961 | 962 | 963 | /* 964 | * paxos_open_connections tries to open connections to all the hosts in the "hosts" 965 | * table. If there are at least min_connections connections open at the start of this 966 | * function, then no new connections are opened. 967 | */ 968 | CREATE FUNCTION paxos_open_connections(min_connections int) 969 | RETURNS int 970 | AS $BODY$ 971 | DECLARE 972 | num_open_connections int; 973 | host record; 974 | connection_open boolean; 975 | BEGIN 976 | num_open_connections := 0; 977 | 978 | FOR host IN 979 | SELECT h.connection_name, h.connection_string, c.connected 980 | FROM pg_paxos_hosts h LEFT OUTER JOIN 981 | (SELECT unnest AS connected 982 | FROM unnest(dblink_get_connections())) c ON (h.connection_name = c.connected) LOOP 983 | 984 | IF host.connected IS NOT NULL THEN 985 | IF dblink_error_message(host.connection_name) = 'OK' THEN 986 | /* We previously opened this connection */ 987 | num_open_connections := num_open_connections + 1; 988 | UPDATE pg_paxos_hosts SET connected = true 989 | WHERE connection_name = host.connection_name; 990 | ELSE 991 | /* Close connections that have errored out, will not be used next round */ 992 | RAISE WARNING 'connection error: %', dblink_error_message(host.connection_name); 993 | PERFORM dblink_disconnect(host.connection_name); 994 | UPDATE pg_paxos_hosts SET connected = false 995 | WHERE connection_name = host.connection_name; 996 | END IF; 997 | END IF; 998 | END LOOP; 999 | 1000 | /* If we already have the minimum number of connections open, we're done */ 1001 | IF num_open_connections >= min_connections THEN 1002 | RETURN num_open_connections; 1003 | END IF; 1004 | 1005 | /* Otherwise, try to open as many connections as possible (bias towards reads) */ 1006 | FOR host IN 1007 | SELECT h.connection_name, h.connection_string, c.connected 1008 | FROM pg_paxos_hosts h LEFT OUTER JOIN 1009 | (SELECT unnest AS connected 1010 | FROM unnest(dblink_get_connections())) c ON (h.connection_name = c.connected) LOOP 1011 | 1012 | IF host.connected IS NULL THEN 1013 | /* Open new connection */ 1014 | BEGIN 1015 | PERFORM dblink_connect(host.connection_name, host.connection_string); 1016 | num_open_connections := num_open_connections + 1; 1017 | UPDATE pg_paxos_hosts SET connected = true 1018 | WHERE connection_name = host.connection_name; 1019 | EXCEPTION WHEN OTHERS THEN 1020 | RAISE WARNING 'failed to connect to %', host.connection_string; 1021 | UPDATE pg_paxos_hosts SET connected = false 1022 | WHERE connection_name = host.connection_name; 1023 | END; 1024 | END IF; 1025 | END LOOP; 1026 | 1027 | RETURN num_open_connections; 1028 | END; 1029 | $BODY$ LANGUAGE 'plpgsql'; 1030 | 1031 | 1032 | /* 1033 | * paxos_broadcast_query sends a query to all connected hosts. 1034 | */ 1035 | CREATE FUNCTION paxos_broadcast_query(query_string text) 1036 | RETURNS void 1037 | AS $BODY$ 1038 | DECLARE 1039 | host record; 1040 | BEGIN 1041 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 1042 | BEGIN 1043 | PERFORM dblink_send_query(host.connection_name, query_string); 1044 | EXCEPTION WHEN OTHERS THEN 1045 | PERFORM dblink_disconnect(host.connection_name); 1046 | UPDATE pg_paxos_hosts SET connected = false 1047 | WHERE connection_name = host.connection_name; 1048 | END; 1049 | END LOOP; 1050 | END; 1051 | $BODY$ LANGUAGE 'plpgsql'; 1052 | 1053 | 1054 | /* 1055 | * paxos_clear_connections calls dblink_get_result on open connections to ensure that 1056 | * the connection can be reused. 1057 | */ 1058 | CREATE FUNCTION paxos_clear_connections() 1059 | RETURNS void 1060 | AS $BODY$ 1061 | DECLARE 1062 | host record; 1063 | BEGIN 1064 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 1065 | /* Need to call get_result until it returns NULL to clear the connection */ 1066 | PERFORM * FROM dblink_get_result(host.connection_name, false) AS (resp int); 1067 | END LOOP; 1068 | END; 1069 | $BODY$ LANGUAGE 'plpgsql'; 1070 | 1071 | 1072 | /* 1073 | * paxos_close_connections closes all open connections. 1074 | */ 1075 | CREATE FUNCTION paxos_close_connections() 1076 | RETURNS void 1077 | AS $BODY$ 1078 | DECLARE 1079 | host record; 1080 | BEGIN 1081 | FOR host IN SELECT * FROM pg_paxos_hosts WHERE connected LOOP 1082 | PERFORM dblink_disconnect(host.connection_name); 1083 | END LOOP; 1084 | END; 1085 | $BODY$ LANGUAGE 'plpgsql'; 1086 | 1087 | 1088 | /* 1089 | * paxos_execute executes a query locally, by-passing the Paxos query logging logic. 1090 | */ 1091 | CREATE FUNCTION paxos_execute(INTERNAL) 1092 | RETURNS void 1093 | LANGUAGE C 1094 | AS 'MODULE_PATHNAME', $$paxos_execute$$; 1095 | 1096 | 1097 | /* 1098 | * paxos_create_group creates a new Paxos group with a single member. 1099 | */ 1100 | CREATE FUNCTION paxos_create_group( 1101 | current_group_id text, 1102 | founder_connection_string text) 1103 | RETURNS void 1104 | AS $BODY$ 1105 | BEGIN 1106 | INSERT INTO pgp_metadata.group ( 1107 | group_id, 1108 | last_applied_round, 1109 | own_connection_string) 1110 | VALUES (current_group_id, 1111 | 0, 1112 | founder_connection_string); 1113 | 1114 | INSERT INTO pgp_metadata.host ( 1115 | group_id, 1116 | connection_string, 1117 | min_round_num) 1118 | VALUES (current_group_id, 1119 | founder_connection_string, 1120 | 1); 1121 | 1122 | INSERT INTO pgp_metadata.round( 1123 | group_id, 1124 | round_num, 1125 | min_proposal_num, 1126 | proposer_id, 1127 | value_id, 1128 | value, 1129 | accepted_proposal_num, 1130 | consensus) 1131 | VALUES (current_group_id, 1132 | 0, 1133 | 0, 1134 | 'pg_paxos initial', 1135 | 'pg_paxos initial', 1136 | format('INSERT INTO pgp_metadata.host VALUES (%s,%s)', 1137 | quote_literal(current_group_id), 1138 | quote_literal(founder_connection_string)), 1139 | 0, 1140 | true); 1141 | END; 1142 | $BODY$ LANGUAGE plpgsql; 1143 | 1144 | 1145 | /* 1146 | * paxos_join_group joins the node on which the function is called to a Paxos 1147 | * group. 1148 | */ 1149 | CREATE FUNCTION paxos_join_group( 1150 | current_group_id text, 1151 | existing_connection_string text, 1152 | own_connection_string text) 1153 | RETURNS void 1154 | AS $BODY$ 1155 | DECLARE 1156 | test_query text; 1157 | current_proposer_id text := 'join/'||own_connection_string||'/'||txid_current(); 1158 | group_query text; 1159 | table_query text; 1160 | replicated_table record; 1161 | round_query text; 1162 | host_query text; 1163 | BEGIN 1164 | PERFORM dblink_connect('paxos_join_group', existing_connection_string); 1165 | 1166 | /* Ensure we get a consistent snapshot */ 1167 | PERFORM dblink_exec('paxos_join_group', 'BEGIN'); 1168 | PERFORM dblink_exec('paxos_join_group', 'SET TRANSACTION '|| 1169 | 'ISOLATION LEVEL SERIALIZABLE'); 1170 | PERFORM dblink_exec('paxos_join_group', 'SET LOCAL pg_paxos.enabled TO false'); 1171 | 1172 | /* Verify connection string */ 1173 | test_query := format('SELECT * FROM dblink(%s,''SELECT 1'') AS (one int)', 1174 | quote_literal(own_connection_string)); 1175 | 1176 | PERFORM * FROM dblink('paxos_join_group', test_query) AS (one int); 1177 | 1178 | /* Copy the group table from the existing node */ 1179 | group_query := format('SELECT (%s) '|| 1180 | 'FROM pgp_metadata.group '|| 1181 | 'WHERE group_id = %s', 1182 | table_column_names('pgp_metadata.group'), 1183 | quote_literal(current_group_id)); 1184 | 1185 | INSERT INTO pgp_metadata.group 1186 | SELECT (res).group_id, (res).last_applied_round, own_connection_string 1187 | FROM dblink('paxos_join_group', group_query) AS (res pgp_metadata.group); 1188 | 1189 | /* Copy the hosts table from the existing node */ 1190 | host_query := format('SELECT (%s) '|| 1191 | 'FROM pgp_metadata.host '|| 1192 | 'WHERE group_id = %s', 1193 | table_column_names('pgp_metadata.host'), 1194 | quote_literal(current_group_id)); 1195 | 1196 | INSERT INTO pgp_metadata.host SELECT (res).* 1197 | FROM dblink('paxos_join_group', host_query) AS (res pgp_metadata.host); 1198 | 1199 | /* Copy the replicated tables table from the existing node */ 1200 | table_query := format('SELECT (%s) '|| 1201 | 'FROM pgp_metadata.replicated_tables '|| 1202 | 'WHERE group_id = %s', 1203 | table_column_names('pgp_metadata.replicated_tables'), 1204 | quote_literal(current_group_id)); 1205 | 1206 | INSERT INTO pgp_metadata.replicated_tables SELECT (res).* 1207 | FROM dblink('paxos_join_group', table_query) AS (res pgp_metadata.replicated_tables); 1208 | 1209 | SET LOCAL pg_paxos.enabled TO false; 1210 | 1211 | /* Copy the replicated tables from the existing node */ 1212 | FOR replicated_table IN 1213 | SELECT * FROM pgp_metadata.replicated_tables WHERE group_id = current_group_id LOOP 1214 | EXECUTE format('TRUNCATE %I', quote_ident(replicated_table.table_oid::text)); 1215 | 1216 | EXECUTE format('INSERT INTO %I SELECT (res).* '|| 1217 | 'FROM dblink(''paxos_join_group'','|| 1218 | '''SELECT (%s) FROM %I'') AS (res %I)', 1219 | quote_ident(replicated_table.table_oid::text), 1220 | table_column_names(replicated_table.table_oid), 1221 | quote_ident(replicated_table.table_oid::text), 1222 | quote_ident(replicated_table.table_oid::text)); 1223 | END LOOP; 1224 | 1225 | /* Copy the round table from the existing node */ 1226 | round_query := format('SELECT (%s) '|| 1227 | 'FROM pgp_metadata.round '|| 1228 | 'WHERE group_id = %s', 1229 | table_column_names('pgp_metadata.round'), 1230 | quote_literal(current_group_id)); 1231 | 1232 | INSERT INTO pgp_metadata.round SELECT (res).* 1233 | FROM dblink('paxos_join_group', round_query) AS (res pgp_metadata.round); 1234 | 1235 | PERFORM dblink_exec('paxos_join_group', 'END'); 1236 | PERFORM dblink_disconnect('paxos_join_group'); 1237 | 1238 | /* Start actively participating in the Paxos group */ 1239 | PERFORM paxos_add_host( 1240 | current_proposer_id, 1241 | current_group_id, 1242 | own_connection_string); 1243 | 1244 | EXCEPTION WHEN OTHERS THEN 1245 | PERFORM dblink_disconnect('paxos_join_group'); 1246 | RAISE EXCEPTION '%', SQLERRM; 1247 | END; 1248 | $BODY$ LANGUAGE plpgsql; 1249 | 1250 | 1251 | /* 1252 | * paxos_leave_group removes the node on which the function is called from a Paxos 1253 | * group, leaving the replicated tables in their final state at the moment this 1254 | * function is called. 1255 | */ 1256 | CREATE FUNCTION paxos_leave_group( 1257 | current_group_id text) 1258 | RETURNS void 1259 | AS $BODY$ 1260 | DECLARE 1261 | connection_string text; 1262 | current_proposer_id text; 1263 | BEGIN 1264 | SELECT own_connection_string INTO connection_string 1265 | FROM pgp_metadata.group WHERE group_id = current_group_id; 1266 | 1267 | current_proposer_id := 'leave/'||connection_string||'/'||txid_current(); 1268 | 1269 | PERFORM paxos_remove_host( 1270 | current_proposer_id, 1271 | current_group_id, 1272 | connection_string); 1273 | 1274 | DELETE FROM pgp_metadata.replicated_tables WHERE group_id = current_group_id; 1275 | DELETE FROM pgp_metadata.host WHERE group_id = current_group_id; 1276 | DELETE FROM pgp_metadata.round WHERE group_id = current_group_id; 1277 | DELETE FROM pgp_metadata.group WHERE group_id = current_group_id; 1278 | END; 1279 | $BODY$ LANGUAGE plpgsql; 1280 | 1281 | 1282 | /* 1283 | * paxos_replicate_table replicates the table identified by new_table_oid within 1284 | * the Paxos group identified by current_group_id. 1285 | */ 1286 | CREATE FUNCTION paxos_replicate_table( 1287 | current_group_id text, 1288 | new_table_oid regclass) 1289 | RETURNS void 1290 | AS $BODY$ 1291 | BEGIN 1292 | INSERT INTO pgp_metadata.replicated_tables ( 1293 | table_oid, 1294 | group_id) 1295 | VALUES (new_table_oid, 1296 | current_group_id); 1297 | END; 1298 | $BODY$ LANGUAGE plpgsql; 1299 | 1300 | 1301 | /* 1302 | * table_column_names returns a comma-separate string of column names. 1303 | */ 1304 | CREATE FUNCTION table_column_names(table_id regclass) 1305 | RETURNS text 1306 | AS $BODY$ 1307 | DECLARE 1308 | column_string text; 1309 | BEGIN 1310 | SELECT string_agg(col,',') INTO column_string 1311 | FROM (SELECT a.attname col 1312 | FROM pg_attribute a 1313 | WHERE attrelid = table_id AND attnum > 0 ORDER BY attnum) cols; 1314 | 1315 | RETURN column_string; 1316 | END; 1317 | $BODY$ LANGUAGE plpgsql; 1318 | 1319 | --------------------------------------------------------------------------------