├── .gitignore ├── .gitlab-ci.yml ├── Cluster.pm ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── doc ├── multimaster.xml ├── multimaster_book.xml ├── readme.md ├── specs │ ├── .gitignore │ ├── MtmGenerations.cfg │ ├── MtmGenerations.tla │ ├── MtmPrimitiveCurrent.cfg │ ├── MtmPrimitiveCurrent.tla │ ├── MtmPrimitiveCurrentMasks.cfg │ ├── MtmPrimitiveCurrentMasks.tla │ ├── MtmPrimitiveCurrentMasksFixed.cfg │ ├── MtmPrimitiveCurrentMasksFixed.tla │ ├── commit.cfg │ ├── commit.md │ ├── commit.tla │ ├── generations2.md │ └── mm_recovery.ipynb ├── stylesheet.css └── stylesheet.xsl ├── expected ├── atx.out ├── multimaster.out ├── regression_ee.diff └── regression_vanilla.diff ├── multimaster--1.0.sql ├── multimaster.control ├── referee ├── Makefile ├── expected │ └── referee.out ├── referee--1.0.sql ├── referee.control └── sql │ └── referee.sql ├── run.pl ├── sql ├── atx.sql └── multimaster.sql ├── src ├── bgwpool.c ├── bkb.c ├── bytebuf.c ├── commit.c ├── ddd.c ├── ddl.c ├── dmq.c ├── global_tx.c ├── include │ ├── bgwpool.h │ ├── bkb.h │ ├── bytebuf.h │ ├── commit.h │ ├── compat.h │ ├── ddd.h │ ├── ddl.h │ ├── dmq.h │ ├── global_tx.h │ ├── logger.h │ ├── messaging.h │ ├── mtm_utils.h │ ├── multimaster.h │ ├── pglogical_config.h │ ├── pglogical_hooks.h │ ├── pglogical_output.h │ ├── pglogical_output │ │ ├── compat.h │ │ └── hooks.h │ ├── pglogical_proto.h │ ├── pglogical_relid_map.h │ ├── receiver.h │ ├── resolver.h │ ├── spill.h │ ├── state.h │ └── syncpoint.h ├── mtm_utils.c ├── multimaster.c ├── pglogical_apply.c ├── pglogical_config.c ├── pglogical_hooks.c ├── pglogical_output.c ├── pglogical_proto.c ├── pglogical_receiver.c ├── pglogical_relid_map.c ├── resolver.c ├── spill.c ├── state.c ├── syncpoint.c └── test_bkb.sage.py ├── t ├── 000_cross._pl ├── 000_deadlock.pl ├── 000_init._pl ├── 001_regress.pl ├── 002_regressmm.pl ├── 003_basic_recovery.pl ├── 004_recovery.pl ├── 005_pgbench.pl ├── 006_pgbenchdl.pl ├── 007_add_stop_node.pl ├── 008_bugfixes.pl └── 
009_identity_func.pl ├── tests ├── .gitignore ├── Pipfile ├── Pipfile.lock ├── deadl.pgb ├── docker-compose.yml ├── docker-entrypoint.sh ├── lib │ ├── __init__.py │ ├── bank_client.py │ ├── failure_injector.py │ ├── log_helper.py │ └── test_helper.py ├── reader.pgb ├── requirements.txt ├── support │ ├── bumptime.c │ ├── docker-regress.sh │ └── two_nodes.yml ├── test_bkb.sage.py ├── test_recovery_random.py ├── test_referee.py ├── test_regression.py ├── test_syncpoint.py └── writer.pgb └── tests_testgres ├── .gitignore ├── connect.jsh ├── ddl.py ├── mm_cluster.py ├── run_tests.sh ├── test_failover.py └── tests ├── __init__.py ├── bootstrap.py └── truncate.py /.gitignore: -------------------------------------------------------------------------------- 1 | /log/ 2 | /results/ 3 | /tmp_check/ 4 | regression.diff.diff -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Run python tests. 2 | 3 | image: pgpgpro/dev:stretch 4 | 5 | .only-default: &only-default 6 | only: 7 | refs: 8 | - merge_requests 9 | - tags 10 | - schedules 11 | - branches 12 | - pushes 13 | - web 14 | - triggers 15 | changes: 16 | - '**/*' 17 | 18 | # Tests are docker-based, and so is gitlab executor itself. We are using a bit 19 | # monstrous (and recommended) approach of running dind 'service' container 20 | # alongside main executor; it runs docker and exposes its socket: 21 | # https://docs.gitlab.com/ee/ci/docker/using_docker_build.html#use-the-docker-executor-with-the-docker-image-docker-in-docker 22 | # These variables tell the executor how to reach the socket. 
23 | # 24 | # The 'docker' hostname is the alias of the service container as described at 25 | # https://docs.gitlab.com/ee/ci/docker/using_docker_images.html#accessing-the-services 26 | .docker_variables: &docker_variables 27 | DOCKER_HOST: tcp://docker:2375/ 28 | # When using dind, it's wise to use the overlayfs driver for 29 | # improved performance. 30 | DOCKER_DRIVER: overlay2 31 | DOCKER_TLS_CERTDIR: "" 32 | 33 | stages: 34 | - build_core_image 35 | - make_check 36 | # hardcoded stuff in python tests doesn't allow to run them in parallel 37 | - recovery random 38 | - referee 39 | - syncpoint 40 | 41 | # builds image with ee core and saves it as an artifact 42 | build_core_image: 43 | <<: *only-default 44 | stage: build_core_image 45 | retry: 1 46 | image: pgpgpro/dev:alpine 47 | # run container providing docker alongside 48 | services: 49 | - docker:dind 50 | variables: 51 | <<: *docker_variables 52 | branch: ee13_mm 53 | artifacts: 54 | expire_in: 24 hours 55 | when: always 56 | paths: 57 | - docker-image/pgmm.tar.gz 58 | - postgrespro.tar.gz 59 | script: 60 | # Add mm_gitlab_ci_ed25519 env var of type 'file' with the key in gitlab 61 | - ssh-agent sh -c 'ssh-add ${mm_gitlab_ci_ed25519}; GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone --depth=1 --branch "${branch}" git@git.postgrespro.ru:pgpro-dev/postgrespro.git' 62 | - cd postgrespro 63 | - docker build -t pgmm . 64 | - cd .. 65 | - mkdir docker-image 66 | - docker save pgmm > docker-image/pgmm.tar 67 | - tar czf docker-image/pgmm.tar.gz docker-image/pgmm.tar 68 | - ls -lah docker-image/ 69 | - rm docker-image/pgmm.tar 70 | # also save archived sources of core 71 | - tar -czf postgrespro.tar.gz postgrespro 72 | 73 | # make check. 
We build core from sources again which is a bit ugly as we already 74 | # built the image, but let's not wobble here with yet another docker 75 | make_check: 76 | <<: *only-default 77 | stage: make_check 78 | # gives us the archive with core sources 79 | dependencies: 80 | - build_core_image 81 | artifacts: 82 | when: always 83 | paths: 84 | - postgrespro/contrib/mmts/tmp_check/log 85 | - postgrespro/contrib/mmts/tmp_check/regress_outdir 86 | script: 87 | - ls 88 | - tar -xzf postgrespro.tar.gz 89 | - shopt -s extglob 90 | - rm -rf postgrespro/contrib/mmts; mkdir postgrespro/contrib/mmts 91 | - mv !(postgrespro) postgrespro/contrib/mmts 92 | - cd postgrespro 93 | - CFLAGS="-ggdb3 -O0" ./configure --enable-cassert --enable-debug --with-perl --enable-tap-tests 94 | - make -j8 95 | - cd contrib/mmts && make check 96 | 97 | recovery_random: 98 | <<: *only-default 99 | stage: recovery random 100 | image: pgpgpro/dev:alpine 101 | services: 102 | - docker:dind 103 | dependencies: 104 | - build_core_image 105 | artifacts: 106 | when: on_failure 107 | paths: 108 | - tests/logs1 109 | - tests/logs2 110 | - tests/logs3 111 | variables: 112 | <<: *docker_variables 113 | before_script: 114 | - docker info 115 | script: 116 | - tar -xzvf docker-image/pgmm.tar.gz 117 | - docker load -i docker-image/pgmm.tar 118 | - cd tests/ 119 | - env CI=1 python3 -u test_recovery_random.py --failfast 120 | 121 | referee: 122 | extends: recovery_random 123 | stage: referee 124 | artifacts: 125 | paths: 126 | - tests/logs1 127 | - tests/logs2 128 | - tests/logs_referee 129 | script: 130 | - tar -xzvf docker-image/pgmm.tar.gz 131 | - docker load -i docker-image/pgmm.tar 132 | - cd tests/ 133 | - env CI=1 python3 -u test_referee.py --failfast 134 | 135 | syncpoint: 136 | extends: recovery_random 137 | stage: syncpoint 138 | script: 139 | - tar -xzvf docker-image/pgmm.tar.gz 140 | - docker load -i docker-image/pgmm.tar 141 | - cd tests/ 142 | - env CI=1 python3 -u test_syncpoint.py --failfast 143 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pgmm 2 | 3 | RUN mkdir /pg/mmts 4 | COPY ./ /pg/mmts/ 5 | 6 | RUN export USE_PGXS=1 && \ 7 | cd /pg/mmts && make clean && make install 8 | 9 | # pg_regress client assumes such dir exists on server 10 | RUN cp /pg/src/src/test/regress/*.so /pg/install/lib/postgresql/ 11 | USER postgres 12 | ENV PGDATA /pg/data 13 | ENTRYPOINT ["/pg/mmts/tests/docker-entrypoint.sh"] 14 | 15 | EXPOSE 5432 16 | CMD ["postgres"] 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = multimaster 2 | DATA = multimaster--1.0.sql 3 | OBJS = src/multimaster.o src/dmq.o src/commit.o src/bytebuf.o src/bgwpool.o \ 4 | src/pglogical_output.o src/pglogical_proto.o src/pglogical_receiver.o \ 5 | src/pglogical_apply.o src/pglogical_hooks.o src/pglogical_config.o \ 6 | src/pglogical_relid_map.o src/ddd.o src/bkb.o src/spill.o src/state.o \ 7 | src/resolver.o src/ddl.o src/syncpoint.o src/global_tx.o src/mtm_utils.o 8 | MODULE_big = multimaster 9 | 10 | ifndef USE_PGXS # hmm, user didn't requested to use pgxs 11 | # relative path to this makefile 12 | mkfile_path := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) 13 | # relative path to dir with this makefile 14 | mkfile_dir := $(dir $(mkfile_path)) 15 | # abs path to dir with this makefile 16 | mkfile_abspath := $(shell cd $(mkfile_dir) && pwd -P) 17 | # parent dir name of directory with makefile 18 | parent_dir_name := $(shell basename $(shell dirname $(mkfile_abspath))) 19 | ifneq ($(parent_dir_name),contrib) # a-ha, but the extension is not inside 'contrib' dir 20 | USE_PGXS := 1 # so use it anyway, most probably that's what the user wants 21 | endif 22 | endif 23 | # $(info) is introduced in 3.81, and PG doesn't support makes 
older than 3.80 24 | # ifeq ($(MAKE_VERSION),3.80) 25 | # $(warning $$USE_PGXS is [${USE_PGXS}] (we use it automatically if not in contrib dir)) 26 | # else 27 | # $(info $$USE_PGXS is [${USE_PGXS}] (we use it automatically if not in contrib dir)) 28 | # endif 29 | 30 | ifdef USE_PGXS # use pgxs 31 | # You can specify path to pg_config in PG_CONFIG var 32 | ifndef PG_CONFIG 33 | PG_CONFIG := pg_config 34 | endif 35 | PG_CPPFLAGS += -I$(CURDIR)/src/include 36 | # add installation top include directory for libpq header 37 | # (seems like server/ dir is added by pgxs) 38 | PG_CPPFLAGS += -I$(shell $(PG_CONFIG) --includedir) 39 | SHLIB_LINK += -lpq # add libpq 40 | PGXS := $(shell $(PG_CONFIG) --pgxs) 41 | include $(PGXS) 42 | 43 | else # assume the extension is in contrib/ dir of pg distribution 44 | PG_CPPFLAGS += -I$(top_srcdir)/$(subdir)/src/include 45 | PG_CPPFLAGS += -I$(libpq_srcdir) # include libpq-fe, defined in Makefile.global.in 46 | SHLIB_LINK = $(libpq) # defined in Makefile.global.in 47 | subdir = contrib/mmts 48 | top_builddir = ../.. 49 | include $(top_builddir)/src/Makefile.global 50 | # in ee, install pathman as well 51 | ifeq (${PGPRO_EDITION}, enterprise) 52 | EXTRA_INSTALL=contrib/pg_pathman 53 | endif 54 | include $(top_srcdir)/contrib/contrib-global.mk 55 | endif # USE_PGXS 56 | 57 | REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX) 58 | export REGRESS_SHLIB 59 | 60 | .PHONY: all 61 | 62 | # recurse down to referee/ on install. 63 | # (I'd use $(call recurse...), but how can we pass USE_PGXS there? 
64 | referee-install: 65 | USE_PGXS=$(USE_PGXS) $(MAKE) -C referee install 66 | install: referee-install 67 | 68 | all: multimaster.so 69 | 70 | submake-regress: 71 | $(MAKE) -C $(top_builddir)/src/test/regress all 72 | $(MAKE) -C $(top_builddir)/src/test/regress tablespace-setup 73 | 74 | # all .pl tests should pass now, but let's see what the buildfarm says 75 | # ifndef MTM_ALL 76 | # PROVE_TESTS ?= 77 | # endif 78 | PROVE_FLAGS += --timer 79 | ifndef USE_PGXS 80 | check: temp-install submake-regress 81 | $(prove_check) 82 | else # pgxs build 83 | # Note that for PGXS build we override here bail-out recipe defined in pgxs.mk, 84 | # but well, why should we chose another name? 85 | # submake-regress won't work as we have no access to the source; we assume 86 | # regress is already installed 87 | # final spell is inspired by 88 | # https://www.2ndquadrant.com/en/blog/using-postgresql-tap-framework-extensions/ 89 | # and Makefile.global.in which is obviously the original source 90 | check: 91 | rm -rf '$(CURDIR)'/tmp_check 92 | $(MKDIR_P) '$(CURDIR)'/tmp_check 93 | PGXS=$(PGXS) TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl) 94 | endif 95 | 96 | # PG_PROVE_FLAGS adds PostgresNode and friends include dir 97 | start: temp-install 98 | rm -rf '$(CURDIR)'/tmp_check 99 | $(MKDIR_P) '$(CURDIR)'/tmp_check 100 | cd $(srcdir) && TESTDIR='$(CURDIR)' \ 101 | $(with_temp_install) \ 102 | PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' \ 103 | perl $(PG_PROVE_FLAGS) run.pl --action=start $(RUN_OPTS) 104 | 105 | stop: 106 | cd $(srcdir) && TESTDIR='$(CURDIR)' \ 107 | PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' \ 108 | perl $(PG_PROVE_FLAGS) run.pl --action=stop $(RUN_OPTS) 109 | 110 | # for manual testing: runs core regress tests on 'make start'ed cluster 111 | run-pg-regress: submake-regress 112 | cd 
$(CURDIR)/$(top_builddir)/src/test/regress && \ 113 | $(with_temp_install) \ 114 | PGPORT='65432' \ 115 | PGHOST='127.0.0.1' \ 116 | PGUSER='$(USER)' \ 117 | ./pg_regress \ 118 | --bindir='' \ 119 | --use-existing \ 120 | --schedule=$(abs_top_srcdir)/src/test/regress/parallel_schedule \ 121 | --dlpath=$(CURDIR)/$(top_builddir)/src/test/regress \ 122 | --inputdir=$(abs_top_srcdir)/src/test/regress 123 | 124 | # for manual testing: runs contrib/test_partition on 'make start'ed cluster 125 | run-pathman-regress: 126 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \ 127 | $(with_temp_install) \ 128 | PGPORT='65432' \ 129 | PGHOST='127.0.0.1' \ 130 | PGUSER='$(USER)' \ 131 | ./pg_regress \ 132 | --bindir='' \ 133 | --use-existing \ 134 | --temp-config=$(abs_top_srcdir)/contrib/test_partition/pg_pathman.add \ 135 | --inputdir=$(abs_top_srcdir)/contrib/test_partition/ \ 136 | partition 137 | 138 | 139 | # bgw-based partition spawning is not supported by mm, so I 140 | # commenting out body of set_spawn_using_bgw() sql function before 141 | # running that 142 | run-pathman-regress-ext: 143 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \ 144 | $(with_temp_install) \ 145 | PGPORT='65432' \ 146 | PGHOST='127.0.0.1' \ 147 | PGUSER='$(USER)' \ 148 | ./pg_regress \ 149 | --bindir='' \ 150 | --use-existing \ 151 | --temp-config=$(abs_top_srcdir)/contrib/pg_pathman/conf.add \ 152 | --inputdir=$(abs_top_srcdir)/contrib/pg_pathman/ \ 153 | pathman_array_qual pathman_basic pathman_bgw pathman_calamity pathman_callbacks \ 154 | pathman_column_type pathman_cte pathman_domains pathman_dropped_cols pathman_expressions \ 155 | pathman_foreign_keys pathman_gaps pathman_inserts pathman_interval pathman_join_clause \ 156 | pathman_lateral pathman_hashjoin pathman_mergejoin pathman_only pathman_param_upd_del \ 157 | pathman_permissions pathman_rebuild_deletes pathman_rebuild_updates pathman_rowmarks \ 158 | pathman_runtime_nodes pathman_subpartitions pathman_update_node 
pathman_update_triggers \ 159 | pathman_upd_del pathman_utility_stmt pathman_views 160 | 161 | pg-regress: | start run-pg-regress 162 | pathman-regress: | start run-pathman-regress-ext stop 163 | installcheck: 164 | $(prove_installcheck) 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # multimaster 2 | 3 | multimaster is a Postgres extension with a set of core patches that turn the 4 | DBMS into a synchronous shared-nothing symmetric cluster providing high 5 | availability with strong consistency and read scalability. 6 | 7 | It offers the following benefits, some of which are not available in traditional streaming replication based solutions: 8 | * Fault tolerance and automatic node recovery 9 | * Fast failover 10 | * Both read and write transactions can be executed on any node. 11 | * Read scalability 12 | * Working with temporary tables on each cluster node 13 | * Online minor upgrades 14 | 15 | ## Documentation 16 | 17 | [current documentation](https://postgrespro.github.io/mmts/) 18 | 19 | Documentation for versions released with PostgresPro Enterprise can be found 20 | [here](https://postgrespro.ru/docs/enterprise/current/multimaster?lang=en). 21 | 22 | ## Building from source 23 | 24 | Since multimaster depends on core patches, both Postgres and extension must be compiled. The patched version (based on Postgres 13) is available [here](https://github.com/postgrespro/postgres_cluster/tree/rel13_mm_2). Follow the [documentation](https://www.postgresql.org/docs/current/installation.html) to build it. 
25 | 26 | Then enter the build directory and install the extension with 27 | ```shell 28 | cd contrib 29 | git clone https://github.com/postgrespro/mmts/ 30 | cd mmts 31 | make install 32 | ``` 33 | -------------------------------------------------------------------------------- /doc/multimaster_book.xml: -------------------------------------------------------------------------------- 1 | 5 | ]> 6 | 7 | 8 | 9 | multimaster Documentation 10 | &multimaster; 11 | 12 | 13 | -------------------------------------------------------------------------------- /doc/readme.md: -------------------------------------------------------------------------------- 1 | # Generating documentation 2 | ``` 3 | xmllint --noout --valid multimaster_book.xml 4 | xsltproc stylesheet.xsl multimaster_book.xml >multimaster.html 5 | ``` 6 | 7 | and don't forget to install the result on postgrespro.github.io: 8 | ``` 9 | cp multimaster.html stylesheet.css /mmts/ 10 | ``` -------------------------------------------------------------------------------- /doc/specs/.gitignore: -------------------------------------------------------------------------------- 1 | *.toolbox/ 2 | .ipynb_checkpoints/ 3 | -------------------------------------------------------------------------------- /doc/specs/MtmGenerations.cfg: -------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* SYMMETRY definition 10 | SYMMETRY perms 11 | \* CONSTANT definitions 12 | CONSTANT 13 | max_xacts = 3 14 | CONSTANT 15 | max_gen = 3 16 | \* INIT definition 17 | INIT 18 | Init 19 | \* NEXT definition 20 | NEXT 21 | Next 22 | \* INVARIANT definition 23 | INVARIANT 24 | OrderOk 25 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrent.cfg: 
-------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* CONSTANT definitions 10 | \* INIT definition 11 | INIT 12 | Init 13 | \* NEXT definition 14 | NEXT 15 | Next 16 | \* INVARIANT definition 17 | INVARIANT 18 | OrderOk 19 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrent.tla: -------------------------------------------------------------------------------- 1 | ---- MODULE MtmPrimitiveCurrent ---- 2 | 3 | \* Primitive (meaning immediate PREPARE everywhere and immediate recovery) 4 | \* but pretty close model of current multimaster. 5 | \* - There is an obvious sausage problem, shown by TLC. One of sort of its appearances 6 | \* is that we push xact into node without checking its state at all; xact is 7 | \* just appended to all nodes coordinator regards as 'enabled'. 8 | \* - Also 'works' only on 3 nodes because we recover from single node. 9 | \* - I don't see any reason for RECOVERED->ONLINE transition condition, 10 | \* and associated maintenance of walsenders/walreceivers masks. We can allow 11 | \* our xacts even just after recovery or (simpler for selecting xacts needing 12 | \* resolving) when majority is enabled. 13 | \* - I also don't see the point of recovery phase in RECOVERED|ONLINE: we don't pull 14 | \* all origins and thus it doesn't save us from sausage-like problems, 15 | \* but we still don't confirm xacts and don't allow parallel apply in it. 
16 | 17 | \* model depth constraint is hardcoded in do_tx 18 | 19 | EXTENDS Integers, Sequences, FiniteSets, TLC 20 | VARIABLES state, logs 21 | 22 | CONSTANT nodes 23 | 24 | n_nodes == Cardinality(nodes) 25 | 26 | 27 | \************************************************************************************** 28 | \* Helpers 29 | \************************************************************************************** 30 | 31 | \* is s1 subsequence of s2? 32 | IsSubSeq(s1, s2) == 33 | /\ Len(s1) <= Len(s2) 34 | /\ SubSeq(s2, 1, Len(s1)) = s1 35 | 36 | 37 | quorum(mask) == Cardinality({i \in DOMAIN mask : mask[i] = 1}) >= (n_nodes \div 2 + 1) 38 | 39 | max(set) == IF set = {} THEN 0 ELSE CHOOSE e1 \in set: \A e2 \in set: e1 >= e2 40 | 41 | maxlen(seqs) == max({Len(seqs[n]) : n \in DOMAIN seqs}) 42 | 43 | \* max lsn of given origin in given log 44 | maxlsn(log, origin) == max({log[j].olsn : j \in {i \in DOMAIN log : log[i].origin = origin }}) 45 | 46 | \* how far each node's changes are applied in given log? 
47 | rep_state(log) == [n \in nodes |-> maxlsn(log,n)] 48 | 49 | log_newer_than(log, origin_vec) == SelectSeq(log, LAMBDA e: e.olsn > origin_vec[e.origin]) 50 | 51 | \*is_increasing(s) == IF Len(s) > 1 52 | \* THEN {s[i] < s[i+1] : i \in 1..(Len(s)-1)} = {TRUE} 53 | \* ELSE TRUE 54 | 55 | \* returns not just new status but record with new state because masks might change 56 | \* old status is taken from state[n] 57 | new_state(n, view, enabled, wsndmask, wrcvmask) == 58 | LET 59 | old_status == state[n].status 60 | new_status == CASE 61 | \* This is hardly needed; safety won't be altered if we are in recovery 62 | \* with less than majority in view mask 63 | ~ quorum(view) -> "disabled" 64 | [] quorum(view) /\ old_status = "disabled" -> "recovery" 65 | \* recovery -> recovered done explicitly in do_recovery() 66 | [] quorum(view) /\ old_status = "recovered" /\ view = enabled /\ view = wsndmask /\ view = wrcvmask -> "online" 67 | \* I don't think we need that, nothing should be prepared with minority enabled anyway 68 | [] quorum(view) /\ old_status = "online" /\ ~quorum(enabled) -> "disabled" 69 | [] OTHER -> old_status 70 | \* all zeros but me 71 | zeros == [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 72 | new_enabled == IF new_status = "disabled" THEN zeros ELSE enabled 73 | new_wsndmask == IF new_status = "disabled" THEN zeros ELSE wsndmask 74 | new_wrcvmask == IF new_status = "disabled" THEN zeros ELSE wrcvmask 75 | IN 76 | \* next_lsn goes unchanged 77 | [state[n] EXCEPT !.status = new_status, 78 | !.view = view, 79 | !.enabled = new_enabled, 80 | !.walsenders = new_wsndmask, 81 | !.walreceivers = new_wrcvmask] 82 | 83 | 84 | \************************************************************************************** 85 | \* Initial 86 | \************************************************************************************** 87 | 88 | 89 | Init == /\ state = [n \in nodes |-> [ 90 | next_lsn |-> 1, 91 | status |-> "disabled", 92 | view |-> [[_n \in nodes |-> 0] EXCEPT ![n] 
= 1], 93 | enabled |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 94 | walsenders |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 95 | walreceivers |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 96 | ]] 97 | /\ logs = [n \in nodes |-> << >>] 98 | 99 | \************************************************************************************** 100 | \* Actions 101 | \************************************************************************************** 102 | 103 | 104 | \* n1 disconnects n2 105 | disconnect(n1, n2) == 106 | /\ n1 /= n2 107 | /\ state[n1].view[n2] = 1 108 | 109 | /\ logs' = logs 110 | /\ LET 111 | view == [state[n1].view EXCEPT ![n2] = 0] 112 | enabled == [state[n1].enabled EXCEPT ![n2] = 0] 113 | n1_state == new_state(n1, view, enabled, state[n1].walsenders, state[n2].walreceivers) 114 | IN 115 | state' = [state EXCEPT ![n1] = n1_state] 116 | 117 | 118 | connect(n1, n2) == 119 | /\ n1 /= n2 120 | /\ state[n1].view[n2] = 0 121 | 122 | /\ logs' = logs 123 | /\ LET 124 | view == [state[n1].view EXCEPT ![n2] = 1] 125 | n1_state == new_state(n1, view, state[n1].enabled, state[n1].walsenders, state[n1].walreceivers) 126 | IN 127 | state' = [state EXCEPT ![n1] = n1_state] 128 | 129 | \* n1 recovers from n2 130 | do_recovery(n1, n2) == 131 | /\ n1 /= n2 132 | /\ state[n1].status = "recovery" 133 | /\ state[n1].view[n2] = 1 134 | \* Apparently this ensures we won't keep dead node as enabled 135 | /\ state[n2].view[n1] = 1 136 | 137 | /\ LET 138 | origin_vec == rep_state(logs[n1]) 139 | new_entries == log_newer_than(logs[n2], origin_vec) 140 | \* enable n1 141 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 142 | n2_state == new_state(n2, state[n2].view, n2_enabled, state[n2].walsenders, state[n2].walreceivers) 143 | IN 144 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 145 | /\ state' = [state EXCEPT ![n1].status = "recovered", 146 | ![n2] = n2_state] 147 | 148 | 149 | do_recovered(n1, n2) == 150 | /\ n1 /= n2 151 | /\ (state[n1].status = "recovered" \/ 
state[n1].status = "online") 152 | /\ state[n1].view[n2] = 1 153 | /\ state[n2].view[n1] = 1 154 | 155 | /\ LET 156 | our_last_lsn == maxlsn(logs[n1], n2) 157 | new_entries == SelectSeq(logs[n2], LAMBDA e: e.origin = n2 /\ e.olsn > our_last_lsn ) 158 | IN 159 | logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 160 | /\ LET 161 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 162 | n2_walsenders == [state[n2].walsenders EXCEPT ![n1] = 1] 163 | n2_state == new_state(n2, state[n2].view, n2_enabled, n2_walsenders, state[n2].walreceivers) 164 | n1_walreceivers == [state[n1].walreceivers EXCEPT ![n2] = 1] 165 | n1_state == new_state(n1, state[n1].view, state[n1].enabled, state[n1].walsenders, n1_walreceivers) 166 | IN 167 | state' = [state EXCEPT ![n1] = n1_state, 168 | ![n2] = n2_state] 169 | 170 | 171 | do_tx(node) == 172 | \* model depth constraint 173 | /\ Len(logs[node]) <= 4 174 | /\ state[node].status = "online" 175 | /\ quorum(state[node].enabled) 176 | /\ logs' = [n \in nodes |-> 177 | IF state[node].enabled[n] = 1 178 | THEN Append(logs[n], [origin |-> node, olsn |-> state[node].next_lsn]) 179 | ELSE logs[n]] 180 | /\ state' = [state EXCEPT ![node].next_lsn = state[node].next_lsn + 1] 181 | 182 | 183 | \************************************************************************************** 184 | \* Final spec 185 | \************************************************************************************** 186 | 187 | 188 | Next == \/ \E n1,n2 \in nodes : connect(n1,n2) 189 | \/ \E n1,n2 \in nodes : disconnect(n1,n2) 190 | \/ \E n1,n2 \in nodes : do_recovery(n1,n2) 191 | \/ \E n1,n2 \in nodes : do_recovered(n1,n2) 192 | \/ \E n \in nodes : do_tx(n) 193 | 194 | spec == Init /\ [][Next]_<> 195 | 196 | 197 | \************************************************************************************** 198 | \* Stuff to check 199 | \************************************************************************************** 200 | 201 | \* Make sure every log is sublog of the 
longest one 202 | OrderOk == 203 | LET 204 | most_advanced_node == CHOOSE n1 \in nodes: \A n2 \in nodes: Len(logs[n1]) >= Len(logs[n2]) 205 | IN 206 | \A n \in nodes: IsSubSeq(logs[n], logs[most_advanced_node]) 207 | 208 | ==== -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrentMasks.cfg: -------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* CONSTANT definitions 10 | \* INIT definition 11 | INIT 12 | Init 13 | \* NEXT definition 14 | NEXT 15 | Next 16 | \* INVARIANT definition 17 | INVARIANT 18 | OrderOk 19 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrentMasks.tla: -------------------------------------------------------------------------------- 1 | ---- MODULE MtmPrimitiveCurrentMasks ---- 2 | 3 | \* This just adds to MtmPrimitiveCurrent.tla tracking of enabled masks: while 4 | \* doing xact coordinator stamps it with current enabled mask. Others apply it 5 | \* in normal mode iff their enabled mask is exactly the same. TLC demonstrates 6 | \* here that we still have a problem because in do_recovered we ask to enable us 7 | \* without pulling all origins. 8 | 9 | \* model depth constraint is hardcoded in do_tx 10 | 11 | EXTENDS Integers, Sequences, FiniteSets, TLC 12 | VARIABLES state, logs 13 | 14 | CONSTANT nodes 15 | 16 | n_nodes == Cardinality(nodes) 17 | 18 | 19 | \************************************************************************************** 20 | \* Helpers 21 | \************************************************************************************** 22 | 23 | \* is s1 subsequence of s2? 
24 | IsSubSeq(s1, s2) == 25 | /\ Len(s1) <= Len(s2) 26 | /\ SubSeq(s2, 1, Len(s1)) = s1 27 | 28 | 29 | quorum(mask) == Cardinality({i \in DOMAIN mask : mask[i] = 1}) >= (n_nodes \div 2 + 1) 30 | 31 | max(set) == IF set = {} THEN 0 ELSE CHOOSE e1 \in set: \A e2 \in set: e1 >= e2 32 | 33 | maxlen(seqs) == max({Len(seqs[n]) : n \in DOMAIN seqs}) 34 | 35 | \* max lsn of given origin in given log 36 | maxlsn(log, origin) == max({log[j].olsn : j \in {i \in DOMAIN log : log[i].origin = origin }}) 37 | 38 | \* how far each node's changes are applied in given log? 39 | rep_state(log) == [n \in nodes |-> maxlsn(log,n)] 40 | 41 | log_newer_than(log, origin_vec) == SelectSeq(log, LAMBDA e: e.olsn > origin_vec[e.origin]) 42 | 43 | \*is_increasing(s) == IF Len(s) > 1 44 | \* THEN {s[i] < s[i+1] : i \in 1..(Len(s)-1)} = {TRUE} 45 | \* ELSE TRUE 46 | 47 | \* returns not just new status but record with new state because masks might change 48 | \* old status is taken from state[n] 49 | new_state(n, view, enabled, wsndmask, wrcvmask) == 50 | LET 51 | old_status == state[n].status 52 | new_status == CASE 53 | \* This is hardly needed; safety won't be altered if we are in recovery 54 | \* with less than majority in view mask 55 | ~ quorum(view) -> "disabled" 56 | [] quorum(view) /\ old_status = "disabled" -> "recovery" 57 | \* recovery -> recovered done explicitly in do_recovery() 58 | [] quorum(view) /\ old_status = "recovered" /\ view = enabled /\ view = wsndmask /\ view = wrcvmask -> "online" 59 | \* I don't think we need that, nothing should be prepared with minority enabled anyway 60 | [] quorum(view) /\ old_status = "online" /\ ~quorum(enabled) -> "disabled" 61 | [] OTHER -> old_status 62 | \* all zeros but me 63 | zeros == [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 64 | new_enabled == IF new_status = "disabled" THEN zeros ELSE enabled 65 | new_wsndmask == IF new_status = "disabled" THEN zeros ELSE wsndmask 66 | new_wrcvmask == IF new_status = "disabled" THEN zeros ELSE wrcvmask 67 | 
IN 68 | \* next_lsn goes unchanged 69 | [state[n] EXCEPT !.status = new_status, 70 | !.view = view, 71 | !.enabled = new_enabled, 72 | !.walsenders = new_wsndmask, 73 | !.walreceivers = new_wrcvmask] 74 | 75 | 76 | \************************************************************************************** 77 | \* Initial 78 | \************************************************************************************** 79 | 80 | 81 | Init == /\ state = [n \in nodes |-> [ 82 | next_lsn |-> 1, 83 | status |-> "disabled", 84 | view |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 85 | enabled |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 86 | walsenders |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 87 | walreceivers |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 88 | ]] 89 | /\ logs = [n \in nodes |-> << >>] 90 | 91 | \************************************************************************************** 92 | \* Actions 93 | \************************************************************************************** 94 | 95 | 96 | \* n1 disconnects n2 97 | disconnect(n1, n2) == 98 | /\ n1 /= n2 99 | /\ state[n1].view[n2] = 1 100 | 101 | /\ logs' = logs 102 | /\ LET 103 | view == [state[n1].view EXCEPT ![n2] = 0] 104 | enabled == [state[n1].enabled EXCEPT ![n2] = 0] 105 | n1_state == new_state(n1, view, enabled, state[n1].walsenders, state[n2].walreceivers) 106 | IN 107 | state' = [state EXCEPT ![n1] = n1_state] 108 | 109 | 110 | connect(n1, n2) == 111 | /\ n1 /= n2 112 | /\ state[n1].view[n2] = 0 113 | 114 | /\ logs' = logs 115 | /\ LET 116 | view == [state[n1].view EXCEPT ![n2] = 1] 117 | n1_state == new_state(n1, view, state[n1].enabled, state[n1].walsenders, state[n1].walreceivers) 118 | IN 119 | state' = [state EXCEPT ![n1] = n1_state] 120 | 121 | \* n1 recovers from n2 122 | do_recovery(n1, n2) == 123 | /\ n1 /= n2 124 | /\ state[n1].status = "recovery" 125 | /\ state[n1].view[n2] = 1 126 | \* Apparently this ensures we won't keep dead node as enabled 127 | /\ state[n2].view[n1] = 1 128 | 
129 | /\ LET 130 | origin_vec == rep_state(logs[n1]) 131 | new_entries == log_newer_than(logs[n2], origin_vec) 132 | \* enable n1 133 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 134 | n2_state == new_state(n2, state[n2].view, n2_enabled, state[n2].walsenders, state[n2].walreceivers) 135 | IN 136 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 137 | /\ state' = [state EXCEPT ![n1].status = "recovered", 138 | ![n2] = n2_state] 139 | 140 | 141 | do_recovered(n1, n2) == 142 | /\ n1 /= n2 143 | /\ (state[n1].status = "recovered" \/ state[n1].status = "online") 144 | /\ state[n1].view[n2] = 1 145 | /\ state[n2].view[n1] = 1 146 | 147 | /\ LET 148 | our_last_lsn == maxlsn(logs[n1], n2) 149 | new_entries == SelectSeq(logs[n2], LAMBDA e: e.origin = n2 /\ e.olsn > our_last_lsn ) 150 | IN 151 | /\ \A k \in DOMAIN new_entries: new_entries[k].participants = state[n1].enabled 152 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 153 | /\ LET 154 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 155 | n2_walsenders == [state[n2].walsenders EXCEPT ![n1] = 1] 156 | n2_state == new_state(n2, state[n2].view, n2_enabled, n2_walsenders, state[n2].walreceivers) 157 | n1_walreceivers == [state[n1].walreceivers EXCEPT ![n2] = 1] 158 | n1_state == new_state(n1, state[n1].view, state[n1].enabled, state[n1].walsenders, n1_walreceivers) 159 | IN 160 | state' = [state EXCEPT ![n1] = n1_state, 161 | ![n2] = n2_state] 162 | 163 | 164 | do_tx(node) == 165 | \* model depth constraint 166 | /\ Len(logs[node]) <= 4 167 | /\ state[node].status = "online" 168 | /\ quorum(state[node].enabled) 169 | \* make sure set of enabled nodes is the same on all participants 170 | /\ \A n \in nodes: state[node].enabled[n] = 0 \/ state[n].enabled = state[node].enabled 171 | /\ logs' = [n \in nodes |-> 172 | IF state[node].enabled[n] = 1 173 | THEN Append(logs[n], [origin |-> node, olsn |-> state[node].next_lsn, participants |-> state[node].enabled]) 174 | ELSE logs[n]] 175 | /\ 
state' = [state EXCEPT ![node].next_lsn = state[node].next_lsn + 1] 176 | 177 | 178 | \************************************************************************************** 179 | \* Final spec 180 | \************************************************************************************** 181 | 182 | 183 | Next == \/ \E n1,n2 \in nodes : connect(n1,n2) 184 | \/ \E n1,n2 \in nodes : disconnect(n1,n2) 185 | \/ \E n1,n2 \in nodes : do_recovery(n1,n2) 186 | \/ \E n1,n2 \in nodes : do_recovered(n1,n2) 187 | \/ \E n \in nodes : do_tx(n) 188 | 189 | spec == Init /\ [][Next]_<> 190 | 191 | 192 | \************************************************************************************** 193 | \* Stuff to check 194 | \************************************************************************************** 195 | 196 | \* Make sure every log is sublog of the longest one 197 | OrderOk == 198 | LET 199 | most_advanced_node == CHOOSE n1 \in nodes: \A n2 \in nodes: Len(logs[n1]) >= Len(logs[n2]) 200 | IN 201 | \A n \in nodes: IsSubSeq(logs[n], logs[most_advanced_node]) 202 | 203 | ==== -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrentMasksFixed.cfg: -------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* SYMMETRY definition 10 | SYMMETRY perms 11 | \* CONSTANT definitions 12 | CONSTANT 13 | depth = 3 14 | \* INIT definition 15 | INIT 16 | Init 17 | \* NEXT definition 18 | NEXT 19 | Next 20 | \* INVARIANT definition 21 | INVARIANT 22 | OrderOk 23 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/commit.cfg: -------------------------------------------------------------------------------- 1 | SPECIFICATION spec 2 | INVARIANTS consistency types_correct1 
types_correct2 -------------------------------------------------------------------------------- /doc/stylesheet.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&subset=cyrillic'); 2 | 3 | body { 4 | font-family: 'Roboto',Arial,sans-serif; 5 | } 6 | 7 | body { 8 | font-size: 18px; 9 | font-weight: 300; 10 | } 11 | 12 | /* ../media/css/docs.css */ 13 | .navheader th { text-align: center; } /* anti-bootstrap */ 14 | 15 | .navheader tbody tr:nth-child(1) th { /* временно убрать ненужную строчку */ 16 | display: none; 17 | } 18 | 19 | /* PostgreSQL.org Documentation Style */ 20 | 21 | .book div.NAVHEADER table { 22 | margin-left: 0; 23 | } 24 | 25 | .book div.NAVHEADER th { 26 | text-align: center; 27 | } 28 | 29 | .book { 30 | font-size: 15px; 31 | line-height: 1.6; 32 | } 33 | 34 | /* Heading Definitions */ 35 | 36 | .book h1, 37 | .book h2, 38 | .book h3 { 39 | font-weight: bold; 40 | margin-top: 2ex; 41 | } 42 | 43 | .book h1 a, 44 | .book h2 a, 45 | .book h3 a, 46 | .book h4 a 47 | { 48 | color: #EC5800; 49 | } 50 | 51 | /* EKa --> */ 52 | .book h1 { 53 | font-size: 1.4em; 54 | } 55 | 56 | .book h2 { 57 | font-size: 1.25em; 58 | } 59 | 60 | .book h3 { 61 | font-size: 1.2em; 62 | } 63 | 64 | .book h4 { 65 | font-size: 1.15em; 66 | } 67 | 68 | .book h5 { 69 | font-size: 1.1em; 70 | } 71 | 72 | .book h6 { 73 | font-size: 1.0em; 74 | } 75 | /* <-- EKa */ 76 | 77 | .book h1 a:hover { 78 | color: #EC5800; 79 | text-decoration: none; 80 | } 81 | 82 | .book h2 a:hover, 83 | .book h3 a:hover, 84 | .book h4 a:hover { 85 | color: #666666; 86 | text-decoration: none; 87 | } 88 | 89 | 90 | 91 | /* Text Styles */ 92 | 93 | .book div.SECT2 { 94 | margin-top: 4ex; 95 | } 96 | 97 | .book div.SECT3 { 98 | margin-top: 3ex; 99 | margin-left: 3ex; 100 | } 101 | 102 | .book .txtCurrentLocation { 103 | font-weight: bold; 104 | } 105 | 106 | .book p, 107 | .book ol, 108 | 
.book ul, 109 | .book li { 110 | line-height: 1.5em; 111 | } 112 | 113 | .book code { 114 | font-size: 1em; 115 | padding: 0px; 116 | color: #525f6c; 117 | background-color: #FFF; 118 | border-radius: 0px; 119 | } 120 | 121 | .book code, kbd, pre, samp { 122 | font-family: monospace,monospace; 123 | } 124 | 125 | .book .txtCommentsWrap { 126 | border: 2px solid #F5F5F5; 127 | width: 100%; 128 | } 129 | 130 | .book .txtCommentsContent { 131 | background: #F5F5F5; 132 | padding: 3px; 133 | } 134 | 135 | .book .txtCommentsPoster { 136 | float: left; 137 | } 138 | 139 | .book .txtCommentsDate { 140 | float: right; 141 | } 142 | 143 | .book .txtCommentsComment { 144 | padding: 3px; 145 | } 146 | 147 | .book #docContainer pre code, 148 | .book #docContainer pre tt, 149 | .book #docContainer pre pre, 150 | .book #docContainer tt tt, 151 | .book #docContainer tt code, 152 | .book #docContainer tt pre { 153 | font-size: 1em; 154 | } 155 | 156 | .book pre.LITERALLAYOUT, 157 | .book .SCREEN, 158 | .book .SYNOPSIS, 159 | .book .PROGRAMLISTING, 160 | .book .REFSYNOPSISDIV p, 161 | .book table.CAUTION, 162 | .book table.WARNING, 163 | .book blockquote.NOTE, 164 | .book blockquote.TIP, 165 | .book div.note, 166 | .book div.tip, 167 | .book table.CALSTABLE { 168 | -moz-box-shadow: 3px 3px 5px #DFDFDF; 169 | -webkit-box-shadow: 3px 3px 5px #DFDFDF; 170 | -khtml-box-shadow: 3px 3px 5px #DFDFDF; 171 | -o-box-shadow: 3px 3px 5px #DFDFDF; 172 | box-shadow: 3px 3px 5px #DFDFDF; 173 | } 174 | 175 | .book pre.LITERALLAYOUT, 176 | .book .SCREEN, 177 | .book .SYNOPSIS, 178 | .book .PROGRAMLISTING, 179 | .book .REFSYNOPSISDIV p, 180 | .book table.CAUTION, 181 | .book table.WARNING, 182 | .book blockquote.NOTE, 183 | .book blockquote.TIP, /* fixed: missing comma made this a descendant selector */ 184 | .book div.note, 185 | .book div.tip { 186 | color: black; 187 | border-width: 1px; 188 | border-style: solid; 189 | padding: 2ex; 190 | margin: 2ex 0 2ex 2ex; 191 | overflow: auto; 192 | -moz-border-radius: 8px; 193 | -webkit-border-radius: 8px; 194 |
-khtml-border-radius: 8px; 195 | border-radius: 8px; 196 | } 197 | 198 | .book div.note, 199 | .book div.tip { 200 | -moz-border-radius: 8px !important; 201 | -webkit-border-radius: 8px !important; 202 | -khtml-border-radius: 8px !important; 203 | border-radius: 8px !important; 204 | } 205 | 206 | 207 | .book pre.LITERALLAYOUT, 208 | .book pre.SYNOPSIS, 209 | .book pre.PROGRAMLISTING, 210 | .book .REFSYNOPSISDIV p, 211 | .book .SCREEN { 212 | border-color: #CFCFCF; 213 | background-color: #F7F7F7; 214 | } 215 | 216 | .book blockquote.NOTE, 217 | .book blockquote.TIP, 218 | .book div.note, 219 | .book div.tip { 220 | border-color: #DBDBCC; 221 | background-color: #EEEEDD; 222 | padding: 14px; 223 | width: 572px; 224 | /* font-size: 12px; */ 225 | } 226 | 227 | .book blockquote.NOTE, 228 | .book blockquote.TIP, 229 | .book table.CAUTION, 230 | .book table.WARNING { 231 | margin: 4ex auto; 232 | } 233 | 234 | .book div.note, 235 | .book div.tip { 236 | margin: 4ex auto !important; 237 | } 238 | 239 | 240 | .book blockquote.NOTE p, 241 | .book blockquote.TIP p, 242 | .book div.note p, 243 | .book div.tip p { 244 | margin: 0; 245 | } 246 | 247 | .book blockquote.NOTE pre, 248 | .book blockquote.NOTE code, 249 | .book div.note pre, 250 | .book div.note code, 251 | .book blockquote.TIP pre, 252 | .book blockquote.TIP code, 253 | .book div.tip pre, 254 | .book div.tip code { /* fixed typo: was div.tio */ 255 | margin-left: 0; 256 | margin-right: 0; 257 | -moz-box-shadow: none; 258 | -webkit-box-shadow: none; 259 | -khtml-box-shadow: none; 260 | -o-box-shadow: none; 261 | box-shadow: none; 262 | } 263 | 264 | .book .emphasis, 265 | .book .c2 { 266 | font-weight: bold; 267 | } 268 | 269 | .book .REPLACEABLE { 270 | font-style: italic; 271 | } 272 | 273 | /* Table Styles */ 274 | 275 | .book table { 276 | margin-left: 2ex; 277 | } 278 | 279 | .book table.CALSTABLE td, 280 | .book table.CALSTABLE th, 281 | .book table.CAUTION td, 282 | .book table.CAUTION th, 283 | .book table.WARNING td, 284 | .book
table.WARNING th { 285 | border-style: solid; 286 | } 287 | 288 | .book table.CALSTABLE, 289 | .book table.CAUTION, 290 | .book table.WARNING { 291 | border-spacing: 0; 292 | border-collapse: collapse; 293 | } 294 | 295 | .book table.CALSTABLE 296 | { 297 | margin: 2ex 0 2ex 2ex; 298 | background-color: #E0ECEF; 299 | border: 2px solid #A7C6DF; 300 | } 301 | 302 | .book table.CALSTABLE tr:hover td 303 | { 304 | background-color: #EFEFEF; 305 | } 306 | 307 | .book table.CALSTABLE td { 308 | background-color: #FFF; 309 | } 310 | 311 | .book table.CALSTABLE td, 312 | .book table.CALSTABLE th { 313 | border: 1px solid #A7C6DF; 314 | padding: 0.5ex 0.5ex; 315 | } 316 | 317 | table.CAUTION, 318 | .book table.WARNING { 319 | border-collapse: separate; 320 | display: block; 321 | padding: 0; 322 | max-width: 600px; 323 | } 324 | 325 | .book table.CAUTION { 326 | background-color: #F5F5DC; 327 | border-color: #DEDFA7; 328 | } 329 | 330 | .book table.WARNING { 331 | background-color: #FFD7D7; 332 | border-color: #DF421E; 333 | } 334 | 335 | .book table.CAUTION td, 336 | .book table.CAUTION th, 337 | .book table.WARNING td, 338 | .book table.WARNING th { 339 | border-width: 0; 340 | padding-left: 2ex; 341 | padding-right: 2ex; 342 | } 343 | 344 | .book table.CAUTION td, 345 | .book table.CAUTION th { 346 | border-color: #F3E4D5 347 | } 348 | 349 | .book table.WARNING td, 350 | .book table.WARNING th { 351 | border-color: #FFD7D7; 352 | } 353 | 354 | .book td.c1, 355 | .book td.c2, 356 | .book td.c3, 357 | .book td.c4, 358 | .book td.c5, 359 | .book td.c6 { 360 | font-size: 1.1em; 361 | font-weight: bold; 362 | border-bottom: 0px solid #FFEFEF; 363 | padding: 1ex 2ex 0; 364 | } 365 | 366 | .book .table thead { 367 | background: #E0ECEF; 368 | border-bottom: 1px solid #000; 369 | } 370 | .book .table > thead > tr > th { 371 | border-bottom: 1px solid #000; 372 | } 373 | 374 | .book td, th { 375 | padding: 0.1ex 0.5ex; 376 | } 377 | 378 | .book .book table tr:hover td { 379 | 
background-color: #EFEFEF; 380 | } 381 | 382 | /* Link Styles */ 383 | 384 | .book #docNav a { 385 | font-weight: bold; 386 | } 387 | 388 | .book code.FUNCTION tt { 389 | font-size: 1em; 390 | } 391 | 392 | .book table.docs-compare { 393 | align: center; 394 | width: 90%; 395 | border: 2px solid #999; 396 | border-collapse: collapse; 397 | } 398 | 399 | .book table.docs-compare td { 400 | padding: 12px; 401 | border: 1px solid #DDD; 402 | } 403 | 404 | .book dd { 405 | margin-left: 40px; 406 | } 407 | 408 | 409 | .book .sidebar { 410 | padding: 8px; 411 | background: #FFF; 412 | width: auto; 413 | } 414 | 415 | .book pre { 416 | background: #f5f5f5; 417 | padding: 10px; 418 | border: 1px solid #ccc; 419 | border-radius: 4px; 420 | } 421 | -------------------------------------------------------------------------------- /doc/stylesheet.xsl: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 1 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | sect1 toc 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /expected/atx.out: -------------------------------------------------------------------------------- 1 | create table atx_test1(a text); 2 | -- check that commit of autonomous tx will not steal locks from parent tx 3 | begin; 4 | insert into atx_test1 values (1); 5 | select count(*) from pg_locks where transactionid=txid_current(); 6 | count 7 | ------- 8 | 1 9 | (1 row) 10 | 11 | begin autonomous; 12 | insert into atx_test1 values (1); 13 | select count(*) from pg_locks where transactionid=txid_current(); 14 | count 15 | ------- 16 | 1 17 | (1 row) 18 | 19 | commit; 20 | -- here we still should see our lock 21 | select count(*) from pg_locks where transactionid=txid_current(); 22 | count 23 | ------- 24 | 1 25 | (1 row) 26 | 27 | commit; 28 | drop table atx_test1; 29 | -------------------------------------------------------------------------------- 
/multimaster.control: -------------------------------------------------------------------------------- 1 | comment = 'Multimaster' 2 | default_version = '1.0' 3 | module_pathname = '$libdir/multimaster' 4 | schema = mtm 5 | relocatable = false 6 | -------------------------------------------------------------------------------- /referee/Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = referee 2 | DATA = referee--1.0.sql 3 | REGRESS = referee 4 | 5 | ifdef USE_PGXS 6 | PG_CONFIG = pg_config 7 | PGXS := $(shell $(PG_CONFIG) --pgxs) 8 | include $(PGXS) 9 | else 10 | subdir = contrib/mmts/referee 11 | top_builddir = ../../../ 12 | include $(top_builddir)/src/Makefile.global 13 | include $(top_srcdir)/contrib/contrib-global.mk 14 | endif 15 | -------------------------------------------------------------------------------- /referee/expected/referee.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION referee; 2 | SELECT * FROM referee.decision; 3 | key | node_id | gen_num 4 | -----+---------+--------- 5 | (0 rows) 6 | 7 | SELECT referee.request_grant(1, 7); 8 | request_grant 9 | --------------- 10 | 11 | (1 row) 12 | 13 | -- node can get its grant reissued 14 | SELECT referee.request_grant(1, 9); 15 | request_grant 16 | --------------- 17 | 18 | (1 row) 19 | 20 | -- but another can't get it while the previous is not cleared 21 | SELECT referee.request_grant(2, 4); 22 | ERROR: grant was already issued to node 1 in generation 9 23 | CONTEXT: PL/pgSQL function request_grant(integer,bigint) line 19 at RAISE 24 | SELECT referee.request_grant(2, 10); 25 | ERROR: grant was already issued to node 1 in generation 9 26 | CONTEXT: PL/pgSQL function request_grant(integer,bigint) line 19 at RAISE 27 | SELECT * FROM referee.decision; 28 | key | node_id | gen_num 29 | --------+---------+--------- 30 | winner | 1 | 9 31 | (1 row) 32 | 33 | DELETE FROM referee.decision WHERE 
gen_num < 8 OR (node_id = 1 AND gen_num <= 9); 34 | -- surely 2 node can acquire the grant after removal of the old one 35 | SELECT referee.request_grant(2, 11); 36 | request_grant 37 | --------------- 38 | 39 | (1 row) 40 | 41 | SELECT * FROM referee.decision; 42 | key | node_id | gen_num 43 | --------+---------+--------- 44 | winner | 2 | 11 45 | (1 row) 46 | 47 | -------------------------------------------------------------------------------- /referee/referee--1.0.sql: -------------------------------------------------------------------------------- 1 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 2 | \echo Use "CREATE EXTENSION referee" to load this file. \quit 3 | 4 | CREATE TABLE IF NOT EXISTS referee.decision( 5 | key text PRIMARY KEY NOT NULL, 6 | node_id int, 7 | -- storing gen_num here guarantees we clear (delete) the grant which is 8 | -- indeed can already be cleared instead of accidently removing newer one 9 | gen_num bigint 10 | ); 11 | 12 | -- returns nothing on success, bails out with ERROR on conflict 13 | CREATE OR REPLACE FUNCTION referee.request_grant(applicant_id int, gen_num bigint) RETURNS void AS 14 | $$ 15 | DECLARE 16 | winner_id int; 17 | winner_gen_num bigint; 18 | BEGIN 19 | INSERT INTO referee.decision AS d VALUES ('winner', applicant_id, gen_num) 20 | ON CONFLICT (key) DO UPDATE SET 21 | node_id=EXCLUDED.node_id, gen_num=EXCLUDED.gen_num 22 | -- reissue grant iff it was previously given to this node, not another 23 | WHERE d.node_id = EXCLUDED.node_id AND 24 | -- this could be assert as well, node never repeats request with the same 25 | -- gen num 26 | d.gen_num < EXCLUDED.gen_num 27 | RETURNING applicant_id INTO winner_id; 28 | -- if insertion hasn't happened, there must have been conflict with existing 29 | -- grant 30 | IF winner_id IS NULL THEN 31 | SELECT d.node_id, d.gen_num INTO winner_id, winner_gen_num FROM referee.decision d; 32 | RAISE EXCEPTION 'grant was already issued to node % in generation 
%', winner_id, winner_gen_num; 33 | END IF; 34 | END 35 | $$ LANGUAGE plpgsql; 36 | 37 | CREATE OR REPLACE FUNCTION referee.clean() RETURNS bool AS 38 | $$ 39 | BEGIN 40 | delete from referee.decision where key = 'winner'; 41 | return 'true'; 42 | END 43 | $$ LANGUAGE plpgsql; 44 | -------------------------------------------------------------------------------- /referee/referee.control: -------------------------------------------------------------------------------- 1 | comment = 'Multimaster referee' 2 | default_version = '1.0' 3 | module_pathname = '$libdir/referee' 4 | schema = referee 5 | relocatable = false 6 | -------------------------------------------------------------------------------- /referee/sql/referee.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION referee; 2 | 3 | SELECT * FROM referee.decision; 4 | 5 | SELECT referee.request_grant(1, 7); 6 | -- node can get its grant reissued 7 | SELECT referee.request_grant(1, 9); 8 | -- but another can't get it while the previous is not cleared 9 | SELECT referee.request_grant(2, 4); 10 | SELECT referee.request_grant(2, 10); 11 | SELECT * FROM referee.decision; 12 | 13 | DELETE FROM referee.decision WHERE gen_num < 8 OR (node_id = 1 AND gen_num <= 9); 14 | -- surely 2 node can acquire the grant after removal of the old one 15 | SELECT referee.request_grant(2, 11); 16 | SELECT * FROM referee.decision; 17 | -------------------------------------------------------------------------------- /run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use File::Basename; 4 | use Getopt::Long; 5 | BEGIN { unshift @INC, '.'; unshift @INC, '../../src/test/perl' } 6 | use Cluster; 7 | 8 | my $n_nodes = 3; 9 | my $referee = 0; 10 | my $action = 'start'; 11 | GetOptions ("nnodes=i" => \$n_nodes, # numeric 12 | "referee" => \$referee, # flag 13 | "action=s" => \$action); # strings 14 | # referee works only 
with 2 nodes 15 | if ($referee) 16 | { 17 | $n_nodes = 2; 18 | } 19 | 20 | if ($action eq "start") 21 | { 22 | $Cluster::last_port_assigned = 65431; 23 | 24 | my $cluster = new Cluster($n_nodes, $referee); 25 | $cluster->init(); 26 | $cluster->start(); 27 | $cluster->create_mm('regression'); 28 | 29 | # prevent PostgresNode.pm from shutting down nodes on exit in END {} 30 | @PostgresNode::all_nodes = (); 31 | } 32 | elsif ($action eq "stop") 33 | { 34 | my @datas = <$TestLib::tmp_check/*data>; 35 | foreach my $data (@datas) { 36 | TestLib::system_log('pg_ctl', 37 | '-D', "$data/pgdata", 38 | '-m', 'fast', 39 | 'stop'); 40 | } 41 | } 42 | else 43 | { 44 | die("Usage: run.pl action= [opts]\n"); 45 | } 46 | -------------------------------------------------------------------------------- /sql/atx.sql: -------------------------------------------------------------------------------- 1 | create table atx_test1(a text); 2 | 3 | 4 | -- check that commit of autonomous tx will not steal locks from parent tx 5 | begin; 6 | insert into atx_test1 values (1); 7 | select count(*) from pg_locks where transactionid=txid_current(); 8 | begin autonomous; 9 | insert into atx_test1 values (1); 10 | select count(*) from pg_locks where transactionid=txid_current(); 11 | commit; 12 | -- here we still should see our lock 13 | select count(*) from pg_locks where transactionid=txid_current(); 14 | commit; 15 | 16 | drop table atx_test1; 17 | -------------------------------------------------------------------------------- /src/bkb.c: -------------------------------------------------------------------------------- 1 | /* 2 | * bkb.c 3 | * 4 | * Bron–Kerbosch algorithm to find maximum clique in a graph. 
5 | * 6 | * Copyright (c) 2017-2021, Postgres Professional 7 | * 8 | */ 9 | #ifndef TEST 10 | #include "bkb.h" 11 | 12 | #else 13 | #include 14 | #include 15 | #define Assert(expr) assert(expr) 16 | typedef uint64_t nodemask_t; 17 | #define MAX_NODES 64 18 | #define BIT_CHECK(mask, bit) (((mask) & ((nodemask_t)1 << (bit))) != 0) 19 | #define BIT_SET(mask, bit) (mask |= ((nodemask_t)1 << (bit))) 20 | #endif 21 | 22 | typedef struct { 23 | int size; 24 | int nodes[MAX_NODES]; 25 | } NodeList; 26 | 27 | static void 28 | _list_append(NodeList* list, int n) 29 | { 30 | list->nodes[list->size++] = n; 31 | } 32 | 33 | static void 34 | _list_copy(NodeList* dst, NodeList const* src) 35 | { 36 | int i; 37 | int n = src->size; 38 | dst->size = n; 39 | for (i = 0; i < n; i++) { 40 | dst->nodes[i] = src->nodes[i]; 41 | } 42 | } 43 | 44 | static nodemask_t 45 | _list_to_nodemask(NodeList *list) 46 | { 47 | nodemask_t res = 0; 48 | int i; 49 | 50 | for (i = 0; i < list->size; i++) 51 | BIT_SET(res, list->nodes[i]); 52 | return res; 53 | } 54 | 55 | /* 56 | * See original paper 57 | * Bron, Coen; Kerbosch, Joep (1973), "Algorithm 457: finding all cliques of 58 | * an undirected graph", Commun. ACM, ACM, 16 (9): 575–577 59 | * or wiki article (I recommend the latter). Var names (and generally the code) 60 | * here closely resemble ones in the original paper and deserve some deciphering: 61 | * - cur is R in wiki 62 | * - oldSet[0; ne) is X in wiki 63 | * - oldSet[ne; ce) is P in wiki 64 | * 65 | * Pristine Bron-Kerbosch algorithm calculates *all* max cliques. In mtm we 66 | * don't need that, so we return in result only one biggest max clique 67 | * (actually, this means we could avoid maintaining X altogether). 68 | * What we do need though is deterministic calculation, so that whenever we 69 | * have a majority of nodes seeing each other, *all* members of some such 70 | * majority calculate *the same* clique. e.g. 
with topology 71 | * 72 | * 2 73 | * /|\ 74 | * 1 | 3 75 | * \|/ 76 | * 4 77 | * 78 | * 2 and 4 must calculate the same clique, or we won't converge. 79 | * To this end, we compare max cliques by nodemask and pick the 80 | * smallest one. 81 | */ 82 | static void 83 | extend(NodeList* cur, NodeList* result, nodemask_t* graph, int* oldSet, int ne, int ce) 84 | { 85 | int nod = 0; 86 | int minnod = ce; 87 | int fixp = -1; /* pivot (u in wiki) */ 88 | /* index in oldSet of next vertice we'll include in R -- vertex v in wiki*/ 89 | int s = -1; 90 | int i, j, k; 91 | int newce, newne; 92 | int sel; /* the vertex moved P->R itself, pointed to by s -- v in wiki */ 93 | int newSet[MAX_NODES]; 94 | 95 | /* Choose the pivot vertex fixp */ 96 | for (i = 0; i < ce && minnod != 0; i++) 97 | { 98 | int p = oldSet[i]; 99 | int cnt = 0; 100 | int pos = -1; 101 | 102 | /* 103 | * Count how many non-neighbours of potential pivot we have in P. 104 | * Counterintuitively, we require input to have self-loops, so node is 105 | * sorta neighbour of itself, though we must also recurse into it and 106 | * thus we miss it here (in cnt) and count it in nod instead. 107 | * This mumbo-jumbo is important as it forces (cnt < minnod) be true 108 | * when P contains only one vertex (minnod=1 initially). 109 | * I'd actually make initial minnod bigger and remove self loops... 
110 | */ 111 | for (j = ne; j < ce && cnt < minnod; j++) 112 | { 113 | if (!BIT_CHECK(graph[p], oldSet[j])) 114 | { 115 | cnt++; 116 | pos = j; 117 | } 118 | } 119 | 120 | if (cnt < minnod) 121 | { 122 | minnod = cnt; 123 | fixp = p; 124 | if (i < ne) 125 | { 126 | /* if pivot is from X, not P, take random non-neighbour */ 127 | s = pos; 128 | } 129 | else 130 | { 131 | /* 132 | * else, process pivot itself first, otherwise we won't find 133 | * it in the loop below as pivot is a neighbour of itself 134 | */ 135 | s = i; 136 | /* don't forget to increment num of nodes to recurse to */ 137 | nod = 1; 138 | } 139 | } 140 | } 141 | 142 | for (k = minnod + nod; k >= 1; k--) 143 | { 144 | Assert(s >= 0); 145 | Assert(s < MAX_NODES); 146 | Assert(ne >= 0); 147 | Assert(ne < MAX_NODES); 148 | Assert(ce >= 0); 149 | Assert(ce < MAX_NODES); 150 | 151 | /* 152 | * put (wiki) v on the border of X and P, we'll move the border to 153 | * relocate the vertex 154 | */ 155 | sel = oldSet[s]; 156 | oldSet[s] = oldSet[ne]; 157 | oldSet[ne] = sel; 158 | 159 | newne = 0; 160 | /* form X for recursive call -- leave only v's neighbours */ 161 | for (i = 0; i < ne; i++) { 162 | if (BIT_CHECK(graph[sel], oldSet[i])) { 163 | newSet[newne++] = oldSet[i]; 164 | } 165 | } 166 | 167 | newce = newne; 168 | /* 169 | * similarly, form P for recursive call -- leave only v's neighbours 170 | * 171 | * + 1 skips v itself, which is moved to R (again the crutch 172 | * introduced by self loops) 173 | */ 174 | for (i = ne + 1; i < ce; i++) { 175 | if (BIT_CHECK(graph[sel], oldSet[i])) { 176 | newSet[newce++] = oldSet[i]; 177 | } 178 | } 179 | /* push v to R */ 180 | _list_append(cur, sel); 181 | if (newce == 0) { /* both P and X are empty => max clique */ 182 | if (result->size < cur->size || 183 | (result->size == cur->size && 184 | _list_to_nodemask(result) > _list_to_nodemask(cur))) { 185 | _list_copy(result, cur); 186 | } 187 | } else if (newne < newce) { /* P is not empty, so recurse */ 188 | if 
(cur->size + newce - newne > result->size) { 189 | extend(cur, result, graph, newSet, newne, newce); 190 | } 191 | } 192 | /* remove v back from R for the next iteration */ 193 | cur->size -= 1; 194 | /* move v from P to X */ 195 | ne += 1; 196 | /* and find in P next non-neighbour of pivot */ 197 | if (k > 1) 198 | { 199 | 200 | for (s = ne; BIT_CHECK(graph[fixp], oldSet[s]); s++) 201 | { 202 | Assert(s < MAX_NODES); 203 | } 204 | } 205 | } 206 | } 207 | 208 | /* 209 | * Deterministically (c.f. extend) calculates biggest max clique of the graph. 210 | * The matrix must be symmetric (undirected graph) and must have 1 on the 211 | * diagonal (self loops). 212 | * 213 | * Note that this API renders impossible to distinguish absent node from node 214 | * without any edges -- absent nodes with ids <= n_nodes must still have 1 215 | * on the diagonal. This is fine as we are not interested much in cliques 216 | * of size 1, they never form majority; well, not as far as we don't support 217 | * cluster of size 1. 218 | */ 219 | nodemask_t 220 | MtmFindMaxClique(nodemask_t* graph, int n_nodes, int* clique_size) 221 | { 222 | NodeList tmp; 223 | NodeList result; 224 | int all[MAX_NODES]; 225 | int i; 226 | int j; 227 | 228 | tmp.size = 0; 229 | result.size = 0; 230 | for (i = 0; i < MAX_NODES; i++) 231 | all[i] = i; 232 | 233 | /* check that matrix is symmetric */ 234 | for (i = 0; i < n_nodes; i++) 235 | for (j = 0; j < n_nodes; j++) 236 | Assert(BIT_CHECK(graph[i], j) == BIT_CHECK(graph[j], i)); 237 | 238 | /* algorithm requires diagonal elements to be set */ 239 | for (i = 0; i < n_nodes; i++) 240 | Assert(BIT_CHECK(graph[i], i)); 241 | 242 | extend(&tmp, &result, graph, all, 0, n_nodes); 243 | 244 | *clique_size = result.size; 245 | return _list_to_nodemask(&result); 246 | } 247 | 248 | #ifdef TEST 249 | #include 250 | 251 | /* 252 | * To run some randomized tests, compile with -DTEST to ./a.out, e.g. 
253 | * gcc -ggdb3 -O0 -DTEST bkb.c 254 | * , install sage and run ./test_bkb.sage.py 255 | */ 256 | 257 | int main() 258 | { 259 | nodemask_t matrix[64] = {0}; 260 | nodemask_t clique; 261 | int clique_size; 262 | int n_nodes; 263 | 264 | n_nodes = 4; 265 | matrix[0] = 15; /* 1111 */ 266 | matrix[1] = 15; /* 1111 */ 267 | matrix[2] = 7; /* 0111 */ 268 | matrix[3] = 11; /* 1011 */ 269 | 270 | scanf("%d", &n_nodes); 271 | for (int i = 0; i < n_nodes; i++) 272 | { 273 | nodemask_t row; 274 | scanf("%ld", &row); 275 | matrix[i] = row; 276 | } 277 | 278 | clique = MtmFindMaxClique(matrix, n_nodes, &clique_size); 279 | printf("%ld %d\n", clique, clique_size); 280 | return 0; 281 | } 282 | #endif 283 | -------------------------------------------------------------------------------- /src/bytebuf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * bytebuf.c 3 | * 4 | * Copyright (c) 2016-2021, Postgres Professional 5 | * 6 | */ 7 | #include "postgres.h" 8 | 9 | #include "bytebuf.h" 10 | 11 | #define INIT_BUF_SIZE 1024 12 | 13 | void 14 | ByteBufferAlloc(ByteBuffer *buf) 15 | { 16 | buf->size = INIT_BUF_SIZE; 17 | buf->data = palloc(buf->size); 18 | buf->used = 0; 19 | } 20 | 21 | void 22 | ByteBufferAppend(ByteBuffer *buf, void *data, int len) 23 | { 24 | if (buf->used + len > buf->size) 25 | { 26 | buf->size = buf->used + len > buf->size * 2 ? 
buf->used + len : buf->size * 2; 27 | buf->data = (char *) repalloc(buf->data, buf->size); 28 | } 29 | memcpy(&buf->data[buf->used], data, len); 30 | buf->used += len; 31 | } 32 | 33 | void 34 | ByteBufferAppendInt32(ByteBuffer *buf, int data) 35 | { 36 | ByteBufferAppend(buf, &data, sizeof data); 37 | } 38 | 39 | void 40 | ByteBufferFree(ByteBuffer *buf) 41 | { 42 | pfree(buf->data); 43 | } 44 | 45 | void 46 | ByteBufferReset(ByteBuffer *buf) 47 | { 48 | buf->used = 0; 49 | } 50 | -------------------------------------------------------------------------------- /src/ddd.c: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * ddd.c 4 | * 5 | * Distributed deadlock detector. 6 | * 7 | * Copyright (c) 2017-2021, Postgres Professional 8 | * 9 | *---------------------------------------------------------------------------- 10 | */ 11 | 12 | #include "postgres.h" 13 | #include "access/clog.h" 14 | #include "access/twophase.h" 15 | #include "access/transam.h" 16 | #include "storage/lwlock.h" 17 | #include "storage/ipc.h" 18 | #include "storage/proc.h" 19 | #include "utils/hsearch.h" 20 | #include "utils/timeout.h" 21 | #include "miscadmin.h" 22 | #include "replication/origin.h" 23 | #include "replication/message.h" 24 | #include "utils/builtins.h" 25 | #include "storage/lmgr.h" 26 | #include "storage/procarray.h" 27 | 28 | #include "multimaster.h" 29 | 30 | #include "ddd.h" 31 | #include "bytebuf.h" 32 | #include "state.h" 33 | #include "logger.h" 34 | #include "commit.h" 35 | 36 | 37 | /* 38 | * This DDD is based on following observations: 39 | * 40 | * Situation when a transaction (say T1) in apply_worker (or receiver 41 | * itself) stucks on some lock created by a transaction in a local backend (say 42 | * T2) will definitely lead to a deadlock since T2 after being prepared and 43 | * replicated will fail to obtain lock that is already held by T1. 
/*
 * This DDD is based on the following observations:
 *
 * A situation when a transaction (say T1) in an apply_worker (or the
 * receiver itself) is stuck on a lock created by a transaction in a local
 * backend (say T2) will definitely lead to a deadlock, since T2, after
 * being prepared and replicated, will fail to obtain the lock that is
 * already held by T1.
 * The same reasoning applies when an apply_worker (or receiver) is waiting
 * for an apply_worker (or receiver) belonging to another origin -- no need
 * to wait for distributed deadlock detection, we may just instantly abort.
 * The only remaining case of distributed deadlock is an apply_worker (or
 * receiver) waiting for another apply_worker from the *same* origin.
 * However, that situation is not possible, since one origin node cannot
 * have two conflicting prepared transactions simultaneously.
 *
 * So we may construct a distributed-deadlock-avoidance mechanism by
 * disallowing such edges.  Now ask the inverse question: which wait graphs
 * with such edges do NOT actually represent a distributed deadlock?  That
 * may happen when the holding transaction is purely local, since it then
 * holds locks only in SHARED mode.  The only lock levels conflicting with
 * that mode are EXCLUSIVE and ACCESS EXCLUSIVE; in all other cases the
 * proposed avoidance scheme should not yield false positives.
 *
 * To cope with false positives in EXCLUSIVE and ACCESS EXCLUSIVE modes we
 * may throw the exception not in WaitOnLock() when we first see a
 * forbidden edge, but later, during the first call to the local deadlock
 * detector.  This way we still have `deadlock_timeout` seconds to grab the
 * lock, and the database user can also increase it per-transaction if
 * there are long-living read-only transactions.
 *
 * As a further optimization it is possible to check whether our lock is
 * EXCLUSIVE or higher, so as not to delay the rollback until the
 * `deadlock_timeout` event.
 */
/*
 * Decide whether the wait of `proc` (must be MyProc) should be treated as a
 * potential distributed deadlock.  Returns true to make the caller abort
 * the waiter; per the reasoning above this may be a false positive, which
 * is deemed acceptable.  NOTE(review): appears to be invoked from the local
 * deadlock-detector path of an apply worker -- confirm at call site.
 */
bool
MtmDetectGlobalDeadLock(PGPROC *proc)
{
	StringInfoData locktagbuf;
	LOCK	   *lock = proc->waitLock;
	bool		is_detected = false;

	Assert(proc == MyProc);

	/*
	 * These locks never participate in deadlocks, ignore them. Without it,
	 * spurious deadlocks might be reported due to concurrency on rel
	 * extension.
	 */
	if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
		(LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
		return false;

	/*
	 * There is no need to check for deadlocks in recovery: all
	 * conflicting transactions must be eventually committed/aborted
	 * by the resolver. It would not be fatal, but restarting due to
	 * deadlock ERRORs might significantly slow down the recovery
	 */
	is_detected = (curr_replication_mode == REPLMODE_NORMAL);

	if (is_detected)
	{
		/* Log what we are blocked on before reporting the deadlock. */
		initStringInfo(&locktagbuf);
		DescribeLockTag(&locktagbuf, &lock->tag);
		mtm_log(LOG, "apply worker %d waits for %s on %s",
				MyProcPid,
				GetLockmodeName(lock->tag.locktag_lockmethodid, proc->waitLockMode),
				locktagbuf.data);
	}

	return is_detected;

}
/*
 * Shared data of BgwPool
 *
 * A circular byte queue feeding dynamic apply workers.  NOTE(review): the
 * single producer appears to be the receiver (see receiver_pid /
 * producerBlocked) -- confirm against bgwpool.c.
 */
typedef struct BgwPool
{
	int			sender_node_id;
	LWLock		lock;			/* protects the queue state below */
	ConditionVariable syncpoint_cv;
	int			n_holders;

	/* Tell workers that queue contains a number of work. */
	ConditionVariable available_cv;

	/*
	 * Queue is full. We can't insert a work data into the queue and wait
	 * while any worker will take over a piece of data from queue and we will
	 * do an attempt to try to add the work data into the queue.
	 */
	ConditionVariable overflow_cv;

	/* Queue state */
	size_t		head;
	size_t		tail;
	size_t		size;			/* Size of queue aligned to INT word */

	bool		producerBlocked;

	char		poolName[MAX_NAME_LEN];
	Oid			db_id;
	Oid			user_id;
	dsm_handle	dsmhandler;		/* DSM descriptor. Workers use it for
								 * attaching */

	size_t		nWorkers;		/* a number of pool workers launched */
	TimestampTz lastDynamicWorkerStartTime;
	/* Handlers of workers at the pool */
	BackgroundWorkerHandle **bgwhandles;
	pid_t		receiver_pid;

	txlist_t	txlist;
} BgwPool;


/* Pool lifecycle and work submission. */
extern void BgwPoolStart(int sender_node_id, char *poolName, Oid db_id, Oid user_id);
extern void BgwPoolExecute(BgwPool *pool, void *work, int size, MtmReceiverWorkerContext *rwctx);
extern void BgwPoolShutdown(BgwPool *poolDesc);
extern void BgwPoolCancel(BgwPool *pool);

/* txlist: ordering of in-flight transactions relative to syncpoints. */
extern int	txl_store(txlist_t *txlist, int value);
extern void txl_remove(txlist_t *txlist, int txlist_pos);
extern void txl_wait_syncpoint(txlist_t *txlist, int txlist_pos);
extern void txl_wait_sphead(txlist_t *txlist, int txlist_pos);
extern void txl_wait_txhead(txlist_t *txlist, int txlist_pos);
extern void txl_wakeup_workers(txlist_t *txlist);

#endif
/*----------------------------------------------------------------------------
 *
 * commit.h
 *	  Multimaster two-phase commit: gid generation/parsing and transaction
 *	  begin/commit hooks.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2021, Postgres Professional
 *
 *----------------------------------------------------------------------------
 */

#ifndef COMMIT_H
#define COMMIT_H

#include "postgres.h"
#include "access/xact.h"

#include "messaging.h"

/*
 * gid starting with MTM is used by internal multimaster 2PC xacts; clients
 * shouldn't use them for their own prepares.
 */
#define IS_EXPLICIT_2PC_GID(gid) (strncmp((gid), "MTM-", 4) != 0)

/* Compose an internal "MTM-" gid from coordinator node, xid and generation. */
extern void MtmGenerateGid(char *gid, int node_id, TransactionId xid,
						   uint64 gen_num);
/* Parse the individual components back out of such a gid. */
extern uint64 MtmGidParseGenNum(const char *gid);
extern int	MtmGidParseNodeId(const char *gid);
extern TransactionId MtmGidParseXid(const char *gid);

/* Commit-path entry points driven from transaction callbacks. */
extern bool MtmTwoPhaseCommit(void);
extern void MtmBeginTransaction(void);
extern void MtmXactCallback(XactEvent event, void *arg);

/* Support for explicit (user-issued) PREPARE / COMMIT|ROLLBACK PREPARED. */
extern bool MtmExplicitPrepare(char *gid);
extern void MtmExplicitFinishPrepared(bool isTopLevel, char *gid, bool isCommit);

#endif							/* COMMIT_H */
*proc); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /src/include/ddl.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * ddl.h 4 | * Statement based replication of DDL commands. 5 | * 6 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 7 | * Portions Copyright (c) 1994, Regents of the University of California 8 | * Portions Copyright (c) 2021, Postgres Professional 9 | * 10 | *---------------------------------------------------------------------------- 11 | */ 12 | 13 | #ifndef DML_H 14 | #define DML_H 15 | 16 | #include "utils/relcache.h" 17 | 18 | /* GUCs */ 19 | extern bool MtmMonotonicSequences; 20 | extern char *MtmRemoteFunctionsList; 21 | extern bool MtmRemoteFunctionsUpdating; 22 | extern bool MtmVolksWagenMode; 23 | extern bool MtmIgnoreTablesWithoutPk; 24 | 25 | typedef enum 26 | { 27 | MTM_DDL_IN_PROGRESS_NOTHING, 28 | MTM_DDL_IN_PROGRESS_TX, 29 | MTM_DDL_IN_PROGRESS_NONTX, 30 | } MtmDDLInProgress; 31 | 32 | extern MtmDDLInProgress DDLApplyInProgress; 33 | 34 | extern void MtmDDLReplicationInit(void); 35 | extern void MtmDDLReplicationShmemStartup(void); 36 | extern void temp_schema_reset_all(int my_node_id); 37 | extern bool MtmIsRelationLocal(Relation rel); 38 | extern void MtmDDLResetStatement(void); 39 | extern void MtmApplyDDLMessage(const char *messageBody, bool transactional); 40 | extern void MtmDDLResetApplyState(void); 41 | extern void MtmSetRemoteFunction(char const *list, void *extra); 42 | extern void MtmToggleDML(void); 43 | extern void MtmMakeTableLocal(char const *schema, char const *name, bool locked); 44 | extern void multimaster_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private); 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/include/dmq.h: 
-------------------------------------------------------------------------------- 1 | #ifndef DMQ_H 2 | #define DMQ_H 3 | 4 | #include "libpq-fe.h" 5 | #include "lib/stringinfo.h" 6 | 7 | typedef int8 DmqDestinationId; 8 | 9 | #define DMQ_NAME_MAXLEN 32 10 | /* mm currently uses xact gid as stream name, so this should be >= GIDSIZE */ 11 | #define DMQ_STREAM_NAME_MAXLEN 200 12 | 13 | extern void dmq_init(int send_timeout, int connect_timeout); 14 | 15 | #define DMQ_N_MASK_POS 16 /* ought to be >= MTM_MAX_NODES */ 16 | extern DmqDestinationId dmq_destination_add(char *connstr, char *sender_name, 17 | char *receiver_name, int8 recv_mask_pos, 18 | int ping_period); 19 | extern void dmq_destination_drop(char *receiver_name); 20 | extern void dmq_destination_reconnect(char *receiver_name); 21 | 22 | extern void dmq_attach_receiver(char *sender_name, int8 mask_pos); 23 | extern void dmq_detach_receiver(char *sender_name); 24 | 25 | extern void dmq_terminate_receiver(char *name); 26 | 27 | extern void dmq_reattach_receivers(void); 28 | extern void dmq_stream_subscribe(char *stream_name); 29 | extern void dmq_stream_unsubscribe(void); 30 | 31 | extern void dmq_get_sendconn_cnt(uint64 participants, int *sconn_cnt); 32 | extern bool dmq_pop(int8 *sender_mask_pos, StringInfo msg, uint64 mask); 33 | extern bool dmq_pop_nb(int8 *sender_mask_pos, StringInfo msg, uint64 mask, bool *wait); 34 | extern uint64 dmq_purge_failed_participants(uint64 participants, int *sconn_cnt); 35 | 36 | extern void dmq_push(DmqDestinationId dest_id, char *stream_name, char *msg); 37 | extern void dmq_push_buffer(DmqDestinationId dest_id, char *stream_name, const void *buffer, size_t len); 38 | 39 | typedef void (*dmq_hook_type) (char *); 40 | extern void *(*dmq_receiver_start_hook)(char *sender_name); 41 | extern dmq_hook_type dmq_receiver_stop_hook; 42 | extern void (*dmq_receiver_heartbeat_hook)(char *sender_name, StringInfo msg, void *extra); 43 | extern dmq_hook_type dmq_sender_connect_hook; 44 | 
extern void (*dmq_sender_heartbeat_hook)(char *receiver_name, StringInfo buf); 45 | extern dmq_hook_type dmq_sender_disconnect_hook; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/include/global_tx.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * global_tx.h 4 | * Persistent and in-memory state necessary for our E3PC-like atomic commit 5 | # protocol. 6 | * 7 | * Copyright (c) 2016-2021, Postgres Professional 8 | * 9 | *---------------------------------------------------------------------------- 10 | */ 11 | #ifndef GLOBAL_TX_H 12 | #define GLOBAL_TX_H 13 | 14 | #include "multimaster.h" 15 | 16 | typedef struct 17 | { 18 | int ballot; 19 | int node_id; 20 | } GlobalTxTerm; 21 | 22 | #define InvalidGTxTerm ((GlobalTxTerm) {0, 0}) 23 | /* 24 | * This term with ballot 1 and fake 0 node id is less than any term generated 25 | * by resolver; it is used by the coordinator itself. 
26 | */ 27 | #define InitialGTxTerm ((GlobalTxTerm) {1, 0}) 28 | 29 | typedef enum 30 | { 31 | GTXInvalid = 0, /* we never gave a vote */ 32 | GTXPreCommitted, /* voted for commit */ 33 | GTXPreAborted, /* voted for abort */ 34 | GTXCommitted, /* definitely know xact is committed */ 35 | GTXAborted /* definitely know xact is aborted */ 36 | } GlobalTxStatus; 37 | 38 | extern char const *const GlobalTxStatusMnem[]; 39 | 40 | typedef enum 41 | { 42 | GTRS_AwaitStatus, /* 1a sent, wait for 1b */ 43 | GTRS_AwaitAcks /* 2a sent, wait for 2b */ 44 | } GlobalTxResolvingStage; 45 | 46 | typedef struct 47 | { 48 | GlobalTxTerm proposal; /* nextBal in terms of The Part-Time Parliament */ 49 | GlobalTxTerm accepted; /* prevBal in terms of The Part-Time Parliament */ 50 | GlobalTxStatus status; /* 51 | * prevDec in terms of The Part-Time Parliament 52 | * (or special never voted | commit | abort) 53 | */ 54 | } GTxState; 55 | 56 | /* 57 | * Constant xact metadata which we encode into state_3pc. We could (and 58 | * previously did) carry that directly in gid, but this intervenes with 59 | * explicit 2PC usage: applier must know generation of the xact, and 60 | * scribbling over user-provided gid is ugly and/or inefficient. 61 | */ 62 | typedef struct 63 | { 64 | int coordinator; /* node id who initiated the transaction */ 65 | TransactionId xid; /* xid at coordinator */ 66 | uint64 gen_num; /* the number of generation xact belongs to */ 67 | nodemask_t configured; /* mask of configured nodes of this generation; 68 | * the idea was to use this by resolver, but it 69 | * wasn't finished. 
We shouldn't have any problems 70 | * with this anyway if all xacts created before 71 | * first node add-rm are resolved before the 72 | * second one is started 73 | */ 74 | } XactInfo; 75 | 76 | typedef struct GlobalTx 77 | { 78 | char gid[GIDSIZE]; 79 | XactInfo xinfo; 80 | XLogRecPtr coordinator_end_lsn; 81 | BackendId acquired_by; 82 | /* paxos voting state for this xact */ 83 | GTxState state; 84 | /* transient thing used to rm shmem entry on error */ 85 | bool prepared; 86 | 87 | /* resolver corner */ 88 | bool orphaned; /* Indication for resolver that current tx needs 89 | * to be picked up. Comes from a failed backend or 90 | * a disabled node. */ 91 | GTxState phase1_acks[MTM_MAX_NODES]; 92 | /* 93 | * Technically phase2 ack contains just one term, which is acked. However, 94 | * we 1) collect decrees (in 'status') to perform sanity checks 95 | * 2) make it GTxState to reuse quorum() function. 96 | */ 97 | GTxState phase2_acks[MTM_MAX_NODES]; 98 | GlobalTxResolvingStage resolver_stage; 99 | } GlobalTx; 100 | 101 | typedef struct 102 | { 103 | LWLock *lock; 104 | HTAB *gid2gtx; 105 | } gtx_shared_data; 106 | 107 | extern gtx_shared_data *gtx_shared; 108 | 109 | void MtmGlobalTxInit(void); 110 | void MtmGlobalTxShmemStartup(void); 111 | void GlobalTxEnsureBeforeShmemExitHook(void); 112 | GlobalTx *GlobalTxAcquire(const char *gid, bool create, bool nowait_own_live, 113 | bool *busy, int coordinator); 114 | void GlobalTxRelease(GlobalTx *gtx); 115 | void GlobalTxAtExit(int code, Datum arg); 116 | void GlobalTxLoadAll(void); 117 | char *serialize_xstate(XactInfo *xinfo, GTxState *gtx_state); 118 | int term_cmp(GlobalTxTerm t1, GlobalTxTerm t2); 119 | int deserialize_xstate(const char *state, XactInfo *xinfo, GTxState *gtx_state, 120 | int elevel); 121 | GlobalTxTerm GlobalTxGetMaxProposal(void); 122 | void GlobalTxSaveInTable(const char *gid, XLogRecPtr coordinator_end_lsn, 123 | GlobalTxStatus status, 124 | GlobalTxTerm term_prop, GlobalTxTerm term_acc); 125 | 
void GlobalTxMarkOrphaned(int node_id); 126 | 127 | char *GlobalTxToString(GlobalTx *gtx); 128 | 129 | #endif /* GLOBAL_TX_H */ 130 | -------------------------------------------------------------------------------- /src/include/logger.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * logger.h 4 | * GUC-controlled map from application meaningful log tags to actual log 5 | * levels. 6 | * 7 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 8 | * Portions Copyright (c) 1994, Regents of the University of California 9 | * Portions Copyright (c) 2021, Postgres Professional 10 | * 11 | *---------------------------------------------------------------------------- 12 | */ 13 | 14 | #include "postgres.h" 15 | 16 | #include "postmaster/bgworker.h" 17 | #include "utils/elog.h" 18 | #include "utils/memutils.h" 19 | 20 | /* 21 | * this hack allows to use mtm_log with direct log level (e.g. 
ERROR), see 22 | * mtm_log 23 | */ 24 | #define FIRST_UNUSED_ERRCODE (PANIC + 1) 25 | 26 | /* keep it in sync with mtm_log_gucs */ 27 | typedef enum MtmLogTag 28 | { 29 | /* general */ 30 | MtmTxTrace = FIRST_UNUSED_ERRCODE, 31 | MtmTxFinish, 32 | 33 | /* coordinator */ 34 | MtmCoordinatorTrace, 35 | 36 | /* dmq */ 37 | DmqStateIntermediate, 38 | DmqStateFinal, 39 | DmqTraceOutgoing, 40 | DmqTraceIncoming, 41 | DmqTraceShmMq, 42 | DmqPqTiming, 43 | 44 | /* resolver */ 45 | ResolverState, 46 | ResolverTx, 47 | ResolverTasks, 48 | 49 | /* status worker */ 50 | StatusRequest, 51 | 52 | /* pool */ 53 | BgwPoolEvent, 54 | BgwPoolEventDebug, 55 | 56 | /* ddd */ 57 | DeadlockCheck, 58 | DeadlockUpdate, 59 | DeadlockSerialize, 60 | 61 | /* ddl */ 62 | DDLStmtOutgoing, 63 | DDLStmtIncoming, 64 | DDLProcessingTrace, 65 | 66 | /* walsender's proto */ 67 | ProtoTraceFilter, 68 | ProtoTraceSender, 69 | ProtoTraceMessage, 70 | ProtoTraceState, 71 | 72 | /* receiver */ 73 | MtmReceiverState, 74 | MtmReceiverStateDebug, 75 | MtmReceiverFilter, 76 | MtmApplyMessage, 77 | MtmApplyTrace, 78 | MtmApplyError, 79 | MtmApplyBgwFinish, 80 | MtmReceiverFeedback, 81 | 82 | /* state */ 83 | MtmStateMessage, 84 | MtmStateSwitch, 85 | MtmStateDebug, 86 | 87 | /* syncpoints */ 88 | SyncpointCreated, 89 | SyncpointApply, 90 | 91 | /* Node add/drop */ 92 | NodeMgmt 93 | } MtmLogTag; 94 | 95 | typedef struct MtmLogGuc 96 | { 97 | const char *name; 98 | int default_val; 99 | int val; 100 | } MtmLogGuc; 101 | 102 | extern MtmLogGuc mtm_log_gucs[]; 103 | 104 | #define MTM_TAG "[MTM]%s" 105 | 106 | /* 107 | * I tried to use get_ps_display instead of MyBgworkerEntry, but it returns 108 | * only dynamic 'activity' part which doesn't include bgw name. Apparently 109 | * there is no way to retrieve main part. Weird. 
110 | */ 111 | extern bool MtmBackgroundWorker; /* avoid including multimaster.h for this */ 112 | extern char *walsender_name; /* same for pglogical_proto.h */ 113 | static inline char * 114 | am(void) 115 | { 116 | char *res = " "; 117 | char *name = NULL; 118 | 119 | if (MtmBackgroundWorker) 120 | name = MyBgworkerEntry->bgw_name; 121 | else if (walsender_name) 122 | name = walsender_name; 123 | if (name) 124 | { 125 | /* this is for elog, so alloc in ErrorContext where fmt is evaluated */ 126 | MemoryContext old_ctx = MemoryContextSwitchTo(ErrorContext); 127 | res = psprintf(" [%s] ", name); 128 | MemoryContextSwitchTo(old_ctx); 129 | } 130 | return res; 131 | } 132 | 133 | #define MTM_ERRMSG(fmt,...) errmsg(MTM_TAG fmt, am(), ## __VA_ARGS__) 134 | 135 | /* 136 | * tag can either one of MtmLogTag values (in which case corresponding GUC 137 | * defines the actual log level) or direct level like ERROR 138 | */ 139 | #define mtm_log(tag, fmt, ...) ereport( \ 140 | ((tag) >= FIRST_UNUSED_ERRCODE ? \ 141 | mtm_log_gucs[tag - FIRST_UNUSED_ERRCODE].val : (tag)), \ 142 | (errmsg(MTM_TAG fmt, \ 143 | am(), ## __VA_ARGS__), \ 144 | errhidestmt(true), errhidecontext(true))) 145 | -------------------------------------------------------------------------------- /src/include/messaging.h: -------------------------------------------------------------------------------- 1 | 2 | /***************************************************************************** 3 | * 4 | * Messaging 5 | * 6 | *****************************************************************************/ 7 | #ifndef MESSAGING_H 8 | #define MESSAGING_H 9 | 10 | #include "global_tx.h" 11 | #include "state.h" 12 | 13 | /* 14 | * All messages are stamped with MtmMessageTag that should came before the rest 15 | * of the message. That is used upon receival as typecasting criterion. 
16 | */ 17 | typedef enum 18 | { 19 | T_MtmPrepareResponse = 0, 20 | T_Mtm2AResponse, 21 | T_MtmTxRequest, 22 | T_MtmTxStatusResponse, 23 | T_MtmHeartbeat, 24 | T_MtmGenVoteRequest, 25 | T_MtmGenVoteResponse 26 | } MtmMessageTag; 27 | 28 | typedef struct MtmMessage 29 | { 30 | MtmMessageTag tag; 31 | } MtmMessage; 32 | 33 | #define messageTag(msgptr) (((const MtmMessage *)(msgptr))->tag) 34 | 35 | /* Response to PREPARE by apply worker */ 36 | typedef struct 37 | { 38 | MtmMessageTag tag; 39 | int node_id; 40 | /* for PREPARE we care only about, well, prepare success */ 41 | bool prepared; 42 | int32 errcode; 43 | const char *errmsg; 44 | TransactionId xid; /* identifies the message */ 45 | } MtmPrepareResponse; 46 | 47 | /* 48 | * Response to 2A msg by apply worker or by replier (during resolving). 49 | * This could be named just 2B, ha. 50 | * It is also abused for COMMIT PREPARED ack (with .status = GTXCommitted). 51 | */ 52 | typedef struct 53 | { 54 | MtmMessageTag tag; 55 | int node_id; 56 | /* 57 | * Our prevVote in terms of the Part-Time Parliament paper. Actually there 58 | * is no need to carry the decree (status) itself, ballot (term) is 59 | * enough, but it is kept for convenience. 60 | */ 61 | GlobalTxStatus status; 62 | GlobalTxTerm accepted_term; 63 | int32 errcode; 64 | const char *errmsg; 65 | const char *gid; /* identifies the message */ 66 | } Mtm2AResponse; 67 | 68 | /* 69 | * Response on MtmLastTermRequest request, holds last proposal value. 70 | */ 71 | typedef struct 72 | { 73 | MtmMessageTag tag; 74 | GlobalTxTerm term; 75 | } MtmLastTermResponse; 76 | 77 | /* 78 | * Request to change transaction state. This messages are duplicate of 79 | * corresponding WAL records, but we need them during transaction resolution 80 | * upon recovery as WAL receiver may be blocked by a transaction that we 81 | * are actually resolving. 82 | * 83 | * Sent from mtm-resolver to mtm-status worker. 
84 | */ 85 | typedef enum 86 | { 87 | MTReq_Abort = 0, 88 | MTReq_Commit, 89 | MTReq_Precommit, /* 2a with value commit */ 90 | MTReq_Preabort, /* 2a with value abort */ 91 | MTReq_Status /* 1a */ 92 | } MtmTxRequestValue; 93 | 94 | typedef struct 95 | { 96 | MtmMessageTag tag; 97 | MtmTxRequestValue type; 98 | GlobalTxTerm term; 99 | const char *gid; 100 | int coordinator; 101 | uint64 gen_num; 102 | XLogRecPtr coordinator_end_lsn; /* matters for 1a */ 103 | } MtmTxRequest; 104 | 105 | extern char const * const MtmTxRequestValueMnem[]; 106 | 107 | /* 108 | * Status response, phase 1b of paxos on a given transaction result. 109 | * Sent from mtm-status to mtm-resolver worker. 110 | */ 111 | typedef struct 112 | { 113 | MtmMessageTag tag; 114 | int node_id; 115 | GTxState state; 116 | const char *gid; 117 | } MtmTxStatusResponse; 118 | 119 | /* 120 | * Data sent in dmq heartbeats. 121 | */ 122 | typedef struct 123 | { 124 | MtmMessageTag tag; 125 | MtmGeneration current_gen; 126 | uint64 donors; /* xxx nodemask_t */ 127 | uint64 last_online_in; 128 | uint64 connected_mask; /* xxx nodemask_t */ 129 | } MtmHeartbeat; 130 | 131 | /* 132 | * Request to vote for new generation. 133 | */ 134 | typedef struct 135 | { 136 | MtmMessageTag tag; 137 | MtmGeneration gen; 138 | } MtmGenVoteRequest; 139 | 140 | /* 141 | * Reply to new generation vote request. 142 | */ 143 | typedef struct 144 | { 145 | MtmMessageTag tag; 146 | uint64 gen_num; /* identifies the message */ 147 | uint8 vote_ok; 148 | /* last_online_in of replier on the moment of voting, determines donors */ 149 | uint64 last_online_in; 150 | /* 151 | * if vote_ok is false this might be a valid gen number showing that 152 | * replier couldn't vote because its last_vote is higher. 153 | */ 154 | uint64 last_vote_num; 155 | /* 156 | * curr gen donors of the responder and its donors. Sometimes we wish to 157 | * send it along with refusal to vote, see HandleGenVoteRequest. 
158 | */ 159 | MtmGeneration curr_gen; 160 | uint64_t curr_gen_donors; 161 | } MtmGenVoteResponse; 162 | 163 | 164 | StringInfo MtmMessagePack(MtmMessage *anymsg); 165 | MtmMessage *MtmMessageUnpack(StringInfo s); 166 | char *MtmMesageToString(MtmMessage *anymsg); 167 | 168 | #endif /* MESSAGING_H */ 169 | -------------------------------------------------------------------------------- /src/include/mtm_utils.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * mtm_utils.h 4 | * Utility functions: 5 | * - disable global timeouts settings; 6 | * - libpq connect function wrappers. 7 | * 8 | * 9 | * Copyright (c) 2022, Postgres Professional 10 | * 11 | *------------------------------------------------------------------------- 12 | */ 13 | #ifndef MTM_UTILS_H 14 | #define MTM_UTILS_H 15 | 16 | #include "libpq/pqformat.h" 17 | #include "libpq-fe.h" 18 | 19 | extern void MtmDisableTimeouts(void); 20 | 21 | extern PostgresPollingStatusType MtmPQconnectPoll(PGconn *conn); 22 | extern PGconn* MtmPQconnectdb(const char *conninfo); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/pglogical_config.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_LOGICAL_CONFIG_H 2 | #define PG_LOGICAL_CONFIG_H 3 | 4 | #ifndef PG_VERSION_NUM 5 | #error must be included first 6 | #endif 7 | 8 | #include "nodes/pg_list.h" 9 | #include "pglogical_output.h" 10 | 11 | inline static bool 12 | server_float4_byval(void) 13 | { 14 | #ifdef USE_FLOAT4_BYVAL 15 | return true; 16 | #else 17 | return false; 18 | #endif 19 | } 20 | 21 | inline static bool 22 | server_float8_byval(void) 23 | { 24 | #ifdef USE_FLOAT8_BYVAL 25 | return true; 26 | #else 27 | return false; 28 | #endif 29 | } 30 | 31 | inline static bool 32 | server_integer_datetimes(void) 33 | { 34 | #ifdef 
USE_INTEGER_DATETIMES 35 | return true; 36 | #else 37 | return false; 38 | #endif 39 | } 40 | 41 | inline static bool 42 | server_bigendian(void) 43 | { 44 | #ifdef WORDS_BIGENDIAN 45 | return true; 46 | #else 47 | return false; 48 | #endif 49 | } 50 | 51 | extern int process_parameters(List *options, PGLogicalOutputData *data); 52 | 53 | extern List *prepare_startup_message(PGLogicalOutputData *data); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/include/pglogical_hooks.h: -------------------------------------------------------------------------------- 1 | #ifndef PGLOGICAL_HOOKS_H 2 | #define PGLOGICAL_HOOKS_H 3 | 4 | #include "replication/reorderbuffer.h" 5 | 6 | /* public interface for hooks */ 7 | #include "pglogical_output/hooks.h" 8 | #include "pglogical_output.h" 9 | 10 | extern void load_hooks(PGLogicalOutputData *data); 11 | 12 | extern void call_startup_hook(PGLogicalOutputData *data, List *plugin_params); 13 | 14 | extern void call_shutdown_hook(PGLogicalOutputData *data); 15 | 16 | extern bool call_row_filter_hook(PGLogicalOutputData *data, 17 | ReorderBufferTXN *txn, Relation rel, ReorderBufferChange *change); 18 | 19 | extern bool call_txn_filter_hook(PGLogicalOutputData *data, 20 | RepOriginId txn_origin); 21 | 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/include/pglogical_output.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pglogical_output.h 4 | * pglogical output plugin 5 | * 6 | * Copyright (c) 2015, PostgreSQL Global Development Group 7 | * Portions Copyright (c) 2021, Postgres Professional 8 | * 9 | * IDENTIFICATION 10 | * pglogical_output.h 11 | * 12 | *------------------------------------------------------------------------- 13 | */ 14 | #ifndef PG_LOGICAL_OUTPUT_H 15 | #define 
PG_LOGICAL_OUTPUT_H

#include "nodes/parsenodes.h"

#include "replication/logical.h"
#include "replication/output_plugin.h"

#include "storage/lock.h"

#include "pglogical_output/hooks.h"

#include "pglogical_proto.h"

#include "multimaster.h"

/* native protocol version spoken / minimum version accepted by this plugin */
#define PG_LOGICAL_PROTO_VERSION_NUM 1
#define PG_LOGICAL_PROTO_MIN_VERSION_NUM 1

/*
 * The name of a hook function. This is used instead of the usual List*
 * because it can serve as a hash key.
 *
 * Must be zeroed on allocation if used as a hash key since padding is
 * *not* ignored on compare.
 */
typedef struct HookFuncName
{
	/* funcname is more likely to be unique, so goes first */
	char		function[NAMEDATALEN];
	char		schema[NAMEDATALEN];
} HookFuncName;

/* Decoder-private state of the multimaster output plugin. */
typedef struct MtmDecoderPrivate
{
	int			receiver_node_id;	/* id of the receiving node */
	bool		is_recovery;		/* recovery vs normal replication mode */
	MtmConfig  *cfg;				/* cluster configuration snapshot */
} MtmDecoderPrivate;

/* Per-walsender state of the output plugin. */
typedef struct PGLogicalOutputData
{
	MemoryContext context;

	/* protocol writer dispatch table (native or json) */
	PGLogicalProtoAPI *api;

	/* protocol */
	bool		allow_internal_basetypes;
	bool		allow_binary_basetypes;
	bool		forward_changesets;
	bool		forward_changeset_origins;
	int			field_datum_encoding;

	/*
	 * client info
	 *
	 * Lots of this should move to a separate shorter-lived struct used only
	 * during parameter reading, since it contains what the client asked for.
	 * Once we've processed this during startup we don't refer to it again.
	 */
	uint32		client_pg_version;
	uint32		client_max_proto_version;
	uint32		client_min_proto_version;
	const char *client_expected_encoding;
	const char *client_protocol_format;
	uint32		client_binary_basetypes_major_version;
	bool		client_want_internal_basetypes_set;
	bool		client_want_internal_basetypes;
	bool		client_want_binary_basetypes_set;
	bool		client_want_binary_basetypes;
	bool		client_binary_bigendian_set;
	bool		client_binary_bigendian;
	uint32		client_binary_sizeofdatum;
	uint32		client_binary_sizeofint;
	uint32		client_binary_sizeoflong;
	bool		client_binary_float4byval_set;
	bool		client_binary_float4byval;
	bool		client_binary_float8byval_set;
	bool		client_binary_float8byval;
	bool		client_binary_intdatetimes_set;
	bool		client_binary_intdatetimes;
	bool		client_forward_changesets_set;
	bool		client_forward_changesets;
	bool		client_no_txinfo;

	/* hooks */
	List	   *hooks_setup_funcname;
	struct PGLogicalHooks hooks;
	MemoryContext hooks_mctxt;	/* long-lived context hooks run in */

	/* DefElem list populated by startup hook */
	List	   *extra_startup_params;
} PGLogicalOutputData;

/* One decoded tuple: per-attribute values, null flags and changed flags. */
typedef struct PGLogicalTupleData
{
	Datum		values[MaxTupleAttributeNumber];
	bool		nulls[MaxTupleAttributeNumber];
	bool		changed[MaxTupleAttributeNumber];
} PGLogicalTupleData;

extern void MtmOutputPluginWrite(LogicalDecodingContext *ctx, bool last_write, bool flush);
extern void MtmOutputPluginPrepareWrite(LogicalDecodingContext *ctx, bool last_write, bool flush);

#endif							/* PG_LOGICAL_OUTPUT_H */

#ifndef PG_LOGICAL_COMPAT_H
#define PG_LOGICAL_COMPAT_H

#include "pg_config.h"

/* 9.4 lacks
replication origins */
#if PG_VERSION_NUM >= 90500
#define HAVE_REPLICATION_ORIGINS
#else
/* To allow the same signature on hooks in 9.4 */
typedef uint16 RepOriginId;
#define InvalidRepOriginId 0
#endif

/* 9.4 lacks PG_UINT32_MAX */
#ifndef PG_UINT32_MAX
#define PG_UINT32_MAX UINT32_MAX
#endif

#ifndef PG_INT32_MAX
#define PG_INT32_MAX INT32_MAX
#endif

#ifndef PG_INT32_MIN
#define PG_INT32_MIN INT32_MIN
#endif

#endif

#ifndef PGLOGICAL_OUTPUT_HOOKS_H
#define PGLOGICAL_OUTPUT_HOOKS_H

#include "access/xlogdefs.h"
#include "nodes/pg_list.h"
#include "utils/rel.h"
#include "utils/palloc.h"
#include "replication/reorderbuffer.h"

#include "pglogical_output/compat.h"

/*
 * This header is to be included by extensions that implement pglogical output
 * plugin callback hooks for transaction origin and row filtering, etc. It is
 * installed as "pglogical_output/hooks.h"
 *
 * See the README.md and the example in examples/hooks/ for details on hooks.
 */


/* Arguments for the startup hook; in_params is what the client sent. */
struct PGLogicalStartupHookArgs
{
	void	   *private_data;	/* hook-owned opaque state */
	List	   *in_params;
	List	   *out_params;		/* extra params the hook reports back */
};

typedef void (*pglogical_startup_hook_fn) (struct PGLogicalStartupHookArgs *args);


struct PGLogicalTxnFilterArgs
{
	void	   *private_data;
	RepOriginId origin_id;		/* origin of the txn under consideration */
};

typedef bool (*pglogical_txn_filter_hook_fn) (struct PGLogicalTxnFilterArgs *args);


struct PGLogicalRowFilterArgs
{
	void	   *private_data;
	Relation	changed_rel;	/* relation the change applies to */
	enum ReorderBufferChangeType change_type;
	/* detailed row change event from logical decoding */
	ReorderBufferChange *change;
};

typedef bool (*pglogical_row_filter_hook_fn) (struct PGLogicalRowFilterArgs *args);


struct PGLogicalShutdownHookArgs
{
	void	   *private_data;
};

typedef void (*pglogical_shutdown_hook_fn) (struct PGLogicalShutdownHookArgs *args);

/*
 * This struct is passed to the pglogical_get_hooks_fn as the first argument,
 * typed 'internal', and is unwrapped with `DatumGetPointer`.
 */
struct PGLogicalHooks
{
	pglogical_startup_hook_fn startup_hook;
	pglogical_shutdown_hook_fn shutdown_hook;
	pglogical_txn_filter_hook_fn txn_filter_hook;
	pglogical_row_filter_hook_fn row_filter_hook;
	void	   *hooks_private_data;
};


#endif							/* PGLOGICAL_OUTPUT_HOOKS_H */

/*-------------------------------------------------------------------------
 *
 * pglogical_proto.h
 *		pglogical protocol
 *
 * Copyright (c) 2015, PostgreSQL Global Development Group
 * Portions Copyright (c) 2021, Postgres Professional
 *
 * IDENTIFICATION
 *	  pglogical_proto.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef PG_LOGICAL_PROTO_H
#define PG_LOGICAL_PROTO_H

struct PGLogicalOutputData;
struct PGLRelMetaCacheEntry;

extern char *walsender_name;

typedef void (*pglogical_write_rel_fn) (StringInfo out, struct PGLogicalOutputData *data,
										Relation rel /* , struct
													  * PGLRelMetaCacheEntry
													  * *cache_entry */ );

typedef void (*pglogical_write_begin_fn) (StringInfo out, struct PGLogicalOutputData *data,
										  ReorderBufferTXN *txn);
typedef void (*pglogical_write_message_fn) (StringInfo out, LogicalDecodingContext *ctx,
											XLogRecPtr end_lsn,
											const char *prefix, Size sz, const char *message);
typedef void (*pglogical_write_commit_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   ReorderBufferTXN *txn, XLogRecPtr commit_lsn);

typedef void (*pglogical_write_origin_fn) (StringInfo out, const char *origin,
										   XLogRecPtr origin_lsn);

typedef void (*pglogical_write_insert_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   Relation rel, HeapTuple
newtuple);
typedef void (*pglogical_write_update_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   Relation rel, HeapTuple oldtuple,
										   HeapTuple newtuple);
typedef void (*pglogical_write_delete_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   Relation rel, HeapTuple oldtuple);

typedef void (*pglogical_write_caughtup_fn) (StringInfo out, struct PGLogicalOutputData *data,
											 XLogRecPtr wal_end_ptr);

typedef void (*write_startup_message_fn) (StringInfo out, List *msg);

typedef void (*pglogical_setup_hooks_fn) (struct PGLogicalHooks *hooks);

/*
 * Dispatch table of protocol writer callbacks; filled in by
 * pglogical_init_api() for the chosen protocol flavor.
 */
typedef struct PGLogicalProtoAPI
{
	pglogical_write_rel_fn write_rel;
	pglogical_write_begin_fn write_begin;
	pglogical_write_message_fn write_message;
	pglogical_write_commit_fn write_commit;
	pglogical_write_origin_fn write_origin;
	pglogical_write_insert_fn write_insert;
	pglogical_write_update_fn write_update;
	pglogical_write_delete_fn write_delete;
	pglogical_write_caughtup_fn write_caughtup;
	pglogical_setup_hooks_fn setup_hooks;
	write_startup_message_fn write_startup_message;
} PGLogicalProtoAPI;


typedef enum PGLogicalProtoType
{
	PGLogicalProtoNative,
	PGLogicalProtoJson
} PGLogicalProtoType;

extern PGLogicalProtoAPI *pglogical_init_api(PGLogicalProtoType typ);


extern void pglogical_write_abort(StringInfo out,
								  struct PGLogicalOutputData *data,
								  ReorderBufferTXN *txn, XLogRecPtr lsn);
extern void pglogical_write_prepare(StringInfo out,
									struct PGLogicalOutputData *data,
									ReorderBufferTXN *txn, XLogRecPtr lsn);
extern void pglogical_write_commit_prepared(StringInfo out,
											struct PGLogicalOutputData *data,
											ReorderBufferTXN *txn, XLogRecPtr lsn);
extern void pglogical_write_abort_prepared(StringInfo out,
										   struct PGLogicalOutputData *data,
										   ReorderBufferTXN *txn, XLogRecPtr lsn);

#endif							/* PG_LOGICAL_PROTO_H */

#ifndef PGLOGICAL_RELID_MAP
#define PGLOGICAL_RELID_MAP

/* initial bucket count of the remote->local relid hash */
#define PGL_INIT_RELID_MAP_SIZE 256

typedef struct PGLRelidMapEntry
{
	Oid			remote_relid;	/* hash key */
	Oid			local_relid;
} PGLRelidMapEntry;

extern Oid	pglogical_relid_map_get(Oid relid);
extern bool pglogical_relid_map_put(Oid remote_relid, Oid local_relid);
extern void pglogical_relid_map_reset(void);
#endif

#ifndef MTM_RECEIVER_H
#define MTM_RECEIVER_H

#include "libpq-fe.h"

typedef enum
{
	REPLMODE_DISABLED,			/* stop the receiver */
	REPLMODE_RECOVERY,			/* pull changes of all origins */
	REPLMODE_NORMAL				/* pull only sender changes, apply in parallel */
} MtmReplicationMode;

/* ugly exported for the sake of MtmDetectGlobalDeadLock */
extern MtmReplicationMode curr_replication_mode;

#define BGW_POOL_BY_NODE_ID(node_id) (&Mtm->pools[(node_id) - 1])

extern char const *const MtmReplicationModeMnem[];

/* forward decl to avoid including global_tx.h */
struct GlobalTx;

/* same for bgwpool.h */
struct BgwPool;

/*
 * Part of MtmReceiverContext used by both main receiver and parallel workers.
 * Exposed for bgwpool/apply needs.
 */
typedef struct
{
	int			sender_node_id;
	MtmReplicationMode mode;
	/* allows to release gtx on ERROR in apply */
	struct GlobalTx *gtx;
	/*
	 * For parallel workers: position of current job in txlist.
	 */
	int			txlist_pos;
	/*
	 * Info about xact currently being executed
	 */
	TransactionId origin_xid;
	bool		reply_pending;
	/*
	 * true means this is xact with plain commit, so we cannot ignore
	 * apply failure
	 */
	bool		bdr_like;

	struct BgwPool *pool;
} MtmReceiverWorkerContext;

extern void MtmWakeupReceivers(void);

extern void MtmExecutor(void *work, size_t size, MtmReceiverWorkerContext *rwctx);
extern void ApplyCancelHandler(SIGNAL_ARGS);
extern void MtmUpdateLsnMapping(int node_id, XLogRecPtr end_lsn);

extern void MtmBeginSession(int nodeId);
extern void MtmEndSession(int nodeId, bool unlock);

#endif

#ifndef RESOLVER_H
#define RESOLVER_H

#include "postmaster/bgworker.h"

/* transaction resolver background worker entry point */
extern void ResolverMain(Datum main_arg);
void		ResolverWake(void);

#endif							/* RESOLVER_H */

#ifndef __SPILL_H__
#define __SPILL_H__

/* spill-to-disk helpers for oversized transactions (see src/spill.c) */
void		MtmSpillToFile(int fd, char const *data, size_t size);
void		MtmCreateSpillDirectory(int node_id);
int			MtmCreateSpillFile(int node_id, int *file_id);
int			MtmOpenSpillFile(int node_id, int file_id);
void		MtmReadSpillFile(int fd, char *data, size_t size);
void		MtmCloseSpillFile(int fd);

#endif

#ifndef STATE_H
#define STATE_H

/*
 * Generation is a uniquely numbered subset of configured nodes
allowed to
 * commit transactions. Each xact is stamped with the generation it belongs
 * to. A transaction must be PREPAREd on *all* generation members before commit;
 * this provides recovery -> normal work transition without risk of reordering
 * xacts.
 *
 * The two main properties of generations are
 * - At each node all prepares of generation n which might ever be committed
 *   lie strictly before all such prepares of generation n+1.
 * - A node which is MTM_GEN_ONLINE in generation n holds all committable
 *   xacts of all generations < n.
 * See generations2.md and MtmGenerations.tla for details.
 *
 * A normal (xact-making) generation contains at least majority
 * members. However, we allow to elect a generation with fewer members as a sort
 * of mark that its members are recovered enough to be included in the
 * following normal generations. It allows nodes to always add *only myself* (but
 * remove anyone else) when campaigning for new generations; thus only the node
 * itself decides when it is recovered enough to force others to wait for it,
 * which simplifies reasoning about who should be the next gen members.
 *
 * Another reason for minority gens' existence is the usage of generations to
 * directly abort transactions when we know they can't ever be prepared; this
 * allows to participate in normal transaction resolution iff the node has the
 * PREPARE. For that to work, we must be sure the live connectivity clique forming
 * a majority eventually forms its generation regardless of the recovery process.
 * c.f. handle_1a for details.
 */
typedef struct MtmGeneration
{
	uint64		num;			/* logical clock aka term number aka ballot */
	uint64		members;		/* xxx extract nodemask.h and use it here */
	/*
	 * Generation has fixed set of configured nodes, which helps consistent
	 * xact resolving with dynamic add/rm of nodes.
	 */
	uint64		configured;		/* xxx extract nodemask.h and use it here */
} MtmGeneration;

#define MtmInvalidGenNum 0
#define EQUAL_GENS(g1, g2) \
	((g1).num == (g2).num && (g1).members == (g2).members && (g1).configured == (g2).configured)
/*
 * Referee is enabled only with 2 nodes and a single member gen is ever proposed
 * as the referee one (requiring referee vote and allowing this single node to
 * be online), so instead of a separate flag use this check.
 *
 * First condition is important as a single node cluster shouldn't access the
 * referee; also, with > 2 nodes there is at least a theoretical possibility of
 * electing a single-node generation after two consecutive minority gen
 * elections.
 */
#define IS_REFEREE_GEN(members, configured) \
	(popcount(configured) == 2 && popcount(members) == 1)

typedef enum
{
	MTM_GEN_DEAD,				/* can't ever be online in this gen */
	MTM_GEN_RECOVERY,			/* need to pull in recovery latest xacts before */
								/* starting making my own and receiving normally */
	MTM_GEN_ONLINE				/* participating normally */
} MtmStatusInGen;

typedef enum
{
	/*
	 * We were not excluded to the best of our knowledge, but we don't see all
	 * peers from current generation, so commits will likely fail.
	 */
	MTM_ISOLATED,

	/*
	 * We were excluded and definitely need recovery, but not yet sure from
	 * whom as we don't see majority.
	 */
	MTM_DISABLED,

	/*
	 * We are catching up, eating changes committed without us participating.
	 * Other nodes don't wait for us yet, so this doesn't freeze the cluster.
	 */
	MTM_CATCHUP,

	/*
	 * Generation with us was elected and others started waiting for us, but
	 * we need to eat the latest changes in recovery mode to participate
	 * normally.
	 */
	MTM_RECOVERY,

	/*
	 * It's Twelve O'clock and All's Well.
	 */
	MTM_ONLINE,
} MtmNodeStatus;

extern char const *const MtmNodeStatusMnem[];

extern void MtmStateInit(void);
extern void MtmStateShmemStartup(void);
extern void MtmStateStartup(void);

/* generation management */
extern uint64 MtmGetCurrentGenNum(void);
extern MtmGeneration MtmGetCurrentGen(bool locked);
extern void MtmConsiderGenSwitch(MtmGeneration gen, nodemask_t donors);
extern bool MtmHandleParallelSafe(MtmGeneration ps_gen, nodemask_t ps_donors,
								  bool is_recovery, XLogRecPtr end_lsn);
extern MtmStatusInGen MtmGetCurrentStatusInGen(void);
extern MtmStatusInGen MtmGetCurrentStatusInGenNotLocked(void);
extern MtmNodeStatus MtmGetCurrentStatus(bool gen_locked, bool vote_locked);

/* receiver bits */
extern void MtmReportReceiverCaughtup(int node_id);
/* we should recover, but are not yet sure from whom */
#define RECEIVE_MODE_DISABLED (~(uint32)0)
/* all receivers work normally */
#define RECEIVE_MODE_NORMAL 0
#define IS_RECEIVE_MODE_DONOR(rcv_mode) ((rcv_mode) != RECEIVE_MODE_NORMAL && \
										 ((rcv_mode) != RECEIVE_MODE_DISABLED))
extern MtmReplicationMode MtmGetReceiverMode(int nodeId);

/* connectivity */
extern nodemask_t MtmGetDmqReceiversMask(void);
extern nodemask_t MtmGetConnectedMask(bool locked);
extern nodemask_t MtmGetConnectedMaskWithMe(bool locked);
extern void *MtmOnDmqReceiverConnect(char *node_name);
extern void MtmOnDmqReceiverHeartbeat(char *node_name, StringInfo msg, void *extra);
extern void MtmOnDmqReceiverDisconnect(char *node_name);
extern void MtmOnDmqSenderConnect(char *node_name);
extern void MtmOnDmqSenderHeartbeat(char *node_name, StringInfo buf);
extern void MtmOnDmqSenderDisconnect(char *node_name);

extern void AcquirePBByPreparer(bool backend);
extern void AcquirePBByHolder(bool full);
extern void ReleasePB(void);

/* bgws */
extern void CampaignerMain(Datum main_arg);
extern void ReplierMain(Datum main_arg);
extern void MtmMonitor(Datum arg);
extern void MtmMonitorStart(Oid db_id, Oid user_id);

/* not cleaned up yet */
extern void MtmRefreshClusterStatus(void);
extern nodemask_t MtmGetDisabledNodeMask(void);
extern nodemask_t MtmGetEnabledNodeMask(bool ignore_disabled);
extern void CampaignerStop(void);

#endif

/*-------------------------------------------------------------------------
 *
 * syncpoint.h
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2021, Postgres Professional
 *
 *-------------------------------------------------------------------------
 */
#ifndef SYNCPOINT_H
#define SYNCPOINT_H

#include "access/xlogdefs.h"
#include "libpq-fe.h"
#include "utils/hsearch.h"
#include "replication/walsender.h"

/* pair of matching WAL positions: in the origin's WAL and in our local WAL */
typedef struct
{
	XLogRecPtr	origin_lsn;
	XLogRecPtr	local_lsn;
} Syncpoint;

/*
 * Used as a hashkey in recovery filter.
 *
 * NB: make sure to memset this structure to zeroes before using as hashkey
 * because it contains 4-byte padding hole in the middle.
 */
typedef struct
{
	int			node_id;
	XLogRecPtr	origin_lsn;
} FilterEntry;


extern int	MtmSyncpointInterval;


extern void MaybeLogSyncpoint(void);
extern void SyncpointRegister(int origin_node_id, XLogRecPtr origin_lsn,
							  XLogRecPtr receiver_lsn);
extern Syncpoint SyncpointGetLatest(int origin_node_id);
extern Syncpoint *SyncpointGetAllLatest(int sender_node_id);
extern XLogRecPtr GetRecoveryHorizon(int sender_node_id);
extern void UpdateRecoveryHorizons(void);
extern HTAB *RecoveryFilterLoad(int filter_node_id, Syncpoint *spvector, MtmConfig *mtm_cfg);

extern char *pg_lsn_out_c(XLogRecPtr lsn);

#endif							/* SYNCPOINT_H */

/*----------------------------------------------------------------------------
 *
 * mtm_utils.c
 *		Utility functions
 *
 * Copyright (c) 2022, Postgres Professional
 *
 *----------------------------------------------------------------------------
 */

#include "logger.h"
#include "mtm_utils.h"

#include "utils/timeout.h"

/*
 * Disables timeouts on a client side:
 * - statement_timeout;
 * - lock_timeout;
 * - idle_in_transaction_session_timeout;
 * - idle_session_timeout.
 *
 * These timeouts, when set in the postgres config file, affect all processes.
 * The multimaster needs its sessions not to be interrupted, so we disable
 * them.
 *
 * Returns false (after logging a WARNING) if any of the PQexec calls fails.
28 | */ 29 | static bool 30 | disable_client_timeouts(PGconn *conn) 31 | { 32 | PGresult *res; 33 | 34 | res = PQexec(conn, "SET statement_timeout = 0"); 35 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 36 | { 37 | mtm_log(WARNING, "failed to set statement_timeout: %s", 38 | pchomp(PQerrorMessage(conn))); 39 | return false; 40 | } 41 | 42 | res = PQexec(conn, "SET lock_timeout = 0"); 43 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 44 | { 45 | mtm_log(WARNING, "failed to set lock_timeout: %s", 46 | pchomp(PQerrorMessage(conn))); 47 | return false; 48 | } 49 | 50 | res = PQexec(conn, "SET idle_in_transaction_session_timeout = 0"); 51 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 52 | { 53 | mtm_log(WARNING, "failed to set idle_in_transaction_session_timeout: %s", 54 | pchomp(PQerrorMessage(conn))); 55 | return false; 56 | } 57 | 58 | res = PQexec(conn, "SET idle_session_timeout = 0"); 59 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 60 | { 61 | mtm_log(WARNING, "failed to set idle_session_timeout: %s", 62 | pchomp(PQerrorMessage(conn))); 63 | return false; 64 | } 65 | 66 | return true; 67 | } 68 | 69 | /* 70 | * Disable timeouts for a current process 71 | * - statement_timeout; 72 | * - lock_timeout; 73 | * - idle_in_transaction_session_timeout; 74 | * - idle_session_timeout. 
 *
 * We disable these timeouts for the same reason as in disable_client_timeouts().
 */
extern void
MtmDisableTimeouts(void)
{
	/* second argument 'false': do not keep the timeout indicator set */
	if (get_timeout_active(STATEMENT_TIMEOUT))
		disable_timeout(STATEMENT_TIMEOUT, false);
	if (get_timeout_active(LOCK_TIMEOUT))
		disable_timeout(LOCK_TIMEOUT, false);
	if (get_timeout_active(IDLE_IN_TRANSACTION_SESSION_TIMEOUT))
		disable_timeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT, false);
	if (get_timeout_active(IDLE_SESSION_TIMEOUT))
		disable_timeout(IDLE_SESSION_TIMEOUT, false);
}

/*
 * Wrapper on PQconnectPoll
 *
 * Once the connection reaches PGRES_POLLING_OK, disables the client-side
 * timeouts; a failure to disable them is reported as PGRES_POLLING_FAILED.
 */
PostgresPollingStatusType
MtmPQconnectPoll(PGconn *conn)
{
	PostgresPollingStatusType status;

	status = PQconnectPoll(conn);
	if (status != PGRES_POLLING_OK)
		return status;

	if (!disable_client_timeouts(conn))
		status = PGRES_POLLING_FAILED;

	return status;
}

/*
 * Wrapper on PQconnectdb
 *
 * On connect disables timeouts on a client side.
 *
 * NOTE(review): when timeout disabling fails the conn is PQfinish'ed and
 * NULL is returned, whereas plain PQconnectdb returns NULL only on OOM --
 * callers must check for NULL in addition to PQstatus(); confirm they do.
 */
PGconn *
MtmPQconnectdb(const char *conninfo)
{
	PGconn	   *conn;

	conn = PQconnectdb(conninfo);
	if (PQstatus(conn) != CONNECTION_OK)
		return conn;			/* caller inspects the failed connection */

	if (!disable_client_timeouts(conn))
	{
		PQfinish(conn);
		return NULL;
	}

	return conn;
}

/*-------------------------------------------------------------------------
 *
 * pglogical_hooks.c
 *
 * Portions Copyright (c) 2015-2021, Postgres Professional
 * Portions Copyright (c) 2015-2020, PostgreSQL Global Development Group
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/xact.h"

#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"

#include "replication/origin.h"

#include "parser/parse_func.h"

#include "utils/acl.h"
#include "utils/lsyscache.h"

#include "miscadmin.h"

#include "pglogical_hooks.h"
#include "pglogical_output.h"

#include "multimaster.h"
#include "logger.h"

/*
 * Returns Oid of the hooks function specified in funcname.
 *
 * Error is thrown if the function doesn't exist, doesn't return the correct
 * datatype, is VOLATILE, or the current user lacks EXECUTE permission on it.
 */
static Oid
get_hooks_function_oid(List *funcname)
{
	Oid			funcid;
	Oid			funcargtypes[1];

	/* hook setup functions take a single argument of type 'internal' */
	funcargtypes[0] = INTERNALOID;

	/* find the function; missing_ok=false, so this errors out when absent */
	funcid = LookupFuncName(funcname, 1, funcargtypes, false);

	/* Validate that the function returns void */
	if (get_func_rettype(funcid) != VOIDOID)
	{
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 MTM_ERRMSG("function %s must return void",
							NameListToString(funcname))));
	}

	if (func_volatile(funcid) == PROVOLATILE_VOLATILE)
	{
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 MTM_ERRMSG("function %s must not be VOLATILE",
							NameListToString(funcname))));
	}

	if (pg_proc_aclcheck(funcid, GetUserId(), ACL_EXECUTE) != ACLCHECK_OK)
	{
		const char *username;

		/* GetUserNameFromId() gained a noerr argument in 9.5 */
#if PG_VERSION_NUM >= 90500
		username = GetUserNameFromId(GetUserId(), false);
#else
		username = GetUserNameFromId(GetUserId());
#endif
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 MTM_ERRMSG("current user %s does not have permission to call function %s",
							username, NameListToString(funcname))));
	}

	return funcid;
}

/*
 * If a hook setup function was specified in the startup parameters, look it
 * up in the catalogs, check permissions, call it, and store the resulting
 * hook info struct.
 */
void
load_hooks(PGLogicalOutputData *data)
{
	Oid			hooks_func;
	MemoryContext old_ctxt;
	bool		txn_started = false;

	/* the catalog lookups below require a transaction */
	if (!IsTransactionState())
	{
		txn_started = true;
		StartTransactionCommand();
	}

	if (data->hooks_setup_funcname != NIL)
	{
		hooks_func = get_hooks_function_oid(data->hooks_setup_funcname);

		/* run the setup function in the long-lived hooks memory context */
		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(void) OidFunctionCall1(hooks_func, PointerGetDatum(&data->hooks));
		MemoryContextSwitchTo(old_ctxt);

		elog(DEBUG3, "pglogical_output: Loaded hooks from function %u. Hooks are: \n"
			 "\tstartup_hook: %p\n"
			 "\tshutdown_hook: %p\n"
			 "\trow_filter_hook: %p\n"
			 "\ttxn_filter_hook: %p\n"
			 "\thooks_private_data: %p\n",
			 hooks_func,
			 data->hooks.startup_hook,
			 data->hooks.shutdown_hook,
			 data->hooks.row_filter_hook,
			 data->hooks.txn_filter_hook,
			 data->hooks.hooks_private_data);
	}
	else if (data->api->setup_hooks)
	{
		/* no SQL-level setup function: fall back to the compiled-in one */
		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(*data->api->setup_hooks) (&data->hooks);
		MemoryContextSwitchTo(old_ctxt);
	}

	if (txn_started)
		CommitTransactionCommand();
}

/*
 * Invoke the client-supplied startup hook, if any.  The hook may replace its
 * private data pointer and hand back extra startup params via out_params.
 */
void
call_startup_hook(PGLogicalOutputData *data, List *plugin_params)
{
	struct PGLogicalStartupHookArgs args;
	MemoryContext old_ctxt;

	if (data->hooks.startup_hook != NULL)
	{
		bool		tx_started = false;

		args.private_data = data->hooks.hooks_private_data;
		args.in_params = plugin_params;
		args.out_params = NIL;

		elog(DEBUG3, "calling pglogical startup hook");

		/* the hook may perform catalog access, so ensure a transaction */
		if (!IsTransactionState())
		{
			tx_started = true;
			StartTransactionCommand();
		}

		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(void) (*data->hooks.startup_hook) (&args);
		MemoryContextSwitchTo(old_ctxt);

		if (tx_started)
			CommitTransactionCommand();

		data->extra_startup_params = args.out_params;
		/* The startup hook might change the private data seg */
		data->hooks.hooks_private_data = args.private_data;

		elog(DEBUG3, "called pglogical startup hook");
	}
}

/*
 * Invoke the client-supplied shutdown hook, if any.
 */
void
call_shutdown_hook(PGLogicalOutputData *data)
{
	struct PGLogicalShutdownHookArgs args;
	MemoryContext old_ctxt;

	if (data->hooks.shutdown_hook != NULL)
	{
		args.private_data = data->hooks.hooks_private_data;

		elog(DEBUG3, "calling pglogical shutdown hook");

		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(void) (*data->hooks.shutdown_hook) (&args);
		MemoryContextSwitchTo(old_ctxt);

		data->hooks.hooks_private_data = args.private_data;

		elog(DEBUG3, "called pglogical shutdown hook");
	}
}

/*
 * Decide if the individual change should be filtered out by
 * calling a client-provided hook.
195 | */ 196 | bool 197 | call_row_filter_hook(PGLogicalOutputData *data, ReorderBufferTXN *txn, 198 | Relation rel, ReorderBufferChange *change) 199 | { 200 | struct PGLogicalRowFilterArgs hook_args; 201 | MemoryContext old_ctxt; 202 | bool ret = true; 203 | 204 | if (data->hooks.row_filter_hook != NULL) 205 | { 206 | hook_args.change_type = change->action; 207 | hook_args.private_data = data->hooks.hooks_private_data; 208 | hook_args.changed_rel = rel; 209 | 210 | elog(DEBUG3, "calling pglogical row filter hook"); 211 | 212 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt); 213 | ret = (*data->hooks.row_filter_hook) (&hook_args); 214 | MemoryContextSwitchTo(old_ctxt); 215 | 216 | /* Filter hooks shouldn't change the private data ptr */ 217 | Assert(data->hooks.hooks_private_data == hook_args.private_data); 218 | 219 | elog(DEBUG3, "called pglogical row filter hook, returned %d", (int) ret); 220 | } 221 | 222 | return ret; 223 | } 224 | 225 | bool 226 | call_txn_filter_hook(PGLogicalOutputData *data, RepOriginId txn_origin) 227 | { 228 | struct PGLogicalTxnFilterArgs hook_args; 229 | bool ret = true; 230 | MemoryContext old_ctxt; 231 | 232 | if (data->hooks.txn_filter_hook != NULL) 233 | { 234 | hook_args.private_data = data->hooks.hooks_private_data; 235 | hook_args.origin_id = txn_origin; 236 | 237 | elog(DEBUG3, "calling pglogical txn filter hook"); 238 | 239 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt); 240 | ret = (*data->hooks.txn_filter_hook) (&hook_args); 241 | MemoryContextSwitchTo(old_ctxt); 242 | 243 | /* Filter hooks shouldn't change the private data ptr */ 244 | Assert(data->hooks.hooks_private_data == hook_args.private_data); 245 | 246 | elog(DEBUG3, "called pglogical txn filter hook, returned %d", (int) ret); 247 | } 248 | 249 | return ret; 250 | } 251 | -------------------------------------------------------------------------------- /src/pglogical_relid_map.c: 
-------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pglogical_relid_map.c 4 | * Logical Replication map of local Oids to to remote 5 | * 6 | * Portions Copyright (c) 2015-2021, Postgres Professional 7 | * Portions Copyright (c) 2015-2020, PostgreSQL Global Development Group 8 | * 9 | * 10 | * IDENTIFICATION 11 | * pglogical_relid_map.c 12 | * 13 | *------------------------------------------------------------------------- 14 | */ 15 | #include "postgres.h" 16 | #include "utils/hsearch.h" 17 | #include "pglogical_relid_map.h" 18 | 19 | static HTAB *relid_map; 20 | 21 | static void 22 | pglogical_relid_map_init(void) 23 | { 24 | HASHCTL ctl; 25 | 26 | Assert(relid_map == NULL); 27 | 28 | MemSet(&ctl, 0, sizeof(ctl)); 29 | ctl.keysize = sizeof(Oid); 30 | ctl.entrysize = sizeof(PGLRelidMapEntry); 31 | relid_map = hash_create("pglogical_relid_map", PGL_INIT_RELID_MAP_SIZE, &ctl, HASH_ELEM | HASH_BLOBS); 32 | 33 | Assert(relid_map != NULL); 34 | } 35 | 36 | Oid 37 | pglogical_relid_map_get(Oid relid) 38 | { 39 | if (relid_map != NULL) 40 | { 41 | PGLRelidMapEntry *entry = (PGLRelidMapEntry *) hash_search(relid_map, &relid, HASH_FIND, NULL); 42 | 43 | return entry ? 
entry->local_relid : InvalidOid; 44 | } 45 | return InvalidOid; 46 | } 47 | 48 | bool 49 | pglogical_relid_map_put(Oid remote_relid, Oid local_relid) 50 | { 51 | bool found; 52 | PGLRelidMapEntry *entry; 53 | 54 | if (relid_map == NULL) 55 | { 56 | pglogical_relid_map_init(); 57 | } 58 | entry = hash_search(relid_map, &remote_relid, HASH_ENTER, &found); 59 | if (found) 60 | { 61 | entry->local_relid = local_relid; 62 | return false; 63 | } 64 | entry->local_relid = local_relid; 65 | return true; 66 | } 67 | 68 | void 69 | pglogical_relid_map_reset(void) 70 | { 71 | if (relid_map != NULL) 72 | { 73 | hash_destroy(relid_map); 74 | relid_map = NULL; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/spill.c: -------------------------------------------------------------------------------- 1 | /*----------------------------------------------------------------------------- 2 | * spill.c 3 | * 4 | * Copyright (c) 2017-2021, Postgres Professional 5 | * 6 | *----------------------------------------------------------------------------- 7 | */ 8 | #include "postgres.h" 9 | 10 | #include 11 | #include 12 | #include "storage/fd.h" 13 | #include "spill.h" 14 | #include "pgstat.h" 15 | 16 | #include "multimaster.h" 17 | #include "logger.h" 18 | 19 | void 20 | MtmSpillToFile(int fd, char const *data, size_t size) 21 | { 22 | Assert(fd >= 0); 23 | while (size != 0) 24 | { 25 | int written = write(fd, data, size); 26 | 27 | if (written <= 0) 28 | { 29 | close(fd); 30 | ereport(ERROR, 31 | (errcode_for_file_access(), 32 | MTM_ERRMSG("pglogical_recevier failed to spill transaction to file: %m"))); 33 | } 34 | data += written; 35 | size -= written; 36 | } 37 | } 38 | 39 | void 40 | MtmCreateSpillDirectory(int node_id) 41 | { 42 | char path[MAXPGPATH]; 43 | struct dirent *spill_de; 44 | DIR *spill_dir; 45 | 46 | mkdir("pg_mtm", S_IRWXU); 47 | sprintf(path, "pg_mtm/%d", node_id); 48 | mkdir(path, S_IRWXU); 49 | 50 | spill_dir = 
AllocateDir(path); 51 | if (spill_dir == NULL) 52 | { 53 | ereport(PANIC, 54 | (errcode_for_file_access(), 55 | MTM_ERRMSG("pglogical_receiver failed to create spill directory \"%s\": %m", 56 | path))); 57 | } 58 | /* cleanup old files in case of previous crash */ 59 | while ((spill_de = ReadDir(spill_dir, path)) != NULL) 60 | { 61 | if (strncmp(spill_de->d_name, "txn", 3) == 0) 62 | { 63 | sprintf(path, "pg_mtm/%d/%s", node_id, spill_de->d_name); 64 | 65 | if (unlink(path) != 0) 66 | ereport(PANIC, 67 | (errcode_for_file_access(), 68 | MTM_ERRMSG("pglogical_receiver could not remove spill file \"%s\": %m", 69 | path))); 70 | } 71 | } 72 | FreeDir(spill_dir); 73 | } 74 | 75 | 76 | int 77 | MtmCreateSpillFile(int node_id, int *file_id) 78 | { 79 | static int spill_file_id; 80 | char path[MAXPGPATH]; 81 | int fd; 82 | 83 | sprintf(path, "pg_mtm/%d/txn-%d.snap", 84 | node_id, ++spill_file_id); 85 | fd = BasicOpenFile(path, 86 | O_CREAT | O_TRUNC | O_WRONLY | O_APPEND | PG_BINARY); 87 | if (fd < 0) 88 | { 89 | ereport(PANIC, 90 | (errcode_for_file_access(), 91 | MTM_ERRMSG("pglogical_receiver could not create spill file \"%s\": %m", 92 | path))); 93 | } 94 | *file_id = spill_file_id; 95 | return fd; 96 | } 97 | 98 | int 99 | MtmOpenSpillFile(int node_id, int file_id) 100 | { 101 | static char path[MAXPGPATH]; 102 | int fd; 103 | 104 | sprintf(path, "pg_mtm/%d/txn-%d.snap", 105 | node_id, file_id); 106 | fd = OpenTransientFile(path, 107 | O_RDONLY | PG_BINARY); 108 | if (fd < 0) 109 | { 110 | ereport(PANIC, 111 | (errcode_for_file_access(), 112 | MTM_ERRMSG("pglogical_apply could not open spill file \"%s\": %m", 113 | path))); 114 | } 115 | if (unlink(path) < 0) 116 | { /* Should remove file on close */ 117 | ereport(LOG, 118 | (errcode_for_file_access(), 119 | MTM_ERRMSG("pglogical_apply failed to unlink spill file: %m"))); 120 | } 121 | return fd; 122 | } 123 | 124 | void 125 | MtmReadSpillFile(int fd, char *data, size_t size) 126 | { 127 | Assert(fd >= 0); 128 | 
while (size != 0) 129 | { 130 | int rc = read(fd, data, size); 131 | 132 | if (rc <= 0) 133 | { 134 | CloseTransientFile(fd); 135 | ereport(ERROR, 136 | (errcode_for_file_access(), 137 | MTM_ERRMSG("pglogical_apply failed to read spill file: %m"))); 138 | } 139 | data += rc; 140 | size -= rc; 141 | } 142 | } 143 | 144 | void 145 | MtmCloseSpillFile(int fd) 146 | { 147 | if (close(fd) < 0) 148 | ereport(ERROR, 149 | (errcode_for_file_access(), 150 | MTM_ERRMSG("pglogical_recevier failed to close spill file: %m"))); 151 | } 152 | -------------------------------------------------------------------------------- /src/test_bkb.sage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sage 2 | import sys, os 3 | 4 | from sage.all import * 5 | from subprocess import Popen, PIPE, STDOUT 6 | from random import randrange, randint 7 | import unittest 8 | 9 | def run_stdin(input): 10 | mydir = os.path.dirname(os.path.realpath(__file__)) 11 | binfile = mydir + "/../src/a.out" 12 | 13 | p = Popen(binfile, stdout=PIPE, stdin=PIPE, stderr=STDOUT) 14 | grep_stdout = p.communicate(input=input)[0] 15 | return grep_stdout.decode() 16 | 17 | def run_bkb(g): 18 | n = len(g) 19 | params = str(n) + "\n" 20 | for i in range(n): 21 | row = 0 22 | row |= 1 << i 23 | for j in range(n): 24 | if g.has_edge(i, j): 25 | row |= 1 << j 26 | params += str(row) + "\n" 27 | 28 | # print(params) 29 | res = run_stdin(params).strip() 30 | res = [int(n) for n in res.split(' ')] 31 | return res 32 | 33 | 34 | class TestCliqueBKB(unittest.TestCase): 35 | 36 | # test only that max clique size is ok 37 | def test_random_graphs_size(self): 38 | 39 | for _ in range(1000): 40 | n_nodes = randint(1, 60) 41 | n_edges = randrange(1 + (n_nodes * (n_nodes - 1) / 2)) 42 | print("graph |V|={}, |E|={}>".format(n_nodes, n_edges)) 43 | g = graphs.RandomGNM(n_nodes, n_edges) 44 | 45 | clique, clique_size = run_bkb(g) 46 | clique_members = [] 47 | for i in range(n_nodes): 48 
| if (clique & (1 << i)) != 0: 49 | clique_members.append(i) 50 | 51 | sage_clique_maximum = g.clique_maximum() 52 | 53 | print(clique, clique_members, clique_size, sage_clique_maximum, len(sage_clique_maximum)) 54 | self.assertEqual(clique_size, len(sage_clique_maximum)) 55 | 56 | # test that found graph is indeed the clique, much more expensive 57 | def test_random_graphs(self): 58 | 59 | for _ in range(1000): 60 | n_nodes = randint(1, 30) 61 | n_edges = randrange(1 + (n_nodes * (n_nodes - 1) / 2)) 62 | print("graph |V|={}, |E|={}>".format(n_nodes, n_edges)) 63 | g = graphs.RandomGNM(n_nodes, n_edges) 64 | 65 | clique, clique_size = run_bkb(g) 66 | clique_members = [] 67 | for i in range(n_nodes): 68 | if (clique & (1 << i)) != 0: 69 | clique_members.append(i) 70 | 71 | sage_maxcliques = g.cliques_maximal() 72 | print(sage_maxcliques[0]) 73 | 74 | found = False 75 | for sc in sage_maxcliques: 76 | if sc == clique_members: 77 | found = True 78 | self.assertTrue(found) 79 | 80 | print(clique, clique_members, clique_size, sage_maxcliques[0], len(sage_maxcliques[0])) 81 | 82 | 83 | 84 | if __name__ == '__main__': 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /t/000_cross._pl: -------------------------------------------------------------------------------- 1 | # based on 2 | # "Distributed snapshot isolation: global transactions pay globally, 3 | # local transactions pay locally" 4 | # by Binnig et al cross-phenomenon. 
5 | 6 | use strict; 7 | use warnings; 8 | 9 | use Cluster; 10 | use TestLib; 11 | use Test::More tests => 2; 12 | use IPC::Run qw(start finish); 13 | use Cwd; 14 | 15 | my $nnodes = 2; 16 | my $nclients = 2; 17 | my $nkeys = $nnodes * $nclients; 18 | my $cluster = new Cluster($nnodes); 19 | 20 | $cluster->init(); 21 | $cluster->configure(); 22 | $cluster->start(); 23 | 24 | my ($rc, $in, $out, $err); 25 | 26 | $cluster->await_nodes( [0,1] ); 27 | 28 | note("preparing the tables"); 29 | if ($cluster->psql(0, 'postgres', "create table t (k int primary key, v int)")) 30 | { 31 | $cluster->bail_out_with_logs('failed to create t'); 32 | } 33 | 34 | if ($cluster->psql(0, 'postgres', "insert into t (select generate_series(0, $nkeys - 1), 0)")) 35 | { 36 | $cluster->bail_out_with_logs('failed to fill t'); 37 | } 38 | 39 | sub appender 40 | { 41 | my ($appender_id, $clients, $seconds, $node, $inref, $outref) = @_; 42 | 43 | my @argv = ( 44 | 'pgbench', 45 | '-n', 46 | -c => $clients, 47 | -j => $clients, 48 | -T => $seconds, 49 | -h => $node->host(), 50 | -p => $node->port(), 51 | -D => "appender_id=$appender_id", 52 | -D => "clients=$clients", 53 | -f => 'tests/appender.pgb', 54 | 'postgres', 55 | ); 56 | 57 | note("running[" . getcwd() . "]: " . join(' ', @argv)); 58 | 59 | return start(\@argv, $inref, $outref); 60 | } 61 | 62 | sub state_dump 63 | { 64 | my $state = shift; 65 | 66 | note("<<<<<"); 67 | while (my ($key, $value) = each(%{$state})) 68 | { 69 | note("$key -> $value"); 70 | } 71 | note(">>>>>"); 72 | } 73 | 74 | sub state_leq 75 | { 76 | my ($a, $b) = @_; 77 | 78 | while (my ($key, $value) = each(%{$a})) 79 | { 80 | if (!exists($b->{$key})) 81 | { 82 | note("b has no key $key\n"); 83 | return 0; 84 | } 85 | 86 | if ($b->{$key} < $value) 87 | { 88 | note($b->{$key} . 
" < $value\n"); 89 | return 0; 90 | } 91 | } 92 | 93 | return 1; 94 | } 95 | 96 | sub parse_state 97 | { 98 | my $str = shift; 99 | my $state = {}; 100 | 101 | while ($str =~ /(\d+)\|(\d+)/g) 102 | { 103 | $state->{$1} = $2; 104 | } 105 | 106 | return $state; 107 | } 108 | 109 | note("starting appenders"); 110 | note("starting benches"); 111 | $in = ''; 112 | $out = ''; 113 | my @appenders = (); 114 | my $appender_id = 0; 115 | my $seconds = 30; 116 | foreach my $node (@{$cluster->{nodes}}) 117 | { 118 | push(@appenders, appender($appender_id, $nclients, $seconds, $node, \$in, \$out)); 119 | $appender_id++; 120 | } 121 | 122 | my $selects = 0; 123 | my $anomalies = 0; 124 | my $started = time(); 125 | my $node_id = 0; 126 | my $state_a = undef; 127 | my $state_b = undef; 128 | my $out_a = ''; 129 | my $out_b = ''; 130 | while (time() - $started < $seconds) 131 | { 132 | $node_id = ($node_id + 1) % $nnodes; 133 | $state_a = $state_b; 134 | $out_a = $out_b; 135 | ($rc, $out, $err) = $cluster->psql($node_id, 'postgres', "select * from t;"); 136 | $selects++; 137 | $state_b = parse_state($out); 138 | $out_b = $out; 139 | if (defined $state_a) 140 | { 141 | if (!state_leq($state_a, $state_b) && !state_leq($state_a, $state_b)) 142 | { 143 | note("cross anomaly detected:\n===a\n$out_a\n+++b\n$out_b\n---\n"); 144 | $anomalies++; 145 | } 146 | } 147 | } 148 | 149 | note("finishing benches"); 150 | foreach my $appender (@appenders) 151 | { 152 | if (!finish($appender)) 153 | { 154 | $cluster->dumplogs(); 155 | $cluster->bail_out_with_logs("pgbench exited with $?"); 156 | } 157 | } 158 | 159 | is($anomalies, 0, "no cross anomalies after $selects selects"); 160 | 161 | ok($cluster->stop('fast'), "cluster stops"); 162 | 1; 163 | -------------------------------------------------------------------------------- /t/000_deadlock.pl: -------------------------------------------------------------------------------- 1 | # simple deadlock test 2 | 3 | use strict; 4 | use warnings; 5 | 6 
| use Cluster; 7 | use TestLib; 8 | 9 | # Test whether we have both DBI and DBD::pg 10 | my $dbdpg_rc = eval 11 | { 12 | require DBI; 13 | require DBD::Pg; 14 | DBD::Pg->import(':async'); 15 | 1; 16 | }; 17 | 18 | # And tell Test::More to skip the test entirely if not 19 | require Test::More; 20 | if (not $dbdpg_rc) 21 | { 22 | Test::More->import(skip_all => 'DBI and DBD::Pg are not available'); 23 | } 24 | else 25 | { 26 | Test::More->import(tests => 1); 27 | } 28 | 29 | sub query_row 30 | { 31 | my ($dbi, $sql, @keys) = @_; 32 | my $sth = $dbi->prepare($sql) || die; 33 | $sth->execute(@keys) || die; 34 | my $ret = $sth->fetchrow_array || undef; 35 | return $ret; 36 | } 37 | 38 | sub query_exec 39 | { 40 | my ($dbi, $sql) = @_; 41 | my $rv = $dbi->do($sql) || die; 42 | return $rv; 43 | } 44 | 45 | sub query_exec_async 46 | { 47 | my ($dbi, $sql) = @_; 48 | # Since we are not importing DBD::Pg at compilation time, we can't use 49 | # constants from it. 50 | my $DBD_PG_PG_ASYNC = 1; 51 | my $rv = $dbi->do($sql, {pg_async => $DBD_PG_PG_ASYNC}) || die; 52 | return $rv; 53 | } 54 | 55 | my $cluster = new Cluster(2); 56 | 57 | $cluster->init(); 58 | $cluster->start(); 59 | $cluster->create_mm('regression'); 60 | 61 | my ($rc, $out, $err); 62 | sleep(10); 63 | 64 | $cluster->safe_psql(0, "create table t(k int primary key, v text)"); 65 | $cluster->safe_psql(0, "insert into t values (1, 'hello'), (2, 'world')"); 66 | 67 | my @conns = map { DBI->connect('DBI:Pg:' . 
$cluster->connstr($_)) } 0..1; 68 | 69 | query_exec($conns[0], "begin"); 70 | query_exec($conns[1], "begin"); 71 | 72 | query_exec($conns[0], "update t set v = 'asd' where k = 1"); 73 | query_exec($conns[1], "update t set v = 'bsd'"); 74 | 75 | query_exec($conns[0], "update t set v = 'bar' where k = 2"); 76 | query_exec($conns[1], "update t set v = 'foo'"); 77 | 78 | query_exec_async($conns[0], "commit"); 79 | query_exec_async($conns[1], "commit"); 80 | 81 | my $timeout = 16; 82 | while (--$timeout > 0) 83 | { 84 | my $r0 = $conns[0]->pg_ready(); 85 | my $r1 = $conns[1]->pg_ready(); 86 | if ($r0 && $r1) { 87 | last; 88 | } 89 | sleep(1); 90 | } 91 | 92 | if ($timeout > 0) 93 | { 94 | my $succeeded = 0; 95 | $succeeded++ if $conns[0]->pg_result(); 96 | $succeeded++ if $conns[1]->pg_result(); 97 | 98 | pass("queries finished"); 99 | } 100 | else 101 | { 102 | $conns[0]->pg_cancel() unless $conns[0]->pg_ready(); 103 | $conns[1]->pg_cancel() unless $conns[1]->pg_ready(); 104 | 105 | fail("queries timed out"); 106 | } 107 | 108 | query_row($conns[0], "select * from t where k = 1"); 109 | 110 | $cluster->stop('fast'); 111 | -------------------------------------------------------------------------------- /t/000_init._pl: -------------------------------------------------------------------------------- 1 | # test that after create_mm awaited nodes we won't get non-online state 2 | # immediately later. Catches races in MtmGetCurrentStatus logic. 3 | # It is expensive, so not run in the regular suite. 
4 | 5 | use Cluster; 6 | use Test::More tests => 1; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(q{ 10 | }); 11 | $cluster->start(); 12 | $cluster->create_mm('regression'); 13 | 14 | foreach(0..1000) # hopefully enough to catch all related races 15 | { 16 | foreach (0..2) 17 | { 18 | $cluster->safe_psql($_, "select 42"); 19 | } 20 | } 21 | 22 | is(0, 0, "dummy"); # Test::More doesn't like 0 tests, ha 23 | -------------------------------------------------------------------------------- /t/001_regress.pl: -------------------------------------------------------------------------------- 1 | # run core regression tests on multimaster 2 | 3 | # tests known to fail currently and failure reasons: 4 | # - create_index (CREATE INDEX CONCURRENTLY not supported due to deadlock 5 | # issues, see ddl.c) 6 | # - same for index_including, index_including_gist 7 | # - create_table (due to CTAS prepared statement) 8 | # - sanity check (due to pg_publication/subscription masking and other mtm tables) 9 | # - transactions (lack of COMMIT AND CHAIN support) 10 | # - rowsecurity 11 | # - atx, atx5 12 | # - rules (_pg_prepared_xacts and similar) 13 | # - publication, subscription (_pg_publication/subscription masking) 14 | # - prepare (CTAS prepared statement) 15 | # - indexing (again CIC). 16 | # 17 | # original test output/diffs are at $ENV{TESTDIR}/tmp_check/regress_outdir; 18 | # (in normal build TESTDIR is just mmts/; in vpath it is 'external' mmts/) 19 | # then diff is censored and copied to $ENV{TESTDIR}/results. 20 | 21 | use Cluster; 22 | use File::Basename; 23 | use IPC::Run 'run'; 24 | use Test::More; 25 | 26 | # With PGXS the sources are unavailable, so we can't obtain schedules and core 27 | # test themselves. 
28 | if ($ENV{'PGXS'}) 29 | { 30 | # Test::More doesn't like no tests at all 31 | is(0, 0, "dummy"); 32 | done_testing(); 33 | exit(0); 34 | } 35 | 36 | # determenistic ports for expected files 37 | $PostgresNode::last_port_assigned = 55431; 38 | 39 | my $cluster = new Cluster(3); 40 | $cluster->init(q{ 41 | multimaster.volkswagen_mode = on 42 | # allow to spoof pg_prepared_xacts view 43 | allow_system_table_mods = on 44 | }); 45 | $cluster->start(); 46 | $cluster->create_mm('regression'); 47 | 48 | ############################################################################### 49 | # postgres regression tests 50 | ############################################################################### 51 | 52 | # configure db output format like pg_regress 53 | # In particular, pg_regress explicitly sets PGTZ=PST8PDT, and it turns out some 54 | # tests (including DDL! (see volatile_partbound_test)) depend on current_time, 55 | # so mtm receiver ought to use the same timezone to pass them. 56 | $cluster->{nodes}->[0]->safe_psql('regression', q{ 57 | ALTER DATABASE "regression" SET lc_messages TO 'C'; 58 | ALTER DATABASE "regression" SET lc_monetary TO 'C'; 59 | ALTER DATABASE "regression" SET lc_numeric TO 'C'; 60 | ALTER DATABASE "regression" SET lc_time TO 'C'; 61 | ALTER DATABASE "regression" SET timezone_abbreviations TO 'Default'; 62 | ALTER DATABASE "regression" SET TimeZone TO 'PST8PDT'; 63 | }); 64 | 65 | # do not show transaction from concurrent backends in pg_prepared_xacts 66 | $cluster->{nodes}->[0]->safe_psql('regression', q{ 67 | ALTER VIEW pg_prepared_xacts RENAME TO _pg_prepared_xacts; 68 | CREATE VIEW pg_prepared_xacts AS 69 | select * from _pg_prepared_xacts where gid not like 'MTM-%' 70 | ORDER BY transaction::text::bigint; 71 | ALTER TABLE pg_publication RENAME TO _pg_publication; 72 | CREATE VIEW pg_catalog.pg_publication AS SELECT * FROM pg_catalog._pg_publication WHERE pubname<>'multimaster'; 73 | ALTER TABLE pg_subscription RENAME TO _pg_subscription; 
74 | CREATE VIEW pg_catalog.pg_subscription AS SELECT * FROM pg_catalog._pg_subscription WHERE subname NOT LIKE 'mtm_sub_%'; 75 | }); 76 | 77 | $cluster->{nodes}->[0]->safe_psql('regression', q{ 78 | ALTER SYSTEM SET allow_system_table_mods = 'off'; 79 | }); 80 | foreach my $node (@{$cluster->{nodes}}){ 81 | $node->restart; 82 | } 83 | $cluster->await_nodes( [0,1,2] ); 84 | 85 | # load schedule without tablespace test which is not expected 86 | # to work with several postgreses on a single node 87 | my $schedule = TestLib::slurp_file('../../src/test/regress/parallel_schedule'); 88 | $schedule =~ s/test: tablespace/#test: tablespace/g; 89 | $schedule =~ s/test: cfs/#test: cfs/g; 90 | $schedule =~ s/test: largeobject//; # serial schedule 91 | $schedule =~ s/largeobject//; # parallel schedule 92 | $schedule =~ s/atx0//; # parallel schedule 93 | unlink('parallel_schedule'); 94 | TestLib::append_to_file('parallel_schedule', $schedule); 95 | 96 | my $regress_shlib = $ENV{REGRESS_SHLIB}; 97 | my $regress_libdir = dirname($regress_shlib); 98 | my $regress_outdir = "$ENV{TESTDIR}/tmp_check/regress_outdir"; 99 | mkdir($regress_outdir); 100 | # REMOVEME: not needed in 14+, pg_regress fixed in upstream 101 | mkdir("${regress_outdir}/sql"); 102 | mkdir("${regress_outdir}/expected"); 103 | TestLib::system_log($ENV{'PG_REGRESS'}, 104 | '--host=' . $cluster->{nodes}->[0]->host, '--port=' . $cluster->{nodes}->[0]->port, 105 | '--use-existing', '--bindir=', 106 | '--schedule=parallel_schedule', 107 | "--dlpath=${regress_libdir}", 108 | '--inputdir=../../src/test/regress', 109 | "--outputdir=${regress_outdir}"); 110 | unlink('parallel_schedule'); 111 | 112 | # rename s/diffs/diff as some upper level testing systems are searching for all 113 | # *.diffs files. 
114 | rename "${regress_outdir}/regression.diffs", "${regress_outdir}/regression.diff" 115 | or die "cannot rename file: $!"; 116 | 117 | # strip absolute paths and dates out of resulted regression.diffs 118 | my $res_diff = TestLib::slurp_file("${regress_outdir}/regression.diff"); 119 | # In <= 11 default diff format was context, since 12 unified; handing lines 120 | # starting with ---|+++|*** covers both. 121 | # To make someone's life easier, we prepend .. to make relative paths correct. 122 | # (it allows goto file comparison in editors) 123 | # This of course unfortunately doesn't work for VPATH. 124 | $res_diff =~ s/(--- |\+\+\+ |\*\*\* ).+contrib\/mmts(.+\.out)\t.+\n/$1..$2\tCENSORED\n/g; 125 | # Since 12 header like 126 | # diff -U3 /blabla/contrib/mmts/../../src/test/regress/expected/opr_sanity.out /blabla/mmts/../../src/test/regress/results/opr_sanity.out 127 | # was added to each file diff 128 | $res_diff =~ s/(diff ).+contrib\/mmts(.+\.out).+contrib\/mmts(.+\.out\n)/$1..$2 ..$3/g; 129 | $res_diff =~ s/(lo_import[ \(]')\/[^']+\//$1\/CENSORED\//g; 130 | #SELECT lo_export(loid, '/home/alex/projects/ppro/postgrespro/contrib/mmts/../../src/test/regress/results/lotest.txt') FROM lotest_stash_values; 131 | $res_diff =~ s/(lo_export.*\'\/).+\//$1CENSORED\//g; 132 | mkdir("$ENV{TESTDIR}/results"); 133 | unlink("$ENV{TESTDIR}/results/regression.diff"); 134 | 135 | # finally compare regression.diffs with our version 136 | # Do not use diffs extension as some upper level testing systems are searching for all 137 | # *.diffs files. 
138 | TestLib::append_to_file("$ENV{TESTDIR}/results/regression.diff", $res_diff); 139 | # TODO: work with diffs on per-test basis 140 | my $expected_file; 141 | if (Cluster::is_ee()) 142 | { 143 | $expected_file = "expected/regression_ee.diff" 144 | } 145 | else 146 | { 147 | $expected_file = "expected/regression_vanilla.diff" 148 | } 149 | $diff = TestLib::system_log("diff -U3 ${expected_file} $ENV{TESTDIR}/results/regression.diff"); 150 | run [ "diff", "-U3", "${expected_file}", "$ENV{TESTDIR}/results/regression.diff" ], ">", "$ENV{TESTDIR}/regression.diff.diff"; 151 | my $res = $?; 152 | 153 | is($res, 0, "postgres regress"); 154 | 155 | done_testing(); 156 | -------------------------------------------------------------------------------- /t/002_regressmm.pl: -------------------------------------------------------------------------------- 1 | # run sql/multimaster.sql tests 2 | use Cluster; 3 | use Test::More tests => 1; 4 | 5 | # determenistic ports for expected files 6 | $PostgresNode::last_port_assigned = 55431; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(q{ 10 | multimaster.volkswagen_mode = off 11 | }); 12 | $cluster->start(); 13 | $cluster->create_mm('regression'); 14 | 15 | ############################################################################### 16 | # multimaster regression tests 17 | ############################################################################### 18 | 19 | my @tests = ('multimaster'); 20 | # run atx test only on ee 21 | if (Cluster::is_ee()) 22 | { 23 | push @tests, 'atx'; 24 | } 25 | 26 | my $ret = TestLib::system_log($ENV{'PG_REGRESS'}, 27 | '--host=' . $cluster->{nodes}->[0]->host, '--port=' . $cluster->{nodes}->[0]->port, 28 | '--use-existing', '--bindir=', @tests); 29 | if ($ret != 0) 30 | { 31 | print "### Got regression! 
\n", TestLib::slurp_file('regression.diffs'); 32 | } 33 | is($ret, 0, "multimaster regress"); 34 | -------------------------------------------------------------------------------- /t/003_basic_recovery.pl: -------------------------------------------------------------------------------- 1 | # Basic recovery: some inserts, get node down, some inserts, get node up, some 2 | # inserts. There is no failures with concurrent load, so an easy variant. 3 | 4 | use strict; 5 | use warnings; 6 | use Cluster; 7 | use TestLib; 8 | use Test::More tests => 4; 9 | 10 | my $cluster = new Cluster(3); 11 | $cluster->init(); 12 | $cluster->start(); 13 | $cluster->create_mm(); 14 | 15 | my $ret; 16 | my $psql_out; 17 | 18 | ############################################################################### 19 | # Replication check 20 | ############################################################################### 21 | 22 | $cluster->{nodes}->[0]->safe_psql('postgres', q{ 23 | create table if not exists t(k int primary key, v int); 24 | insert into t values(1, 10); 25 | }); 26 | $psql_out = $cluster->{nodes}->[2]->safe_psql('postgres', q{ 27 | select v from t where k=1; 28 | }); 29 | is($psql_out, '10', "Check replication while all nodes are up."); 30 | 31 | ############################################################################### 32 | # Isolation regress checks 33 | ############################################################################### 34 | 35 | # we can call pg_regress here 36 | 37 | ############################################################################### 38 | # Work after node stop 39 | ############################################################################### 40 | 41 | note("stopping node 2"); 42 | $cluster->{nodes}->[2]->stop; 43 | 44 | $cluster->await_nodes_after_stop( [0,1] ); 45 | 46 | $cluster->safe_psql(0, "insert into t values(2, 20);"); 47 | $cluster->safe_psql(1, "insert into t values(3, 30);"); 48 | $cluster->safe_psql(0, "insert into t values(4, 
40);"); 49 | $cluster->safe_psql(1, "insert into t values(5, 50);"); 50 | 51 | $psql_out = $cluster->safe_psql(0, "select v from t where k=4;"); 52 | is($psql_out, '40', "Check replication after node failure."); 53 | 54 | ############################################################################### 55 | # Work after node start 56 | ############################################################################### 57 | 58 | note("starting node 2"); 59 | $cluster->{nodes}->[2]->start; 60 | 61 | # intentionally start from 2 62 | $cluster->await_nodes( [2,0,1] ); 63 | 64 | $cluster->safe_psql(0, "insert into t values(6, 60);"); 65 | $cluster->safe_psql(1, "insert into t values(7, 70);"); 66 | $cluster->safe_psql(0, "insert into t values(8, 80);"); 67 | $cluster->safe_psql(1, "insert into t values(9, 90);"); 68 | 69 | $psql_out = $cluster->safe_psql(2, "select v from t where k=8;"); 70 | is($psql_out, '80', "Check replication after failed node recovery."); 71 | 72 | $psql_out = $cluster->safe_psql(2, "select v from t where k=5;"); 73 | is($psql_out, '50', "Check replication after failed node recovery."); 74 | 75 | $cluster->stop(); 76 | 77 | 1; 78 | -------------------------------------------------------------------------------- /t/004_recovery.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Cluster; 5 | use TestLib; 6 | use Test::More tests => 6; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(); 10 | $cluster->start(); 11 | $cluster->create_mm(); 12 | 13 | ######################################################## 14 | # Check data integrity before and after recovery of single node. 15 | # Easy variant: sequential pgbenches, recovery without concurrent load. 
16 | ######################################################## 17 | 18 | my $hash0; my $hash1; my $hash2; my $oldhash; 19 | my $hash_query = q{ 20 | select 21 | md5('(' || string_agg(aid::text || ', ' || abalance::text , '),(') || ')') 22 | from 23 | (select * from pgbench_accounts order by aid) t; 24 | }; 25 | 26 | $cluster->pgbench(1, ('-i', -s => '10') ); 27 | $cluster->pgbench(0, ('-n','-N', -T => '4') ); 28 | $cluster->pgbench(1, ('-n','-N', -T => '4') ); 29 | $cluster->pgbench(2, ('-n','-N', -T => '4') ); 30 | 31 | $cluster->{nodes}->[2]->stop('fast'); 32 | $cluster->await_nodes_after_stop( [0,1] ); 33 | 34 | $cluster->pgbench(0, ('-n','-N', -T => '4') ); 35 | $cluster->pgbench(1, ('-n','-N', -T => '4') ); 36 | 37 | $cluster->await_nodes( [0,1] ); # just in case we've faced random timeout before 38 | $hash0 = $cluster->safe_psql(0, $hash_query); 39 | $hash1 = $cluster->safe_psql(1, $hash_query); 40 | is($hash0, $hash1, "Check that hash is the same before recovery"); 41 | 42 | $cluster->{nodes}->[2]->start; 43 | $cluster->await_nodes( [2,0,1] ); 44 | 45 | $oldhash = $hash0; 46 | $hash0 = $cluster->safe_psql(0, $hash_query); 47 | $hash1 = $cluster->safe_psql(1, $hash_query); 48 | $hash2 = $cluster->safe_psql(2, $hash_query); 49 | 50 | note("$oldhash, $hash0, $hash1, $hash2"); 51 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2) and ($oldhash eq $hash0)) , 1, 52 | "Check that hash is the same after recovery"); 53 | 54 | ######################################################## 55 | # Check start after all nodes were disconnected 56 | ######################################################## 57 | 58 | $cluster->safe_psql(0, "create table if not exists t(k int primary key, v int);"); 59 | 60 | $cluster->safe_psql(0, "insert into t values(1, 10);"); 61 | $cluster->safe_psql(1, "insert into t values(2, 20);"); 62 | $cluster->safe_psql(2, "insert into t values(3, 30);"); 63 | 64 | my $sum0; my $sum1; my $sum2; 65 | 66 | $cluster->{nodes}->[1]->stop('fast'); 67 | 
$cluster->{nodes}->[2]->stop('fast'); 68 | 69 | $cluster->{nodes}->[1]->start; 70 | $cluster->{nodes}->[2]->start; 71 | 72 | $cluster->await_nodes( [1,2,0] ); 73 | 74 | $sum0 = $cluster->safe_psql(0, "select sum(v) from t;"); 75 | $sum1 = $cluster->safe_psql(1, "select sum(v) from t;"); 76 | $sum2 = $cluster->safe_psql(2, "select sum(v) from t;"); 77 | is( (($sum0 == 60) and ($sum1 == $sum0) and ($sum2 == $sum0)) , 1, 78 | "Check that nodes are working and sync"); 79 | 80 | ######################################################## 81 | # Check recovery during some load 82 | ######################################################## 83 | 84 | $cluster->pgbench(0, ('-i', -s => '10') ); 85 | $cluster->pgbench(0, ('-N', -T => '1') ); 86 | $cluster->pgbench(1, ('-N', -T => '1') ); 87 | $cluster->pgbench(2, ('-N', -T => '1') ); 88 | 89 | # kill node while neighbour is under load 90 | my $pgb_handle = $cluster->pgbench_async(1, ('-N', -T => '20', -c => '5') ); 91 | sleep(5); 92 | $cluster->{nodes}->[2]->stop('fast'); 93 | $cluster->pgbench_await($pgb_handle); 94 | 95 | # start node while neighbour is under load 96 | $pgb_handle = $cluster->pgbench_async(0, ('-N', -T => '20', -c => '5') ); 97 | sleep(5); 98 | $cluster->{nodes}->[2]->start; 99 | $cluster->pgbench_await($pgb_handle); 100 | 101 | # await recovery 102 | $cluster->await_nodes( [2,0,1] ); 103 | 104 | # check data identity 105 | $hash0 = $cluster->safe_psql(0, $hash_query); 106 | $hash1 = $cluster->safe_psql(1, $hash_query); 107 | $hash2 = $cluster->safe_psql(2, $hash_query); 108 | note("$hash0, $hash1, $hash2"); 109 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, "Check that hash is the same"); 110 | 111 | $sum0 = $cluster->safe_psql(0, "select sum(abalance) from pgbench_accounts;"); 112 | $sum1 = $cluster->safe_psql(1, "select sum(abalance) from pgbench_accounts;"); 113 | $sum2 = $cluster->safe_psql(2, "select sum(abalance) from pgbench_accounts;"); 114 | 115 | note("Sums: $sum0, $sum1, $sum2"); 116 | 
is($sum2, $sum0, "Check that sum_2 == sum_0"); 117 | is($sum2, $sum1, "Check that sum_2 == sum_1"); 118 | 119 | $sum0 = $cluster->safe_psql(0, "select count(*) from pg_prepared_xacts;"); 120 | $sum1 = $cluster->safe_psql(1, "select count(*) from pg_prepared_xacts;"); 121 | $sum2 = $cluster->safe_psql(2, "select count(*) from pg_prepared_xacts;"); 122 | 123 | note("Number of prepared tx: $sum0, $sum1, $sum2"); 124 | 125 | $cluster->stop; 126 | -------------------------------------------------------------------------------- /t/005_pgbench.pl: -------------------------------------------------------------------------------- 1 | # Kinda bank test: on each node multiple clients transfer money from one acc to 2 | # another, another bunch of clients make sure sum is constant always. 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use Cluster; 8 | use TestLib; 9 | use Test::More tests => 2; 10 | 11 | my $cluster = new Cluster(2); 12 | $cluster->init(); 13 | $cluster->start(); 14 | $cluster->create_mm(); 15 | 16 | $cluster->safe_psql(0, q{ 17 | create table t (k int primary key, v int); 18 | insert into t (select generate_series(0, 999), 0); 19 | create table reader_log (v int); 20 | }); 21 | 22 | my $clients = 5; 23 | my $seconds = 30; 24 | my @benches = (); 25 | foreach (0..$#{$cluster->{nodes}}) 26 | { 27 | push @benches, $cluster->pgbench_async($_, 28 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/reader.pgb')); 29 | push @benches, $cluster->pgbench_async($_, 30 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/writer.pgb', -R => 10)); 31 | } 32 | 33 | $cluster->pgbench_await($_) foreach @benches; 34 | 35 | my $out; 36 | 37 | $out = $cluster->safe_psql(0, 38 | "select count(*) from reader_log where v != 0"); 39 | is($out, 0, "there is nothing except zeros in reader_log"); 40 | 41 | $out = $cluster->safe_psql(0, 42 | "select count(*) from reader_log where v = 0"); 43 | isnt($out, 0, "reader_log is not empty"); 44 | 45 | $cluster->stop; 46 | 
-------------------------------------------------------------------------------- /t/006_pgbenchdl.pl: -------------------------------------------------------------------------------- 1 | # Like pgbench.pl, but the probability of deadlocks is much higher; check that 2 | # they get detected. 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use Cluster; 8 | use TestLib; 9 | use Test::More tests => 1; 10 | use Data::Dumper; 11 | 12 | use POSIX ":sys_wait_h"; 13 | 14 | my $cluster = new Cluster(3); 15 | $cluster->init(); 16 | $cluster->start(); 17 | $cluster->create_mm(); 18 | 19 | $cluster->safe_psql(0, q{ 20 | create table transactions (id SERIAL primary key, dt timestamp default now(), uid int, amount int); 21 | create index on transactions using btree(uid); 22 | create table users (uid int primary key, sum bigint); 23 | }); 24 | 25 | my $clients = 10; 26 | my $seconds = 90; 27 | my @benches = (); 28 | foreach (0..$#{$cluster->{nodes}}) 29 | { 30 | push @benches, $cluster->pgbench_async($_, 31 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/deadl.pgb')); 32 | } 33 | 34 | sub isalive { 35 | my $benches = $_[0]; 36 | my $any_alive = 0; 37 | waitpid(-1, WNOHANG); 38 | $any_alive = ($any_alive or (kill 0,$_->{'KIDS'}->[0]->{'PID'})) foreach @{$benches}; 39 | return $any_alive; 40 | } 41 | 42 | # ensure num of successfull xacts steadily goes up, i.e. deadlocks are detected 43 | # in time. 
44 | my $ptrans = 0; 45 | my $dead_count = 0; 46 | while (isalive(\@benches)) { 47 | my $trans = $cluster->safe_psql(0, 48 | "select count(*) from transactions"); 49 | if ($ptrans == 0) { 50 | $ptrans = $trans; 51 | } elsif ($ptrans == $trans) { 52 | $dead_count++; 53 | } else { 54 | $dead_count = 0; 55 | $ptrans = $trans; 56 | } 57 | if ($dead_count >=3) { 58 | last; 59 | } 60 | sleep 2; 61 | } 62 | 63 | ok($dead_count < 3, 'at least one xact was committed during 6 seconds'); 64 | $cluster->stop; 65 | -------------------------------------------------------------------------------- /t/007_add_stop_node.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Carp; 5 | use PostgresNode; 6 | use Cluster; 7 | use TestLib; 8 | use Test::More tests => 8; 9 | 10 | # Generally add node with concurrent load (and failures) is not supported 11 | # because of at least 12 | # 1) it is not clear why non-donor nodes should properly keep WAL for new node; 13 | # 2) if donor fails, it is not clear whether new node will obtain suitable 14 | # syncpoints to pull from non-donors; 15 | # 3) A problem with slot creation and receiver start deadlocking each other, 16 | # see PGPRO-3618. 17 | # 18 | # drop_node with concurrent load is not safe at least because once it is done we 19 | # can't determine origin node properly, so no its xacts would be replicated. 20 | # 21 | # An option is left for experiments/future work. 22 | my $concurrent_load = 0; 23 | 24 | my $cluster = new Cluster(3); 25 | $cluster->init(); 26 | $cluster->start(); 27 | 28 | # XXXX: delete all '-n' ? 
29 | 30 | ################################################################################ 31 | # manually setup nodes with sparse node_id's 32 | ################################################################################ 33 | 34 | foreach (0..$#{$cluster->{nodes}}) 35 | { 36 | my $node = $cluster->{nodes}->[$_]; 37 | $node->{dbname} = 'postgres'; 38 | } 39 | 40 | foreach (0..$#{$cluster->{nodes}}) 41 | { 42 | my $node = $cluster->{nodes}->[$_]; 43 | 44 | note($cluster->connstr($_)); 45 | 46 | $cluster->safe_psql($_, qq{ 47 | create extension multimaster; 48 | select mtm.state_create('{2, 4, 5}'); 49 | insert into mtm.cluster_nodes values 50 | (2, \$\$@{[ $cluster->connstr(0) ]}\$\$, '@{[ $_ == 0 ? 't' : 'f' ]}'), 51 | (4, \$\$@{[ $cluster->connstr(1) ]}\$\$, '@{[ $_ == 1 ? 't' : 'f' ]}'), 52 | (5, \$\$@{[ $cluster->connstr(2) ]}\$\$, '@{[ $_ == 2 ? 't' : 'f' ]}'); 53 | }); 54 | } 55 | 56 | $cluster->await_nodes( [0..$#{$cluster->{nodes}}] ); 57 | 58 | $cluster->pgbench(0, ('-i', '-n', -s => '10') ); 59 | $cluster->pgbench(0, ('-N', '-n', -t => '100') ); 60 | $cluster->pgbench(1, ('-N', '-n', -t => '100') ); # XXX: pgbench stucks here for quite a long time 61 | $cluster->pgbench(2, ('-N', '-n', -t => '100') ); 62 | 63 | ################################################################################ 64 | # auto recovery 65 | ################################################################################ 66 | 67 | $cluster->{nodes}->[2]->stop('fast'); 68 | $cluster->await_nodes_after_stop( [0,1] ); 69 | $cluster->pgbench(0, ('-N', '-n', -T => '1') ); 70 | $cluster->{nodes}->[2]->start; 71 | 72 | $cluster->await_nodes( [2,0,1] ); 73 | is($cluster->is_data_identic( (0,1,2) ), 1, "check auto recovery"); 74 | 75 | ################################################################################ 76 | # add basebackuped node 77 | ################################################################################ 78 | 79 | # add table with sequence to check sequences 
after n_nodes change 80 | $cluster->safe_psql(0, "create table test_seq(id serial primary key)"); 81 | $cluster->safe_psql(0, "insert into test_seq values(DEFAULT)"); 82 | $cluster->safe_psql(1, "insert into test_seq values(DEFAULT)"); 83 | $cluster->safe_psql(2, "insert into test_seq values(DEFAULT)"); 84 | 85 | my $pgb1; 86 | my $pgb2; 87 | if ($concurrent_load) 88 | { 89 | $pgb1= $cluster->pgbench_async(0, ('-N', '-n', -T => '3600', -c => '2') ); 90 | $pgb2= $cluster->pgbench_async(1, ('-N', '-n', -T => '3600', -c => '2') ); 91 | } 92 | 93 | my $new_node_off = $cluster->add_node(); 94 | $cluster->{nodes}->[$new_node_off]->{dbname} = 'postgres'; 95 | my $connstr = $cluster->connstr($new_node_off); 96 | my $new_node_id = $cluster->safe_psql(0, "SELECT mtm.add_node(\$\$$connstr\$\$)"); 97 | 98 | is($new_node_id, 1, "sparse id assignment"); 99 | is($new_node_off, 3, "sparse id assignment"); 100 | if ($concurrent_load) 101 | { 102 | $cluster->pgbench(0, ('-N', '-n', -t => '100') ); 103 | } 104 | # Ensure monitor creates slot for new node on donor. We don't use it for 105 | # basebackup anymore, but this is still a good idea (it would be even better to 106 | # wait for logical slot creation too). 107 | $cluster->poll_query_until(0, "select exists(select * from pg_replication_slots where slot_name = 'mtm_filter_slot_${new_node_id}');") 108 | or croak "timed out waiting for slot creation"; 109 | my $end_lsn = $cluster->backup_and_init(0, $new_node_off, $new_node_id); 110 | 111 | # Prevent recovery of new node further than the end point returned by 112 | # basebackup as streaming will be requested since it, so not doing this might 113 | # result in attempting to receive already existing data. This realistically 114 | # happens with syncpoint rows, leading to insertion conflict. 
115 | # 116 | # It would be much nicer to learn the correct (end of recovery) LSN at the new 117 | # node itself and not burden user with carrying it around, but there seems no 118 | # easy way to do that without core changes. 119 | $cluster->{nodes}->[$new_node_off]->append_conf( 120 | "postgresql.conf", qq( 121 | restore_command = 'false' 122 | recovery_target = 'immediate' 123 | recovery_target_action = 'promote' 124 | )); 125 | # create recovery.signal 126 | $cluster->{nodes}->[$new_node_off]->set_recovery_mode(); 127 | $cluster->{nodes}->[$new_node_off]->start; 128 | $cluster->await_nodes([3,0,1,2], 0); 129 | $cluster->safe_psql(0, "SELECT mtm.join_node('$new_node_id', '$end_lsn')"); 130 | note("join_node done"); 131 | 132 | if ($concurrent_load) 133 | { 134 | sleep(5); 135 | IPC::Run::kill_kill($pgb1); 136 | IPC::Run::kill_kill($pgb2); 137 | } 138 | 139 | $cluster->await_nodes( [3,0,1,2] ); 140 | $cluster->pgbench(0, ('-N', '-n', -t => '100') ); 141 | $cluster->pgbench(3, ('-N', '-n', -t => '100') ); 142 | 143 | is($cluster->is_data_identic( (0,1,2,3) ), 1, "add basebackuped node"); 144 | 145 | my $bb_keycount = $cluster->safe_psql(3, q{ 146 | select count(*) from mtm.config where key='basebackup' 147 | }); 148 | 149 | is($bb_keycount, 0, "basebackup key was deleted"); 150 | 151 | # check that sequences in proper state 152 | $cluster->safe_psql(0, "insert into test_seq values(DEFAULT)"); 153 | $cluster->safe_psql(1, "insert into test_seq values(DEFAULT)"); 154 | $cluster->safe_psql(2, "insert into test_seq values(DEFAULT)"); 155 | $cluster->safe_psql(3, "insert into test_seq values(DEFAULT)"); 156 | 157 | ################################################################################ 158 | # basic check of recovery after add node succeeded 159 | ################################################################################ 160 | 161 | $cluster->{nodes}->[0]->stop('fast'); 162 | $cluster->await_nodes_after_stop( [1,2,3] ); 163 | $cluster->pgbench(3, ('-N', 
'-n', -T => '1') ); 164 | $cluster->{nodes}->[0]->start; 165 | 166 | $cluster->await_nodes( [2,0,1] ); 167 | is($cluster->is_data_identic((0,1,2,3)), 1, "check recovery after add_node"); 168 | 169 | ################################################################################ 170 | # drop one of the initial nodes 171 | ################################################################################ 172 | 173 | $cluster->{nodes}->[0]->stop('fast'); 174 | $cluster->await_nodes_after_stop( [1,2,3] ); 175 | $cluster->safe_psql(1, "select mtm.drop_node(2)"); 176 | 177 | # check basic recovery after drop_node 178 | $cluster->{nodes}->[1]->stop('fast'); 179 | $cluster->await_nodes_after_stop( [2,3] ); 180 | $cluster->pgbench(3, ('-N', '-n', -T => '1') ); 181 | $cluster->pgbench(2, ('-N', '-n', -T => '1') ); 182 | $cluster->{nodes}->[1]->start; 183 | $cluster->await_nodes( [3,2,1] ); 184 | is($cluster->is_data_identic((1,2,3)), 1, "check recovery after drop_node"); 185 | 186 | 187 | # TODO: check that WALs are not kept for dropped node anymore 188 | 189 | ################################################################################ 190 | # XXX: check remove/add of same node 191 | ################################################################################ 192 | 193 | ################################################################################ 194 | # XXX: check self remove 195 | ################################################################################ 196 | -------------------------------------------------------------------------------- /t/008_bugfixes.pl: -------------------------------------------------------------------------------- 1 | use Carp; 2 | use POSIX; 3 | use strict; 4 | use Test::More; 5 | use TestLib; 6 | use Time::HiRes qw(usleep); 7 | use warnings; 8 | 9 | use PostgresNode; 10 | use Cluster; 11 | 12 | use Test::More tests => Cluster::is_ee() ? 
6 : 5; 13 | 14 | my $cluster = new Cluster(3); 15 | $cluster->init(); 16 | $cluster->start(); 17 | $cluster->create_mm(); 18 | 19 | my $hash0; my $hash1; my $hash2; my $hash_query; 20 | 21 | # run pathman test only on ee 22 | if (Cluster::is_ee()) 23 | { 24 | $cluster->safe_psql(0, q{ 25 | CREATE EXTENSION pg_pathman; 26 | CREATE SCHEMA test_update_node; 27 | SET pg_pathman.enable_partitionrouter = ON; 28 | 29 | CREATE TABLE test_update_node.test_range(val NUMERIC NOT NULL, comment TEXT); 30 | CREATE INDEX val_idx ON test_update_node.test_range (val); 31 | INSERT INTO test_update_node.test_range SELECT i, i FROM generate_series(1, 100) i; 32 | SELECT create_range_partitions('test_update_node.test_range', 'val', 1, 10); 33 | 34 | ALTER TABLE test_update_node.test_range DROP COLUMN comment CASCADE; 35 | 36 | UPDATE test_update_node.test_range SET val = 115 WHERE val = 55; 37 | }); 38 | 39 | $hash_query = q{ 40 | select 41 | md5('(' || string_agg(val::text, '),(') || ')') 42 | from 43 | (select * from test_update_node.test_range order by val) t; 44 | }; 45 | $hash0 = $cluster->safe_psql(0, $hash_query); 46 | $hash1 = $cluster->safe_psql(1, $hash_query); 47 | $hash2 = $cluster->safe_psql(2, $hash_query); 48 | note("$hash0, $hash1, $hash2"); 49 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, 50 | "Check that hash is the same after query"); 51 | } 52 | 53 | $cluster->safe_psql(0, q{ 54 | CREATE TABLE unique_tbl (i int UNIQUE DEFERRABLE, t text); 55 | INSERT INTO unique_tbl VALUES (0, 'one'); 56 | INSERT INTO unique_tbl VALUES (1, 'two'); 57 | INSERT INTO unique_tbl VALUES (2, 'tree'); 58 | INSERT INTO unique_tbl VALUES (3, 'four'); 59 | INSERT INTO unique_tbl VALUES (4, 'five'); 60 | }); 61 | $cluster->{nodes}->[1]->psql($cluster->{nodes}->[1]->{dbname}, q{ 62 | -- default is immediate so this should fail right away 63 | UPDATE unique_tbl SET i = 1 WHERE i = 0; 64 | }); 65 | $cluster->safe_psql(0, q{ 66 | UPDATE unique_tbl SET i = i+1; 67 | }); 68 | 69 | 
$hash_query = q{ 70 | select 71 | md5('(' || string_agg(i::text || ', ' || t::text , '),(') || ')') 72 | from 73 | (select * from unique_tbl order by i) t; 74 | }; 75 | $hash0 = $cluster->safe_psql(0, $hash_query); 76 | $hash1 = $cluster->safe_psql(1, $hash_query); 77 | $hash2 = $cluster->safe_psql(2, $hash_query); 78 | note("$hash0, $hash1, $hash2"); 79 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, 80 | "Check that hash is the same after query"); 81 | 82 | # ############################################################################## 83 | # 84 | # Check the PGPRO-3146 bug. Hard crash of backend causes restart of all postgres 85 | # processes. Multimaster node must be survived after the crash and included into 86 | # the multimaster after recovery. 87 | # 88 | # ############################################################################## 89 | 90 | # Set GUC restart_after_crash in 'on' value 91 | $cluster->stop(); 92 | foreach (0..$#{$cluster->{nodes}}) 93 | { 94 | $cluster->{nodes}->[$_]->append_conf('postgresql.conf', q{restart_after_crash = on}); 95 | } 96 | $cluster->start(); 97 | $cluster->await_nodes( [0,1,2] ); 98 | 99 | # Simulate payload 100 | $cluster->pgbench(0, ('-i', '-n', -s => '1') ); 101 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": starting async pgbench" ); 102 | my $pgb1 = $cluster->pgbench_async(0, ('-n', -T => '25', -j => '1', -c => '5') ); 103 | 104 | my $pid0; 105 | my $attempts = 0; 106 | 107 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": starting polling of backend pid" ); 108 | while (1) 109 | { 110 | $pid0 = $cluster->safe_psql(0, "SELECT pid FROM pg_stat_activity 111 | WHERE backend_type LIKE 'client backend' 112 | AND query LIKE 'UPDATE%' LIMIT 1;"); 113 | 114 | # bf says we might be really unlucky to find no backend doing update 115 | # It does not make much sense to try longer than pgbench run lasts, 116 | # since we need an active backend to kill. 
So let it be 25 seconds 117 | # both for pgbench_async() and this pg_stat_activity polling. 118 | if ( ($pid0 ne "") || $attempts >= 25*10 ) 119 | { 120 | last; 121 | } 122 | 123 | # Wait 0.1 second before retrying. 124 | usleep(100_000); 125 | $attempts++; 126 | } 127 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": finished polling of backend pid" ); 128 | is( ($pid0 ne ""), 1, 129 | "found an active backend doing UPDATE" ); 130 | 131 | # Simulate hard crash 132 | note("Simulate hard crash of a backend by SIGKILL to $pid0"); 133 | kill -9, $pid0; 134 | 135 | $cluster->pgbench_await($pgb1); 136 | $cluster->await_nodes( [0,1,2] ); 137 | is($cluster->is_data_identic( (0,1,2) ), 1, "check consistency after crash"); 138 | 139 | 140 | # ############################################################################## 141 | # 142 | # [PGPRO-3047] Test ALTER DOMAIN .. CONSTRAINT .. NOT VALID 143 | # 144 | # ############################################################################## 145 | 146 | $hash0 = $cluster->safe_psql(0, " 147 | CREATE DOMAIN things AS INT; 148 | CREATE TABLE thethings (stuff things); 149 | INSERT INTO thethings (stuff) VALUES (55); 150 | ALTER DOMAIN things ADD CONSTRAINT meow CHECK (VALUE < 11) NOT VALID; 151 | UPDATE thethings SET stuff = 10; 152 | ALTER DOMAIN things VALIDATE CONSTRAINT meow; 153 | "); 154 | my $result0 = $cluster->safe_psql(0, "SELECT * FROM thethings"); 155 | my $result1 = $cluster->safe_psql(1, "SELECT * FROM thethings"); 156 | my $result2 = $cluster->safe_psql(2, "SELECT * FROM thethings"); 157 | note("Value in the stuff column of thethings table is $result0 at the node1 and match to corresponding values from another nodes: 2 - $result1 and 3 - $result2 "); 158 | is( (($result0 eq 10) and ($result0 eq $result1) and ($result1 eq $result2)), 1, 159 | "Check that update not aborted by violation of constraint on old tuple value"); 160 | 161 | # ############################################################################## 
162 | # 163 | # [PGPRO-3047] Check for problems with different OIDs on multimaster nodes 164 | # during logical replication of tuples contained attribute with domain over 165 | # arrays of composite. 166 | # 167 | # ############################################################################## 168 | 169 | # Check that OIDs are different. 170 | $result0 = $cluster->safe_psql(0, 171 | "select oid from pg_class where relname like 'thethings';"); 172 | $result1 = $cluster->safe_psql(1, 173 | "select oid from pg_class where relname like 'thethings';"); 174 | $result2 = $cluster->safe_psql(2, 175 | "select oid from pg_class where relname like 'thethings';"); 176 | note("OIDS of the thethings relation: node1 - $result0, node2 - $result1, node3 - $result2"); 177 | is( ( ($result0 ne $result1) and ($result0 ne $result2) and ($result1 ne $result2) ), 1, 178 | "Check that oid of the thethings relation are different on each node"); 179 | 180 | # Do the test. Insertion of array type must be passed successfully. 
181 | # Source: regression test domain.sql 182 | $cluster->safe_psql(0, " 183 | CREATE TYPE comptype AS (r float8, i float8); 184 | CREATE domain dcomptypea AS comptype[]; 185 | CREATE table dcomptable (d1 dcomptypea UNIQUE); 186 | INSERT INTO dcomptable VALUES (array[row(1,2)]::dcomptypea); 187 | "); 188 | 189 | $cluster->stop(); 190 | 191 | done_testing(); 192 | -------------------------------------------------------------------------------- /t/009_identity_func.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use PostgresNode; 4 | use Cluster; 5 | use TestLib; 6 | use Test::More tests => 29; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(); 10 | $cluster->start(); 11 | $cluster->create_mm(undef); 12 | 13 | my $dbname = $cluster->{nodes}->[0]->{dbname}; 14 | my $nodes = $cluster->{nodes}; 15 | my $output; 16 | my $err_out; 17 | 18 | # ############################################################################## 19 | # 20 | # Incorrect query 21 | # 22 | # ############################################################################## 23 | my $invalid_expr_pattern = 24 | ".*failed to run query on node[0-9]+, snapshot .*: " 25 | . 
"ERROR: relation \"t1\" does not exist\n"; 26 | 27 | # test node 1 28 | $nodes->[0]->psql($dbname, 29 | "SELECT mtm.check_query('SELECT * FROM t1')", 30 | stdout => \$output, stderr => \$err_out); 31 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error"); 32 | like($err_out, qr{$invalid_expr_pattern}, "node1: check error output correctness"); 33 | 34 | # test node 2 35 | $nodes->[1]->psql($dbname, 36 | "SELECT mtm.check_query('SELECT * FROM t1')", 37 | stdout => \$output, stderr => \$err_out); 38 | is ( (($output eq '') and ($err_out ne '')), 1, "node2: check zero out on error"); 39 | like($err_out, qr{$invalid_expr_pattern}, "node2: check error output correctness"); 40 | 41 | # test node 3 42 | $nodes->[2]->psql($dbname, 43 | "SELECT mtm.check_query('SELECT * FROM t1')", 44 | stdout => \$output, stderr => \$err_out); 45 | is ( (($output eq '') and ($err_out ne '')), 1, "node3: check zero out on error"); 46 | like($err_out, qr{$invalid_expr_pattern}, "node3: check error output correctness"); 47 | 48 | # Substep: check no problems without one node 49 | $nodes->[2]->stop(); 50 | $cluster->await_nodes_after_stop( [0,1] ); 51 | $nodes->[0]->psql($dbname, 52 | "SELECT mtm.check_query('SELECT * FROM t1')", 53 | stdout => \$output, stderr => \$err_out); 54 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error"); 55 | like($err_out, qr{$invalid_expr_pattern}, "node1: check error output correctness"); 56 | 57 | $nodes->[1]->psql($dbname, 58 | "SELECT mtm.check_query('SELECT * FROM t1')", 59 | stdout => \$output, stderr => \$err_out); 60 | is ( (($output eq '') and ($err_out ne '')), 1, "node2: check zero out on error"); 61 | like($err_out, qr{$invalid_expr_pattern}, "node2: check error output correctness"); 62 | 63 | # Substep: node1 will be isolated 64 | my $isolation_pattern = ".*node is not online\: current status .*"; 65 | $nodes->[1]->stop(); 66 | $nodes->[0]->psql($dbname, 67 | "SELECT mtm.check_query('SELECT * 
FROM t1')", 68 | stdout => \$output, stderr => \$err_out); 69 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error"); 70 | like($err_out, qr{$isolation_pattern}, "Check access to isolated node"); 71 | 72 | $nodes->[1]->start(); 73 | $nodes->[2]->start(); 74 | $cluster->await_nodes( [2,0,1] ); 75 | 76 | # ############################################################################## 77 | # 78 | # Interface functions protection. 79 | # 80 | # ############################################################################## 81 | my $protection_pattern = "this function should only be called by mtm.check_query()"; 82 | $nodes->[0]->psql($dbname, 83 | "SELECT mtm.hold_backends();", 84 | stdout => \$output, stderr => \$err_out); 85 | is ( (($output eq '') and ($err_out ne '')), 1, "hold_all() protection"); 86 | like($err_out, qr{$protection_pattern}, "Check error output"); 87 | 88 | $nodes->[0]->psql($dbname, 89 | "SELECT mtm.release_backends();", 90 | stdout => \$output, stderr => \$err_out); 91 | is ( (($output eq '') and ($err_out ne '')), 1, "release_all() protection"); 92 | like($err_out, qr{$protection_pattern}, "Check error output"); 93 | 94 | $cluster->safe_psql(0, "CREATE TABLE t1 (a int PRIMARY KEY, b text);"); 95 | $nodes->[0]->psql($dbname, 96 | "SELECT mtm.check_query('SELECT * FROM t1')", 97 | stdout => \$output); 98 | is( (($output eq 't')) , 1, "Check tables equivalence with no tuples"); 99 | 100 | # Check consistency in the case of two nodes 101 | $nodes->[1]->stop(); 102 | $cluster->await_nodes_after_stop( [0,2] ); 103 | $nodes->[0]->psql($dbname, 104 | "SELECT mtm.check_query('SELECT * FROM t1')", 105 | stdout => \$output); 106 | is( (($output eq 't')) , 1, "Check tables equivalence with one off node"); 107 | 108 | $cluster->safe_psql(0, "INSERT INTO t1 (a, b) VALUES (1, NULL);"); 109 | $nodes->[0]->psql($dbname, 110 | "SELECT mtm.check_query('SELECT * FROM t1')", 111 | stdout => \$output); 112 | 113 | is( (($output eq 't')) , 
1, "Check primitive table"); 114 | $nodes->[1]->start(); 115 | $cluster->await_nodes( [2,0,1] ); 116 | 117 | $cluster->safe_psql(0, 118 | "INSERT INTO t1 (a,b) (SELECT *, 'test' FROM generate_series(2,100) AS x1); 119 | "); 120 | $nodes->[0]->psql($dbname, 121 | "SELECT mtm.check_query('SELECT * FROM t1 ORDER BY a')", 122 | stdout => \$output); 123 | is( (($output eq 't')) , 1, "Check big table"); 124 | $nodes->[0]->psql($dbname, 125 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,'''')) 126 | FROM (SELECT * FROM t1 ORDER BY a) AS x1');", 127 | stdout => \$output); 128 | is( (($output eq 't')) , 1, "Another approach to check big table"); 129 | 130 | $nodes->[0]->psql($dbname, 131 | "SELECT mtm.check_query('SELECT mtm.status();');", 132 | stdout => \$output); 133 | note("Check result: $output"); 134 | is( (($output eq 'f')) , 1, "Unsuccessful check"); 135 | 136 | $nodes->[2]->stop(); 137 | $cluster->await_nodes_after_stop( [0,1] ); 138 | $nodes->[0]->psql($dbname, 139 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,'''')) 140 | FROM (SELECT * FROM t1 ORDER BY a) AS x1');", 141 | stdout => \$output); 142 | is( (($output eq 't')) , 1, "Check tables identity after one node was down"); 143 | 144 | $nodes->[2]->start(); 145 | $cluster->await_nodes( [2,0,1] ); 146 | $nodes->[0]->psql($dbname, 147 | "SELECT mtm.check_query('SELECT my_node_id FROM mtm.status();');", 148 | stdout => \$output); 149 | is( (($output eq 'f')) , 1, "Check warning message on mismatch"); 150 | 151 | $nodes->[2]->psql($dbname, 152 | "SELECT mtm.check_query('SELECT a,b FROM t1, mtm.status() AS ms WHERE a > ms.my_node_id');", 153 | stdout => \$output, stderr => \$err_out); 154 | note("Check result: $output"); 155 | is( (($output eq 'f')) , 1, "Check warning message on difference in rows number"); 156 | like($err_out, 157 | qr{.*query results mismatch\: 99 rows and 2 columns on node1\, 98 rows and 2 columns on node2}, 158 | "Check format of the error message"); 159 | 160 | 
$nodes->[2]->psql($dbname, 161 | "SELECT mtm.check_query('SELECT b FROM t1 WHERE a = 1');", 162 | stdout => \$output); 163 | note("Check result: $output"); 164 | is( (($output eq 't')) , 1, "Check equivalence of nulls"); 165 | 166 | $nodes->[0]->psql($dbname, 167 | "SELECT mtm.check_query('SELECT b FROM t1, mtm.status() AS ms WHERE a = ms.my_node_id');", 168 | stdout => \$output, stderr => \$err_out); 169 | note("Check result: $output"); 170 | is( (($output eq 'f')) , 1, "Check warning message on difference in null and not null values"); 171 | like($err_out, 172 | qr{.*mismatch in column \'b\' of row 0\: null on node1\, test on node2}, 173 | "Check format of the error message"); 174 | 175 | exit(0); 176 | 177 | # Full pgbench test 178 | $cluster->pgbench(0, ('-i', -s => '10') ); 179 | my $pgb0 = $cluster->pgbench_async(0, ('-N', -T => '30', -c => '5') ); 180 | my $pgb1 = $cluster->pgbench_async(1, ('-N', -T => '30', -c => '5') ); 181 | my $pgb2 = $cluster->pgbench_async(2, ('-N', -T => '30', -c => '5') ); 182 | 183 | $output='t'; 184 | for (my $i = 0; ($i < 3) and ($output eq 't'); $i++) 185 | { 186 | $nodes->[0]->psql($dbname, 187 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,'''')) 188 | FROM (SELECT * FROM pgbench_accounts ORDER BY aid) AS x1');", 189 | stdout => \$output); 190 | note("check iteration $i, result: $output"); 191 | is( (($output eq 't')) , 1, "Data on nodes are identic"); 192 | sleep(6); 193 | } 194 | 195 | $cluster->pgbench_await($pgb0); 196 | $cluster->pgbench_await($pgb1); 197 | $cluster->pgbench_await($pgb2); 198 | 199 | $cluster->stop(); 200 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .blockade 2 | .vagrant 3 | *.swp 4 | *.pyc 5 | -------------------------------------------------------------------------------- /tests/Pipfile: 
-------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | aiopg = "==1.0.0" 10 | aioprocessing = "==1.0.1" 11 | docker-compose = "==1.26.2" 12 | docker = "*" 13 | 14 | [requires] 15 | python_version = "3.7" 16 | -------------------------------------------------------------------------------- /tests/deadl.pgb: -------------------------------------------------------------------------------- 1 | \set fromuser random(1,64) 2 | \set touser random(1,64) 3 | \set amount random(1,10000) 4 | BEGIN; 5 | INSERT INTO transactions (uid,amount) VALUES (:fromuser, -:amount); 6 | INSERT INTO transactions (uid,amount) VALUES (:touser, :amount); 7 | INSERT INTO users (uid,sum) VALUES (:fromuser, -:amount) ON CONFLICT(uid) DO UPDATE SET sum=users.sum-:amount WHERE users.uid=:fromuser; 8 | INSERT INTO users (uid,sum) VALUES (:touser, :amount) ON CONFLICT(uid) DO UPDATE SET sum=users.sum+:amount WHERE users.uid=:touser; 9 | END; 10 | 11 | -------------------------------------------------------------------------------- /tests/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.2' 2 | 3 | services: 4 | 5 | node1: 6 | container_name: node1 7 | build: .. 8 | shm_size: '1024mb' 9 | privileged: true 10 | ulimits: 11 | core: 14294967296 12 | environment: 13 | POSTGRES_USER: 'pg' 14 | POSTGRES_DB: 'regression' 15 | NODE_ID: 1 16 | CONNSTRS: >- 17 | dbname=regression user=pg host=node1, 18 | dbname=regression user=pg host=node2, 19 | dbname=regression user=pg host=node3 20 | ports: 21 | - "15432:5432" 22 | networks: 23 | mtm_bridge: 24 | ipv4_address: 192.168.253.1 25 | 26 | node2: 27 | container_name: node2 28 | build: .. 
29 | shm_size: '1024mb' 30 | privileged: true 31 | ulimits: 32 | core: 14294967296 33 | environment: 34 | POSTGRES_USER: 'pg' 35 | POSTGRES_DB: 'regression' 36 | NODE_ID: 2 37 | CONNSTRS: >- 38 | dbname=regression user=pg host=node1, 39 | dbname=regression user=pg host=node2, 40 | dbname=regression user=pg host=node3 41 | ports: 42 | - "15433:5432" 43 | networks: 44 | mtm_bridge: 45 | ipv4_address: 192.168.253.2 46 | 47 | node3: 48 | container_name: node3 49 | build: .. 50 | shm_size: '1024mb' 51 | privileged: true 52 | ulimits: 53 | core: 14294967296 54 | environment: 55 | POSTGRES_USER: 'pg' 56 | POSTGRES_DB: 'regression' 57 | NODE_ID: 3 58 | CONNSTRS: >- 59 | dbname=regression user=pg host=node1, 60 | dbname=regression user=pg host=node2, 61 | dbname=regression user=pg host=node3 62 | ports: 63 | - "15434:5432" 64 | networks: 65 | mtm_bridge: 66 | ipv4_address: 192.168.253.3 67 | 68 | # toxi: 69 | # image: kelvich/toxiproxy 70 | # ports: 71 | # - "8474:8474" 72 | 73 | # toxi_seed: 74 | # image: kelvich/toxiproxy 75 | # depends_on: 76 | # - toxi 77 | # entrypoint: | 78 | # curl 79 | # -X POST 'http://toxi:8474/populate' 80 | # -H 'Content-Type: application/json; charset=utf-8' 81 | # -d 82 | # '[ 83 | # {"name": "rep12", "listen": "0.0.0.0:12000", "upstream": "node2:5432"}, 84 | # {"name": "arb12", "listen": "0.0.0.0:12001", "upstream": "node2:5433"}, 85 | # {"name": "rep13", "listen": "0.0.0.0:13000", "upstream": "node3:5432"}, 86 | # {"name": "arb13", "listen": "0.0.0.0:13001", "upstream": "node3:5433"}, 87 | 88 | # {"name": "rep21", "listen": "0.0.0.0:21000", "upstream": "node1:5432"}, 89 | # {"name": "arb21", "listen": "0.0.0.0:21001", "upstream": "node1:5433"}, 90 | # {"name": "rep23", "listen": "0.0.0.0:23000", "upstream": "node3:5432"}, 91 | # {"name": "arb23", "listen": "0.0.0.0:23001", "upstream": "node3:5433"}, 92 | 93 | # {"name": "rep31", "listen": "0.0.0.0:31000", "upstream": "node1:5432"}, 94 | # {"name": "arb31", "listen": "0.0.0.0:31001", 
"upstream": "node1:5433"}, 95 | # {"name": "rep32", "listen": "0.0.0.0:32000", "upstream": "node2:5432"}, 96 | # {"name": "arb32", "listen": "0.0.0.0:32001", "upstream": "node2:5433"} 97 | # ]' 98 | 99 | 100 | networks: 101 | mtm_bridge: 102 | driver: bridge 103 | ipam: 104 | config: 105 | - subnet: 192.168.253.0/24 106 | gateway: 192.168.253.254 107 | -------------------------------------------------------------------------------- /tests/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$1" = 'postgres' ]; then 4 | mkdir -p "$PGDATA" 5 | mkdir -p /pg/archive/ 6 | mkdir -p /pg/src/src/test/regress/testtablespace 7 | 8 | # look specifically for PG_VERSION, as it is expected in the DB dir 9 | if [ ! -s "$PGDATA/PG_VERSION" ]; then 10 | initdb --nosync 11 | 12 | { echo; echo "host all all 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf" 13 | { echo; echo "host replication all 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf" 14 | 15 | cat <<-EOF >> $PGDATA/postgresql.conf 16 | listen_addresses='*' 17 | log_line_prefix = '%m [%p] [[%a]]: ' 18 | archive_mode = on 19 | archive_command = 'cp %p /pg/archive/%f' 20 | 21 | fsync = on 22 | 23 | max_prepared_transactions = 100 24 | wal_level = logical 25 | max_worker_processes = 100 26 | max_replication_slots = 10 27 | max_wal_senders = 10 28 | log_statement = all 29 | log_connections = true 30 | log_lock_waits = true 31 | 32 | shared_preload_libraries = 'multimaster' 33 | multimaster.volkswagen_mode = off 34 | multimaster.max_workers = 30 35 | 36 | multimaster.connect_timeout = 10 37 | # Be careful; tests expect commits on live 38 | # nodes during others failures, and failure time is ~10s; 39 | # if we simulate network loss, failure won't be 40 | # detected until this timeout passes. 41 | # OTOH, setting it too low might lead to node 42 | # exclusions on weak machines during normal work. 
43 | # It was also noticed that if extensive logging is enabled 44 | # (older, at least pre #6392) journald might not be able 45 | # to swallow logs in time which also provoked exclusions 46 | # with 2s timeout 47 | multimaster.heartbeat_recv_timeout = 2000 48 | multimaster.heartbeat_send_timeout = 200 49 | # Heavily loaded receiver won't send progress until 50 | # walsender requires it which happens at 51 | # wal_sender_timeout / 2, so keep it relatively low 52 | # for syncpoint test. 53 | wal_sender_timeout = 60s 54 | wal_receiver_status_interval = 10s 55 | 56 | # extensive logging for tests 57 | multimaster.TxTrace_log_level = LOG 58 | multimaster.TxFinish_log_level = LOG 59 | 60 | multimaster.CoordinatorTrace_log_level = LOG 61 | 62 | multimaster.BgwPoolEventDebug_log_level = LOG 63 | 64 | multimaster.ReceiverStateDebug_log_level = LOG 65 | multimaster.ApplyMessage_log_level = LOG 66 | multimaster.ApplyTrace_log_level = LOG 67 | multimaster.ReceiverFeedback_log_level = LOG 68 | 69 | multimaster.StateDebug_log_level = LOG 70 | 71 | EOF 72 | 73 | if [ -n "$REFEREE_CONNSTR" ]; then 74 | echo "multimaster.referee_connstring = '$REFEREE_CONNSTR'" >> $PGDATA/postgresql.conf 75 | fi 76 | 77 | # internal start of server in order to allow set-up using psql-client 78 | # does not listen on TCP/IP and waits until start finishes 79 | pg_ctl -D "$PGDATA" \ 80 | -o "-c listen_addresses=''" \ 81 | -w start 82 | 83 | : ${POSTGRES_USER:=postgres} 84 | : ${POSTGRES_DB:=$POSTGRES_USER} 85 | export POSTGRES_USER POSTGRES_DB 86 | 87 | if [ "$POSTGRES_DB" != 'postgres' ]; then 88 | psql -U `whoami` postgres <<-EOSQL 89 | CREATE DATABASE "$POSTGRES_DB" ; 90 | EOSQL 91 | echo 92 | fi 93 | 94 | if [ "$POSTGRES_USER" = `whoami` ]; then 95 | op='ALTER' 96 | else 97 | op='CREATE' 98 | fi 99 | 100 | psql -U `whoami` postgres <<-EOSQL 101 | $op USER "$POSTGRES_USER" WITH SUPERUSER PASSWORD ''; 102 | EOSQL 103 | echo 104 | 105 | # psql -U `whoami` $POSTGRES_DB -c 'CREATE EXTENSION 
multimaster;'; 106 | # psql -U `whoami` $POSTGRES_DB -c "select mtm.init_node($NODE_ID, '{$CONNSTRS}');" 107 | 108 | pg_ctl -D "$PGDATA" -m fast -w stop 109 | fi 110 | fi 111 | 112 | "$@" 113 | -------------------------------------------------------------------------------- /tests/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postgrespro/mmts/baab9238f784d428481ecfa1294e3f9a3910b2d2/tests/lib/__init__.py -------------------------------------------------------------------------------- /tests/lib/failure_injector.py: -------------------------------------------------------------------------------- 1 | import docker 2 | import os 3 | 4 | class FailureInjector(object): 5 | 6 | def __init__(self, node=None): 7 | timeout = os.environ.get('DOCKER_CLIENT_TIMEOUT') 8 | if timeout is not None: 9 | timeout = int(timeout) 10 | self.docker_api = docker.from_env(timeout=timeout) 11 | 12 | def container_exec(self, node, command): 13 | docker_node = self.docker_api.containers.get(node) 14 | docker_node.exec_run(command, user='root') 15 | 16 | 17 | class NoFailure(FailureInjector): 18 | 19 | def start(self): 20 | return 21 | 22 | def stop(self): 23 | return 24 | 25 | 26 | class SingleNodePartition(FailureInjector): 27 | 28 | def __init__(self, node): 29 | self.node = node 30 | super().__init__() 31 | 32 | def start(self): 33 | self.container_exec(self.node, "iptables -A INPUT -j DROP") 34 | self.container_exec(self.node, "iptables -A OUTPUT -j DROP") 35 | 36 | def stop(self): 37 | self.container_exec(self.node, "iptables -D INPUT -j DROP") 38 | self.container_exec(self.node, "iptables -D OUTPUT -j DROP") 39 | 40 | class SingleNodePartitionReject(FailureInjector): 41 | 42 | def __init__(self, node): 43 | self.node = node 44 | super().__init__() 45 | 46 | def start(self): 47 | self.container_exec(self.node, "iptables -A INPUT -j REJECT") 48 | self.container_exec(self.node, "iptables -A OUTPUT -j 
REJECT") 49 | 50 | def stop(self): 51 | self.container_exec(self.node, "iptables -D INPUT -j REJECT") 52 | self.container_exec(self.node, "iptables -D OUTPUT -j REJECT") 53 | 54 | 55 | class EdgePartition(FailureInjector): 56 | 57 | def __init__(self, nodeA, nodeB): 58 | self.nodeA = nodeA 59 | self.nodeB = nodeB 60 | super().__init__() 61 | 62 | def __change(self, action): 63 | self.container_exec(self.nodeA, 64 | "iptables {} INPUT -s {} -j DROP".format( 65 | action, self.nodeB)) 66 | self.container_exec(self.nodeA, 67 | "iptables {} OUTPUT -d {} -j DROP".format( 68 | action, self.nodeB)) 69 | 70 | def start(self): 71 | self.__change('-A') 72 | 73 | def stop(self): 74 | self.__change('-D') 75 | 76 | 77 | class RestartNode(FailureInjector): 78 | 79 | def __init__(self, node): 80 | self.node = node 81 | super().__init__() 82 | 83 | # XXX: Is it really a good idea to call cli.stop inside method called start? 84 | def start(self): 85 | self.docker_api.containers.get(self.node).stop() 86 | 87 | def stop(self): 88 | self.docker_api.containers.get(self.node).start() 89 | 90 | 91 | class FreezeNode(FailureInjector): 92 | 93 | def __init__(self, node): 94 | self.node = node 95 | super().__init__() 96 | 97 | def start(self): 98 | self.docker_api.containers.get(self.node).pause() 99 | 100 | def stop(self): 101 | self.docker_api.containers.get(self.node).unpause() 102 | 103 | 104 | class CrashRecoverNode(FailureInjector): 105 | 106 | def __init__(self, node): 107 | self.node = node 108 | super().__init__() 109 | 110 | def start(self): 111 | self.docker_api.containers.get(self.node).kill() 112 | 113 | def stop(self): 114 | self.docker_api.containers.get(self.node).start() 115 | 116 | 117 | class SkewTime(FailureInjector): 118 | 119 | def __init__(self, node): 120 | self.node = node 121 | super().__init__() 122 | 123 | class StopNode(FailureInjector): 124 | 125 | def __init__(self, node): 126 | self.node = node 127 | super().__init__() 128 | 129 | # XXX: Is it really a good 
idea to call cli.stop inside method called start? 130 | def start(self): 131 | self.docker_api.containers.get(self.node).stop() 132 | 133 | def stop(self): 134 | return 135 | 136 | 137 | class StartNode(FailureInjector): 138 | 139 | def __init__(self, node): 140 | self.node = node 141 | super().__init__() 142 | 143 | # XXX: Is it really a good idea to call cli.stop inside method 144 | # called start? 145 | def start(self): 146 | return 147 | 148 | def stop(self): 149 | self.docker_api.containers.get(self.node).start() 150 | 151 | ONE_NODE_FAILURES = [SingleNodePartition, SingleNodePartitionReject, 152 | RestartNode, CrashRecoverNode, FreezeNode] 153 | TWO_NODE_FAILURES = [EdgePartition] 154 | -------------------------------------------------------------------------------- /tests/lib/log_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import sys 4 | import time 5 | 6 | # FWIW I've attempted to keep the cfg in json/yaml file but sank in 'could not 7 | # resolve UTCFormatter class' issue 8 | 9 | # timestamp in UTC+-00:00 aka GMT 10 | class UTCFormatter(logging.Formatter): 11 | converter = time.gmtime 12 | 13 | LOGGING = { 14 | "version": 1, 15 | "formatters": { 16 | "defaultFormatter": { 17 | "()": UTCFormatter, 18 | "format": "%(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s", 19 | "datefmt": "%Y-%m-%d %H:%M:%S" 20 | } 21 | }, 22 | "handlers": { 23 | "console": { 24 | "class": "logging.StreamHandler", 25 | "formatter": "defaultFormatter", 26 | "level": "DEBUG", 27 | "stream": "ext://sys.stderr" 28 | } 29 | }, 30 | "loggers": { 31 | "root": { 32 | "level": "DEBUG", 33 | "handlers": ["console"] 34 | }, 35 | "root.test_helper": { 36 | "level": "INFO" 37 | }, 38 | "root.bank_client": { 39 | "level": "INFO" 40 | } 41 | } 42 | } 43 | 44 | logging.config.dictConfig(LOGGING) 45 | 
-------------------------------------------------------------------------------- /tests/reader.pgb: -------------------------------------------------------------------------------- 1 | begin; 2 | insert into reader_log select sum(v) from t; 3 | commit; -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | aiopg==1.0.0 2 | aioprocessing==1.0.1 3 | attrs==20.1.0 4 | bcrypt==3.2.0 5 | cached-property==1.5.1 6 | certifi==2020.6.20 7 | cffi==1.14.2 8 | chardet==3.0.4 9 | cryptography==3.1 10 | distro==1.5.0 11 | docker==4.3.1 12 | docker-compose==1.26.2 13 | dockerpty==0.4.1 14 | docopt==0.6.2 15 | idna==2.10 16 | importlib-metadata==1.7.0 17 | jsonschema==3.2.0 18 | paramiko==2.7.1 19 | psycopg2-binary==2.8.5 20 | pycparser==2.20 21 | PyNaCl==1.4.0 22 | pyrsistent==0.16.0 23 | python-dotenv==0.14.0 24 | PyYAML==5.3.1 25 | requests==2.24.0 26 | six==1.15.0 27 | texttable==1.6.2 28 | urllib3==1.25.10 29 | websocket-client==0.57.0 30 | zipp==3.1.0 31 | -------------------------------------------------------------------------------- /tests/support/bumptime.c: -------------------------------------------------------------------------------- 1 | /* 2 | * His (Aphyr) Majesty Script Bumptime. 
3 | * 4 | * https://raw.githubusercontent.com/jepsen-io/jepsen/master/cockroachdb/resources/bumptime.c 5 | * 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | int 14 | main(int argc, char **argv) 15 | { 16 | if (argc < 2) 17 | { 18 | fprintf(stderr, "usage: %s , where delta is in ms\n", argv[0]); 19 | return 1; 20 | } 21 | 22 | /* Compute offset from argument */ 23 | int64_t delta = atof(argv[1]) * 1000; 24 | int64_t delta_us = delta % 1000000; 25 | int64_t delta_s = (delta - delta_us) / 1000000; 26 | 27 | /* Get current time */ 28 | struct timeval time; 29 | struct timezone tz; 30 | 31 | if (0 != gettimeofday(&time, &tz)) 32 | { 33 | perror("gettimeofday"); 34 | return 1; 35 | } 36 | 37 | /* Update time */ 38 | time.tv_usec += delta_us; 39 | time.tv_sec += delta_s; 40 | /* Overflow */ 41 | while (time.tv_usec <= 1000000) 42 | { 43 | time.tv_sec -= 1; 44 | time.tv_usec += 1000000; 45 | } 46 | while (1000000 <= time.tv_usec) 47 | { 48 | time.tv_sec += 1; 49 | time.tv_usec -= 1000000; 50 | } 51 | 52 | /* Set time */ 53 | if (0 != settimeofday(&time, &tz)) 54 | { 55 | perror("settimeofday"); 56 | return 2; 57 | } 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /tests/support/docker-regress.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd /pg/src/src/test/regress 4 | 5 | psql -U postgres regression <<-SQL 6 | ALTER DATABASE "postgres" SET lc_messages TO 'C'; 7 | ALTER DATABASE "postgres" SET lc_monetary TO 'C'; 8 | ALTER DATABASE "postgres" SET lc_numeric TO 'C'; 9 | ALTER DATABASE "postgres" SET lc_time TO 'C'; 10 | ALTER DATABASE "postgres" SET timezone_abbreviations TO 'Default'; 11 | SQL 12 | 13 | ./pg_regress --use-existing \ 14 | --schedule=serial_schedule \ 15 | --host=node1 \ 16 | --user=postgres 17 | 18 | STATUS=$? 
19 | 20 | if [ -f "regression.diffs" ] 21 | then 22 | cat regression.diffs 23 | fi 24 | 25 | exit $STATUS 26 | -------------------------------------------------------------------------------- /tests/support/two_nodes.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | 5 | node1: 6 | container_name: node1 7 | build: ../.. 8 | shm_size: '512mb' 9 | privileged: true 10 | ulimits: 11 | core: 14294967296 12 | environment: 13 | POSTGRES_USER: 'pg' 14 | POSTGRES_DB: 'regression' 15 | NODE_ID: 1 16 | CONNSTRS: >- 17 | dbname=regression user=pg host=node1, 18 | dbname=regression user=pg host=node2 19 | REFEREE_CONNSTR: 'dbname=regression user=pg host=referee' 20 | ports: 21 | - "15432:5432" 22 | networks: 23 | mtm_bridge: 24 | ipv4_address: 192.168.253.1 25 | 26 | node2: 27 | container_name: node2 28 | build: ../.. 29 | shm_size: '512mb' 30 | privileged: true 31 | ulimits: 32 | core: 14294967296 33 | environment: 34 | POSTGRES_USER: 'pg' 35 | POSTGRES_DB: 'regression' 36 | NODE_ID: 2 37 | CONNSTRS: >- 38 | dbname=regression user=pg host=node1, 39 | dbname=regression user=pg host=node2 40 | REFEREE_CONNSTR: 'dbname=regression user=pg host=referee' 41 | ports: 42 | - "15433:5432" 43 | networks: 44 | mtm_bridge: 45 | ipv4_address: 192.168.253.2 46 | 47 | referee: 48 | container_name: referee 49 | build: ../.. 
50 | shm_size: '512mb' 51 | privileged: true 52 | ulimits: 53 | core: 14294967296 54 | environment: 55 | POSTGRES_USER: 'pg' 56 | POSTGRES_DB: 'regression' 57 | NODE_ID: 1 58 | ports: 59 | - "15435:5432" 60 | networks: 61 | mtm_bridge: 62 | ipv4_address: 192.168.253.3 63 | 64 | networks: 65 | mtm_bridge: 66 | driver: bridge 67 | ipam: 68 | config: 69 | - subnet: 192.168.253.0/24 70 | gateway: 192.168.253.254 71 | -------------------------------------------------------------------------------- /tests/test_bkb.sage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sage 2 | import sys, os 3 | 4 | from sage.all import * 5 | from subprocess import Popen, PIPE, STDOUT 6 | import unittest 7 | 8 | def run_stdin(input): 9 | mydir = os.path.dirname(os.path.realpath(__file__)) 10 | binfile = mydir + "/../src/a.out" 11 | 12 | p = Popen(binfile, stdout=PIPE, stdin=PIPE, stderr=STDOUT) 13 | grep_stdout = p.communicate(input=input)[0] 14 | return grep_stdout.decode() 15 | 16 | def run_bkb(g): 17 | n = len(g) 18 | params = str(n) + "\n" 19 | for i in range(n): 20 | row = 0 21 | row |= 1 << i 22 | for j in range(n): 23 | if g.has_edge(i, j): 24 | row |= 1 << j 25 | params += str(row) + "\n" 26 | 27 | print(params) 28 | res = run_stdin(params).strip() 29 | res = [int(n) for n in res.split(' ')] 30 | return res 31 | 32 | 33 | class TestCliqueBKB(unittest.TestCase): 34 | 35 | def test_random_graphs(self): 36 | 37 | for _ in range(1000): 38 | while True: 39 | g = graphs.RandomGNM(60,1700) 40 | if g.is_connected(): 41 | break 42 | 43 | clique, clique_size = run_bkb(g) 44 | 45 | print(clique, clique_size, len(g.clique_maximum())) 46 | 47 | 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /tests/test_recovery_random.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # 
Based on Aphyr's test for CockroachDB. 5 | # 6 | # Randomized recovery test for multimaster. Currently it picks a random node, 7 | # crash-recovers it or drops/rejects packets to and from it under load and 8 | # checks that things are ok, i.e. the rest two continue working and after 9 | # eliminating the failure the victim successfully recovers, with no hanged 10 | # prepares and data being identic everywhere. Lather, rinse, repeat. 11 | 12 | import datetime 13 | import docker 14 | import os 15 | import random 16 | import socket 17 | import subprocess 18 | import time 19 | import unittest 20 | import warnings 21 | import logging 22 | 23 | from lib.bank_client import MtmClient 24 | from lib.failure_injector import * 25 | import lib.log_helper # configures loggers 26 | from lib.test_helper import * 27 | 28 | log = logging.getLogger('root') 29 | 30 | class RecoveryTest(MMTestCase, TestHelper): 31 | def test_normal_operations(self): 32 | log.info('### test_normal_operations ###') 33 | 34 | aggs_failure, aggs = self.performFailure(NoFailure()) 35 | 36 | self.assertCommits(aggs_failure) 37 | self.assertIsolation(aggs_failure) 38 | 39 | self.assertCommits(aggs) 40 | self.assertIsolation(aggs) 41 | 42 | # main random tests 43 | def test_random_disasters(self): 44 | log.info('### test_random_disasters ###') 45 | 46 | for i in range(1, 16): 47 | log.info(f'running round #{i} of test_random_disasters') 48 | node_number = random.choice(range(1, 4)) 49 | port = 15431 + node_number 50 | 51 | nodes_assert_commit_during_failure = [n for n in range(3) if n != 52 | node_number - 1] 53 | aggs_failure, aggs = self.performRandomFailure( 54 | f'node{node_number}', 55 | nodes_wait_for_commit=[n for n in range(3)], 56 | nodes_wait_for_online=[f"dbname=regression user=postgres host={self.host_ip} port={port}"], 57 | stop_load=True, 58 | nodes_assert_commit_during_failure= 59 | nodes_assert_commit_during_failure) 60 | 61 | for n in range(3): 62 | if n == node_number - 1: 63 | 
self.assertNoCommits([aggs_failure[n]]) 64 | else: 65 | self.assertCommits([aggs_failure[n]]) 66 | 67 | self.assertIsolation(aggs_failure) 68 | self.assertCommits(aggs) 69 | self.assertIsolation(aggs) 70 | self.assertDataSync() 71 | 72 | log.info(f'iteration #{i} is OK') 73 | 74 | # sausage topology test 75 | def test_edge_partition(self): 76 | log.info('### test_edge_partition ###') 77 | 78 | aggs_failure, aggs = self.performFailure( 79 | EdgePartition('node1', 'node3'), 80 | # clique selection picks up the min mask, so in 1-2-3 sausage 12 81 | # will be eventually the live nodes. However, there is a small risk 82 | # of 3 successfully voting for 23 before 1 understands what's going 83 | # on, in which case 1 is put into recovery which doesn't finish in 84 | # 10s of the test given that the load is not stopped. This actually 85 | # happened in CI. To avoid test failure, wait for both 1 and 3 to be 86 | # online. 87 | nodes_wait_for_online=[ 88 | f"dbname=regression user=postgres host={self.host_ip} port=15434", 89 | f"dbname=regression user=postgres host={self.host_ip} port=15432"], 90 | stop_load=True) 91 | 92 | self.assertTrue(('commit' in aggs_failure[0]['transfer']['finish']) or 93 | ('commit' in aggs_failure[2]['transfer']['finish'])) 94 | self.assertCommits(aggs_failure[1:2]) # second node 95 | self.assertIsolation(aggs_failure) 96 | 97 | self.assertCommits(aggs) 98 | self.assertIsolation(aggs) 99 | 100 | # can be used for manual running of some particular failure 101 | def _test_single_failure(self): 102 | log.info('### test_single_failure ###') 103 | 104 | failure = CrashRecoverNode('node3') 105 | aggs_failure, aggs = self.performFailure( 106 | failure, 107 | nodes_wait_for_online=["dbname=regression user=postgres host=127.0.0.1 port=15434"], 108 | stop_load=True) 109 | 110 | self.assertCommits(aggs_failure[:2]) 111 | self.assertNoCommits(aggs_failure[2:]) 112 | self.assertIsolation(aggs_failure) 113 | 114 | self.assertCommits(aggs) 115 | 
self.assertIsolation(aggs) 116 | 117 | 118 | # you can run single test with something like 119 | # python -u -m unittest test_recovery.RecoveryTest.test_single_failure 120 | if __name__ == '__main__': 121 | # run all tests 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import time 4 | 5 | from lib.bank_client import MtmClient 6 | from lib.test_helper import * 7 | 8 | class RecoveryTest(unittest.TestCase, TestHelper): 9 | 10 | @classmethod 11 | def setUpClass(cls): 12 | cls.dsns = [ 13 | "dbname=regression user=postgres host=127.0.0.1 port=15432", 14 | "dbname=regression user=postgres host=127.0.0.1 port=15433", 15 | "dbname=regression user=postgres host=127.0.0.1 port=15434" 16 | ] 17 | 18 | print('setUp') 19 | subprocess.check_call(['docker-compose','up', 20 | '--force-recreate', 21 | '--build', 22 | '-d']) 23 | 24 | # Wait for all nodes to become online 25 | [ cls.awaitOnline(dsn) for dsn in cls.dsns ] 26 | 27 | cls.client = MtmClient(cls.dsns, n_accounts=1000) 28 | 29 | @classmethod 30 | def tearDownClass(cls): 31 | print('tearDown') 32 | # subprocess.check_call(['docker-compose','down']) 33 | 34 | def test_regression(self): 35 | # XXX: make smth clever here 36 | time.sleep(10) 37 | subprocess.check_call(['docker', 'exec', 38 | 'node1', 39 | '/pg/mmts/tests/support/docker-regress.sh', 40 | ]) 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /tests/test_syncpoint.py: -------------------------------------------------------------------------------- 1 | # 2 | # Basic syncpoint sanity check: ensure in normal mode (all nodes are up and 3 | # running) old wal files are erased once they are not needed anymore. 
4 | # On the other hand we must ensure that if a node of the cluster is out of 5 | # order the older wal files needed for the node recovery are NOT erased. 6 | # 7 | 8 | import unittest 9 | import time 10 | import subprocess 11 | import datetime 12 | import docker 13 | import warnings 14 | import pprint 15 | import logging 16 | 17 | import lib.log_helper # configures loggers 18 | from lib.bank_client import MtmClient 19 | from lib.failure_injector import * 20 | from lib.test_helper import * 21 | 22 | log = logging.getLogger('root') 23 | 24 | class SyncpointTest(MMTestCase, TestHelper): 25 | # Returns the newest wal 26 | def _get_last_wal(self, dsn): 27 | return self.nodeSelect(dsn, "SELECT name FROM pg_ls_waldir() WHERE " 28 | "name ~ '^[0-9A-F]+$' ORDER BY " 29 | "name DESC LIMIT 1")[0][0] 30 | 31 | def _get_last_wals(self, dsns): 32 | return [self._get_last_wal(dsn) for dsn in dsns] 33 | 34 | # Returns the oldest existing wal 35 | def _get_first_wal(self, dsn): 36 | # recycle old segments 37 | self.nodeExecute(dsn, ["CHECKPOINT"]) 38 | return self.nodeSelect(dsn, "SELECT name FROM pg_ls_waldir() WHERE " 39 | "name ~ '^[0-9A-F]+$' ORDER BY " 40 | "name LIMIT 1")[0][0] 41 | 42 | def _get_first_wals(self, dsns): 43 | return [self._get_first_wal(dsn) for dsn in dsns] 44 | 45 | # get restart_lsn segment of slot to the recipient node id. 
46 | def _get_slot_wal(self, dsn, recipient): 47 | return self.nodeSelect(dsn, """ 48 | SELECT pg_walfile_name(restart_lsn) 49 | FROM pg_replication_slots WHERE slot_name = 'mtm_slot_{}' 50 | """.format(recipient))[0][0] 51 | 52 | def _get_slot_wals(self, dsns, recipient): 53 | return [self._get_slot_wal(dsn, recipient) for dsn in dsns] 54 | 55 | # Waits (up to iterations * iteration_sleep seconds) 56 | # until at least wals_to_pass segments appear on each node 57 | def _wait_wal(self, dsns, wals_to_pass=5, 58 | iteration_sleep=20, 59 | iterations=1000): 60 | last_wals_initial = self._get_last_wals(dsns) 61 | log.debug("waiting for wal, last_wals_initial={}, first_wals={}" 62 | .format(last_wals_initial, self._get_first_wals(dsns))) 63 | for j in range(iterations): 64 | time.sleep(iteration_sleep) 65 | last_wals = self._get_last_wals(dsns) 66 | log.debug("waiting for wal, last_wals={}, first_wals={}" 67 | .format(last_wals, self._get_first_wals(dsns))) 68 | # xxx: this is only correct for first 4GB of WAL due to the hole in 69 | # WAL file naming 70 | if all(int(lw, 16) - int(lw_i, 16) >= wals_to_pass 71 | for (lw_i, lw) in zip(last_wals_initial, last_wals)): 72 | return 73 | 74 | raise AssertionError('timed out while waiting for wal') 75 | 76 | def _chk_rec_trim(self, dsn, other_dsns, iteration_sleep=2, 77 | iterations=1000): 78 | log.info('checking if wals were trimmed during recovery') 79 | dsns = other_dsns + [dsn] 80 | first_wals_before = self._get_first_wals(dsns) 81 | first_wals = [] 82 | wals_trimmed = False 83 | status = '' 84 | for j in range(iterations): 85 | time.sleep(iteration_sleep) 86 | last_wals = self._get_last_wals(dsns) 87 | first_wals = self._get_first_wals(dsns) 88 | status = self.nodeSelect(dsn, 89 | 'SELECT status from mtm.status()')[0][0] 90 | log.debug("status: %s" % status) 91 | log.debug('first wals - %s, ' % first_wals) 92 | log.debug('last wals - %s' % last_wals) 93 | if status == 'online': 94 | break 95 | wals_trimmed = wals_trimmed 
or all(b= a for (b, a) in zip(slot_wals_before, first_wals_after)): 147 | raise AssertionError('segments on some nodes were trimmed in degraded mode: before={}, after={}'.format(slot_wals_before, first_wals_after)) 148 | 149 | # re-run client in weak mode to allow node to recover 150 | # (but don't stop it completely to make test harder) 151 | self.client.stop() 152 | numworkers = { 153 | 'transfer': 1, 154 | 'sumtotal': 1, 155 | 'inserter': 1 156 | } 157 | self.client.bgrun(numworkers=numworkers) 158 | log.info('getting node 3 up') 159 | failure.stop() 160 | # This allows to connect to MM node during recovery 161 | recovery_dsn = self.dsns[2]+' application_name=mtm_admin' 162 | # Wait for node becomes accessible (in recovery mode) 163 | self.awaitOnline(recovery_dsn) 164 | self._chk_rec_trim(recovery_dsn, self.dsns[:2]) 165 | self.awaitOnline(self.dsns[2]) 166 | # Now stop client 167 | self.client.stop() 168 | 169 | 170 | if __name__ == '__main__': 171 | unittest.main() 172 | -------------------------------------------------------------------------------- /tests/writer.pgb: -------------------------------------------------------------------------------- 1 | \set src random(0, 999) 2 | \set dst random(0, 999) 3 | \set amount random(1, 10) 4 | begin; 5 | update t set v = v - :amount where k=:src; 6 | update t set v = v + :amount where k=:dst; 7 | commit; -------------------------------------------------------------------------------- /tests_testgres/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | venv 3 | __pycache__/ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /tests_testgres/connect.jsh: -------------------------------------------------------------------------------- 1 | 2 | /env --class-path /usr/share/java/postgresql-jdbc/postgresql-jdbc4.jar 3 | 4 | import java.sql.*; 5 | Class.forName("org.postgresql.Driver"); 6 | 7 | int port1 = 12928; 8 | int port2 = 
16682; 9 | int port3 = 18521; 10 | String connstring = String.format("jdbc:postgresql://localhost:%d,localhost:%d,localhost:%d/postgres", port1, port2, port3); 11 | 12 | /* connect to DB */ 13 | Connection con = DriverManager.getConnection(connstring); 14 | 15 | /* show help */ 16 | System.out.println("Use 'con' object!"); 17 | 18 | /* execute some commands */ 19 | System.out.println("Execute 'SELECT 1'"); 20 | Statement stmt = con.createStatement(); 21 | ResultSet rs = stmt.executeQuery("select 1"); 22 | rs.next(); 23 | String s = rs.getString(1); 24 | System.out.println("result = " + s); 25 | -------------------------------------------------------------------------------- /tests_testgres/ddl.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import time 4 | 5 | from mm_cluster import Cluster 6 | 7 | NUM_NODES = 3 8 | 9 | class TestDDL(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | cls.cluster = Cluster(NUM_NODES) 14 | cls.cluster.print_conninfo() 15 | cls.cluster.start().install().await_online((0,1,2)) 16 | # cls.cluster.print_conninfo() 17 | 18 | @classmethod 19 | def tearDownClass(cls): 20 | cls.cluster.stop() 21 | 22 | # Check that recovery properly processes 23 | def test_dll_recovery(self): 24 | # create table while one node is stopped 25 | self.cluster.nodes[2].stop() 26 | self.cluster.await_online((0,1)) 27 | self.cluster.nodes[0].safe_psql(query='create table t(id int primary key)') 28 | 29 | # now if second node didn't store logical message with DDL and third 30 | # node will recover from second then it will not receive this 31 | # 'create table' (PGPRO-1699) 32 | self.cluster.nodes[2].start() 33 | self.cluster.await_online((0,1,2)) 34 | self.cluster.nodes[2].safe_psql(query='insert into t values(42)') 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() -------------------------------------------------------------------------------- 
/tests_testgres/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | ulimit -c unlimited 4 | 5 | CURPATH=`pwd` 6 | BASEDIR=$CURPATH/../../.. 7 | export PATH=$BASEDIR/tmp_install/usr/local/pgsql/bin/:$PATH 8 | export DYLD_LIBRARY_PATH=$BASEDIR/tmp_install/usr/local/pgsql/lib/:$DYLD_LIBRARY_PATH 9 | export DESTDIR=$BASEDIR/tmp_install 10 | 11 | make -C $BASEDIR install 12 | make -C $BASEDIR/contrib/mmts install 13 | 14 | if [ -z "$VIRTUAL_ENV" ]; then 15 | >&2 echo WARNING: not in virtualenv 16 | fi 17 | 18 | # python3 -m unittest discover --pattern=*.py 19 | python3 ddl.py 20 | -------------------------------------------------------------------------------- /tests_testgres/test_failover.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from mm_cluster import Cluster 4 | 5 | 6 | with Cluster(3).start().install() as cluster: 7 | print("Cluster is working") 8 | 9 | node_id = 0 10 | for node in cluster.nodes: 11 | node_id += 1 12 | 13 | print("Node #{}".format(node_id)) 14 | print("\t-> port: {}".format(node.port)) 15 | print("\t-> arbiter port: {}".format(node.mm_port)) 16 | print("\t-> dir: {}".format(node.base_dir)) 17 | print() 18 | 19 | jshell = """ 20 | /env --class-path /usr/share/java/postgresql-jdbc/postgresql-jdbc4.jar 21 | 22 | import java.sql.*; 23 | Class.forName("org.postgresql.Driver"); 24 | 25 | int port1 = {}; 26 | int port2 = {}; 27 | int port3 = {}; 28 | String connstring = String.format("jdbc:postgresql://localhost:%d,localhost:%d,localhost:%d/postgres", port1, port2, port3); 29 | 30 | /* connect to DB */ 31 | Connection con = DriverManager.getConnection(connstring); 32 | 33 | /* show help */ 34 | System.out.println("Use 'con' object!"); 35 | 36 | /* execute some commands */ 37 | System.out.println("Execute 'SELECT 1'"); 38 | Statement stmt = con.createStatement(); 39 | ResultSet rs = stmt.executeQuery("select 
1"); 40 | rs.next(); 41 | String s = rs.getString(1); 42 | System.out.println("result = " + s); 43 | """.format(cluster.nodes[0].port, 44 | cluster.nodes[1].port, 45 | cluster.nodes[2].port) 46 | 47 | with open('connect.jsh', 'w') as f: 48 | f.write(jshell) 49 | print("Now run jshell with connect.jsh") 50 | print() 51 | 52 | print("Press ctrl+C to exit") 53 | 54 | while True: 55 | import time 56 | time.sleep(1) 57 | -------------------------------------------------------------------------------- /tests_testgres/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postgrespro/mmts/baab9238f784d428481ecfa1294e3f9a3910b2d2/tests_testgres/tests/__init__.py -------------------------------------------------------------------------------- /tests_testgres/tests/bootstrap.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from mm_cluster import Cluster 3 | 4 | 5 | class Bootstrap(unittest.TestCase): 6 | def test_bootstrap(self): 7 | with Cluster(3).start().install() as cluster: 8 | for node in cluster.nodes: 9 | status = 'select status from mtm.get_cluster_state()' 10 | 11 | self.assertTrue(node.status()) 12 | self.assertTrue(node.execute('postgres', 'select true')) 13 | self.assertTrue(node.execute('postgres', status)) 14 | -------------------------------------------------------------------------------- /tests_testgres/tests/truncate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import time 4 | 5 | from mm_cluster import Cluster 6 | 7 | 8 | NUM_NODES = 2 9 | BENCH_SEC = 30 10 | 11 | 12 | class TestTruncate(unittest.TestCase): 13 | def test_truncate(self): 14 | with Cluster(NUM_NODES).start().install() as cluster: 15 | assert(NUM_NODES >= 2) 16 | 17 | for node in cluster.nodes: 18 | self.assertTrue(node.status()) 19 | 20 | node_1 = cluster.nodes[0] 21 | 
node_1.pgbench_init(dbname=cluster.dbname) 22 | 23 | pgbench = node_1.pgbench(dbname=cluster.dbname, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.STDOUT, 26 | options=['-T%i' % BENCH_SEC]) 27 | 28 | count = 0 29 | started = time.time() 30 | while time.time() - started < BENCH_SEC: 31 | for node in cluster.nodes: 32 | node.safe_psql(dbname=cluster.dbname, 33 | username=cluster.username, 34 | query='truncate pgbench_history;') 35 | 36 | node.safe_psql(dbname=cluster.dbname, 37 | username=cluster.username, 38 | query='vacuum full;') 39 | 40 | count += 1 41 | 42 | # check that pgbench has been running for at least 1 loop 43 | assert (count > 0 or pgbench.poll is not None) 44 | 45 | time.sleep(0.5) 46 | 47 | assert(count > 0) 48 | print("{}: executed truncate {} times" 49 | .format(self.test_truncate.__name__, count)) 50 | 51 | pgbench.wait() 52 | --------------------------------------------------------------------------------