├── .gitignore ├── .gitlab-ci.yml ├── Cluster.pm ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── doc ├── multimaster.xml ├── multimaster_book.xml ├── readme.md ├── specs │ ├── .gitignore │ ├── MtmGenerations.cfg │ ├── MtmGenerations.tla │ ├── MtmPrimitiveCurrent.cfg │ ├── MtmPrimitiveCurrent.tla │ ├── MtmPrimitiveCurrentMasks.cfg │ ├── MtmPrimitiveCurrentMasks.tla │ ├── MtmPrimitiveCurrentMasksFixed.cfg │ ├── MtmPrimitiveCurrentMasksFixed.tla │ ├── commit.cfg │ ├── commit.md │ ├── commit.tla │ ├── generations2.md │ └── mm_recovery.ipynb ├── stylesheet.css └── stylesheet.xsl ├── expected ├── atx.out ├── multimaster.out ├── regression_ee.diff └── regression_vanilla.diff ├── multimaster--1.0.sql ├── multimaster.control ├── referee ├── Makefile ├── expected │ └── referee.out ├── referee--1.0.sql ├── referee.control └── sql │ └── referee.sql ├── run.pl ├── sql ├── atx.sql └── multimaster.sql ├── src ├── bgwpool.c ├── bkb.c ├── bytebuf.c ├── commit.c ├── ddd.c ├── ddl.c ├── dmq.c ├── global_tx.c ├── include │ ├── bgwpool.h │ ├── bkb.h │ ├── bytebuf.h │ ├── commit.h │ ├── compat.h │ ├── ddd.h │ ├── ddl.h │ ├── dmq.h │ ├── global_tx.h │ ├── logger.h │ ├── messaging.h │ ├── mtm_utils.h │ ├── multimaster.h │ ├── pglogical_config.h │ ├── pglogical_hooks.h │ ├── pglogical_output.h │ ├── pglogical_output │ │ ├── compat.h │ │ └── hooks.h │ ├── pglogical_proto.h │ ├── pglogical_relid_map.h │ ├── receiver.h │ ├── resolver.h │ ├── spill.h │ ├── state.h │ └── syncpoint.h ├── mtm_utils.c ├── multimaster.c ├── pglogical_apply.c ├── pglogical_config.c ├── pglogical_hooks.c ├── pglogical_output.c ├── pglogical_proto.c ├── pglogical_receiver.c ├── pglogical_relid_map.c ├── resolver.c ├── spill.c ├── state.c ├── syncpoint.c └── test_bkb.sage.py ├── t ├── 000_cross._pl ├── 000_deadlock.pl ├── 000_init._pl ├── 001_regress.pl ├── 002_regressmm.pl ├── 003_basic_recovery.pl ├── 004_recovery.pl ├── 005_pgbench.pl ├── 006_pgbenchdl.pl ├── 007_add_stop_node.pl ├── 008_bugfixes.pl └── 
009_identity_func.pl ├── tests ├── .gitignore ├── Pipfile ├── Pipfile.lock ├── deadl.pgb ├── docker-compose.yml ├── docker-entrypoint.sh ├── lib │ ├── __init__.py │ ├── bank_client.py │ ├── failure_injector.py │ ├── log_helper.py │ └── test_helper.py ├── reader.pgb ├── requirements.txt ├── support │ ├── bumptime.c │ ├── docker-regress.sh │ └── two_nodes.yml ├── test_bkb.sage.py ├── test_recovery_random.py ├── test_referee.py ├── test_regression.py ├── test_syncpoint.py └── writer.pgb └── tests_testgres ├── .gitignore ├── connect.jsh ├── ddl.py ├── mm_cluster.py ├── run_tests.sh ├── test_failover.py └── tests ├── __init__.py ├── bootstrap.py └── truncate.py /.gitignore: -------------------------------------------------------------------------------- 1 | /log/ 2 | /results/ 3 | /tmp_check/ 4 | regression.diff.diff -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Run python tests. 2 | 3 | image: pgpgpro/dev:stretch 4 | 5 | .only-default: &only-default 6 | only: 7 | refs: 8 | - merge_requests 9 | - tags 10 | - schedules 11 | - branches 12 | - pushes 13 | - web 14 | - triggers 15 | changes: 16 | - '**/*' 17 | 18 | # Tests are docker-based, and so is gitlab executor itself. We are using a bit 19 | # monstrous (and recommended) approach of running dind 'service' container 20 | # alongside main executor; it runs docker and exposes its socket: 21 | # https://docs.gitlab.com/ee/ci/docker/using_docker_build.html#use-the-docker-executor-with-the-docker-image-docker-in-docker 22 | # These variables tell the executor how to reach the socket. 
23 | # 24 | # The 'docker' hostname is the alias of the service container as described at 25 | # https://docs.gitlab.com/ee/ci/docker/using_docker_images.html#accessing-the-services 26 | .docker_variables: &docker_variables 27 | DOCKER_HOST: tcp://docker:2375/ 28 | # When using dind, it's wise to use the overlayfs driver for 29 | # improved performance. 30 | DOCKER_DRIVER: overlay2 31 | DOCKER_TLS_CERTDIR: "" 32 | 33 | stages: 34 | - build_core_image 35 | - make_check 36 | # hardcoded stuff in python tests doesn't allow to run them in parallel 37 | - recovery random 38 | - referee 39 | - syncpoint 40 | 41 | # builds image with ee core and saves it as an artifact 42 | build_core_image: 43 | <<: *only-default 44 | stage: build_core_image 45 | retry: 1 46 | image: pgpgpro/dev:alpine 47 | # run container providing docker alongside 48 | services: 49 | - docker:dind 50 | variables: 51 | <<: *docker_variables 52 | branch: ee13_mm 53 | artifacts: 54 | expire_in: 24 hours 55 | when: always 56 | paths: 57 | - docker-image/pgmm.tar.gz 58 | - postgrespro.tar.gz 59 | script: 60 | # Add mm_gitlab_ci_ed25519 env var of type 'file' with the key in gitlab 61 | - ssh-agent sh -c 'ssh-add ${mm_gitlab_ci_ed25519}; GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone --depth=1 --branch "${branch}" git@git.postgrespro.ru:pgpro-dev/postgrespro.git' 62 | - cd postgrespro 63 | - docker build -t pgmm . 64 | - cd .. 65 | - mkdir docker-image 66 | - docker save pgmm > docker-image/pgmm.tar 67 | - tar czf docker-image/pgmm.tar.gz docker-image/pgmm.tar 68 | - ls -lah docker-image/ 69 | - rm docker-image/pgmm.tar 70 | # also save archived sources of core 71 | - tar -czf postgrespro.tar.gz postgrespro 72 | 73 | # make check. 
We build core from sources again which is a bit ugly as we already 74 | # built the image, but let's not wobble here with yet another docker 75 | make_check: 76 | <<: *only-default 77 | stage: make_check 78 | # gives us the archive with core sources 79 | dependencies: 80 | - build_core_image 81 | artifacts: 82 | when: always 83 | paths: 84 | - postgrespro/contrib/mmts/tmp_check/log 85 | - postgrespro/contrib/mmts/tmp_check/regress_outdir 86 | script: 87 | - ls 88 | - tar -xzf postgrespro.tar.gz 89 | - shopt -s extglob 90 | - rm -rf postgrespro/contrib/mmts; mkdir postgrespro/contrib/mmts 91 | - mv !(postgrespro) postgrespro/contrib/mmts 92 | - cd postgrespro 93 | - CFLAGS="-ggdb3 -O0" ./configure --enable-cassert --enable-debug --with-perl --enable-tap-tests 94 | - make -j8 95 | - cd contrib/mmts && make check 96 | 97 | recovery_random: 98 | <<: *only-default 99 | stage: recovery random 100 | image: pgpgpro/dev:alpine 101 | services: 102 | - docker:dind 103 | dependencies: 104 | - build_core_image 105 | artifacts: 106 | when: on_failure 107 | paths: 108 | - tests/logs1 109 | - tests/logs2 110 | - tests/logs3 111 | variables: 112 | <<: *docker_variables 113 | before_script: 114 | - docker info 115 | script: 116 | - tar -xzvf docker-image/pgmm.tar.gz 117 | - docker load -i docker-image/pgmm.tar 118 | - cd tests/ 119 | - env CI=1 python3 -u test_recovery_random.py --failfast 120 | 121 | referee: 122 | extends: recovery_random 123 | stage: referee 124 | artifacts: 125 | paths: 126 | - tests/logs1 127 | - tests/logs2 128 | - tests/logs_referee 129 | script: 130 | - tar -xzvf docker-image/pgmm.tar.gz 131 | - docker load -i docker-image/pgmm.tar 132 | - cd tests/ 133 | - env CI=1 python3 -u test_referee.py --failfast 134 | 135 | syncpoint: 136 | extends: recovery_random 137 | stage: syncpoint 138 | script: 139 | - tar -xzvf docker-image/pgmm.tar.gz 140 | - docker load -i docker-image/pgmm.tar 141 | - cd tests/ 142 | - env CI=1 python3 -u test_syncpoint.py --failfast 143 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pgmm 2 | 3 | RUN mkdir /pg/mmts 4 | COPY ./ /pg/mmts/ 5 | 6 | RUN export USE_PGXS=1 && \ 7 | cd /pg/mmts && make clean && make install 8 | 9 | # pg_regress client assumes such dir exists on server 10 | RUN cp /pg/src/src/test/regress/*.so /pg/install/lib/postgresql/ 11 | USER postgres 12 | ENV PGDATA /pg/data 13 | ENTRYPOINT ["/pg/mmts/tests/docker-entrypoint.sh"] 14 | 15 | EXPOSE 5432 16 | CMD ["postgres"] 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = multimaster 2 | DATA = multimaster--1.0.sql 3 | OBJS = src/multimaster.o src/dmq.o src/commit.o src/bytebuf.o src/bgwpool.o \ 4 | src/pglogical_output.o src/pglogical_proto.o src/pglogical_receiver.o \ 5 | src/pglogical_apply.o src/pglogical_hooks.o src/pglogical_config.o \ 6 | src/pglogical_relid_map.o src/ddd.o src/bkb.o src/spill.o src/state.o \ 7 | src/resolver.o src/ddl.o src/syncpoint.o src/global_tx.o src/mtm_utils.o 8 | MODULE_big = multimaster 9 | 10 | ifndef USE_PGXS # hmm, user didn't requested to use pgxs 11 | # relative path to this makefile 12 | mkfile_path := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) 13 | # relative path to dir with this makefile 14 | mkfile_dir := $(dir $(mkfile_path)) 15 | # abs path to dir with this makefile 16 | mkfile_abspath := $(shell cd $(mkfile_dir) && pwd -P) 17 | # parent dir name of directory with makefile 18 | parent_dir_name := $(shell basename $(shell dirname $(mkfile_abspath))) 19 | ifneq ($(parent_dir_name),contrib) # a-ha, but the extension is not inside 'contrib' dir 20 | USE_PGXS := 1 # so use it anyway, most probably that's what the user wants 21 | endif 22 | endif 23 | # $(info) is introduced in 3.81, and PG doesn't support makes 
older than 3.80 24 | # ifeq ($(MAKE_VERSION),3.80) 25 | # $(warning $$USE_PGXS is [${USE_PGXS}] (we use it automatically if not in contrib dir)) 26 | # else 27 | # $(info $$USE_PGXS is [${USE_PGXS}] (we use it automatically if not in contrib dir)) 28 | # endif 29 | 30 | ifdef USE_PGXS # use pgxs 31 | # You can specify path to pg_config in PG_CONFIG var 32 | ifndef PG_CONFIG 33 | PG_CONFIG := pg_config 34 | endif 35 | PG_CPPFLAGS += -I$(CURDIR)/src/include 36 | # add installation top include directory for libpq header 37 | # (seems like server/ dir is added by pgxs) 38 | PG_CPPFLAGS += -I$(shell $(PG_CONFIG) --includedir) 39 | SHLIB_LINK += -lpq # add libpq 40 | PGXS := $(shell $(PG_CONFIG) --pgxs) 41 | include $(PGXS) 42 | 43 | else # assume the extension is in contrib/ dir of pg distribution 44 | PG_CPPFLAGS += -I$(top_srcdir)/$(subdir)/src/include 45 | PG_CPPFLAGS += -I$(libpq_srcdir) # include libpq-fe, defined in Makefile.global.in 46 | SHLIB_LINK = $(libpq) # defined in Makefile.global.in 47 | subdir = contrib/mmts 48 | top_builddir = ../.. 49 | include $(top_builddir)/src/Makefile.global 50 | # in ee, install pathman as well 51 | ifeq (${PGPRO_EDITION}, enterprise) 52 | EXTRA_INSTALL=contrib/pg_pathman 53 | endif 54 | include $(top_srcdir)/contrib/contrib-global.mk 55 | endif # USE_PGXS 56 | 57 | REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX) 58 | export REGRESS_SHLIB 59 | 60 | .PHONY: all 61 | 62 | # recurse down to referee/ on install. 63 | # (I'd use $(call recurse...), but how can we pass USE_PGXS there? 
64 | referee-install: 65 | USE_PGXS=$(USE_PGXS) $(MAKE) -C referee install 66 | install: referee-install 67 | 68 | all: multimaster.so 69 | 70 | submake-regress: 71 | $(MAKE) -C $(top_builddir)/src/test/regress all 72 | $(MAKE) -C $(top_builddir)/src/test/regress tablespace-setup 73 | 74 | # all .pl tests should pass now, but let's see what the buildfarm says 75 | # ifndef MTM_ALL 76 | # PROVE_TESTS ?= 77 | # endif 78 | PROVE_FLAGS += --timer 79 | ifndef USE_PGXS 80 | check: temp-install submake-regress 81 | $(prove_check) 82 | else # pgxs build 83 | # Note that for PGXS build we override here bail-out recipe defined in pgxs.mk, 84 | # but well, why should we chose another name? 85 | # submake-regress won't work as we have no access to the source; we assume 86 | # regress is already installed 87 | # final spell is inspired by 88 | # https://www.2ndquadrant.com/en/blog/using-postgresql-tap-framework-extensions/ 89 | # and Makefile.global.in which is obviously the original source 90 | check: 91 | rm -rf '$(CURDIR)'/tmp_check 92 | $(MKDIR_P) '$(CURDIR)'/tmp_check 93 | PGXS=$(PGXS) TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl) 94 | endif 95 | 96 | # PG_PROVE_FLAGS adds PostgresNode and friends include dir 97 | start: temp-install 98 | rm -rf '$(CURDIR)'/tmp_check 99 | $(MKDIR_P) '$(CURDIR)'/tmp_check 100 | cd $(srcdir) && TESTDIR='$(CURDIR)' \ 101 | $(with_temp_install) \ 102 | PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' \ 103 | perl $(PG_PROVE_FLAGS) run.pl --action=start $(RUN_OPTS) 104 | 105 | stop: 106 | cd $(srcdir) && TESTDIR='$(CURDIR)' \ 107 | PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' \ 108 | perl $(PG_PROVE_FLAGS) run.pl --action=stop $(RUN_OPTS) 109 | 110 | # for manual testing: runs core regress tests on 'make start'ed cluster 111 | run-pg-regress: submake-regress 112 | cd 
$(CURDIR)/$(top_builddir)/src/test/regress && \ 113 | $(with_temp_install) \ 114 | PGPORT='65432' \ 115 | PGHOST='127.0.0.1' \ 116 | PGUSER='$(USER)' \ 117 | ./pg_regress \ 118 | --bindir='' \ 119 | --use-existing \ 120 | --schedule=$(abs_top_srcdir)/src/test/regress/parallel_schedule \ 121 | --dlpath=$(CURDIR)/$(top_builddir)/src/test/regress \ 122 | --inputdir=$(abs_top_srcdir)/src/test/regress 123 | 124 | # for manual testing: runs contrib/test_partition on 'make start'ed cluster 125 | run-pathman-regress: 126 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \ 127 | $(with_temp_install) \ 128 | PGPORT='65432' \ 129 | PGHOST='127.0.0.1' \ 130 | PGUSER='$(USER)' \ 131 | ./pg_regress \ 132 | --bindir='' \ 133 | --use-existing \ 134 | --temp-config=$(abs_top_srcdir)/contrib/test_partition/pg_pathman.add \ 135 | --inputdir=$(abs_top_srcdir)/contrib/test_partition/ \ 136 | partition 137 | 138 | 139 | # bgw-based partition spawning is not supported by mm, so I 140 | # commenting out body of set_spawn_using_bgw() sql function before 141 | # running that 142 | run-pathman-regress-ext: 143 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \ 144 | $(with_temp_install) \ 145 | PGPORT='65432' \ 146 | PGHOST='127.0.0.1' \ 147 | PGUSER='$(USER)' \ 148 | ./pg_regress \ 149 | --bindir='' \ 150 | --use-existing \ 151 | --temp-config=$(abs_top_srcdir)/contrib/pg_pathman/conf.add \ 152 | --inputdir=$(abs_top_srcdir)/contrib/pg_pathman/ \ 153 | pathman_array_qual pathman_basic pathman_bgw pathman_calamity pathman_callbacks \ 154 | pathman_column_type pathman_cte pathman_domains pathman_dropped_cols pathman_expressions \ 155 | pathman_foreign_keys pathman_gaps pathman_inserts pathman_interval pathman_join_clause \ 156 | pathman_lateral pathman_hashjoin pathman_mergejoin pathman_only pathman_param_upd_del \ 157 | pathman_permissions pathman_rebuild_deletes pathman_rebuild_updates pathman_rowmarks \ 158 | pathman_runtime_nodes pathman_subpartitions pathman_update_node 
pathman_update_triggers \ 159 | pathman_upd_del pathman_utility_stmt pathman_views 160 | 161 | pg-regress: | start run-pg-regress 162 | pathman-regress: | start run-pathman-regress-ext stop 163 | installcheck: 164 | $(prove_installcheck) 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # multimaster 2 | 3 | multimaster is a Postgres extension with a set of core patches that turn the 4 | DBMS into a synchronous shared-nothing symmetric cluster providing high 5 | availability with strong consistency and read scalability. 6 | 7 | It offers the following benefits, some of which are not available in traditional streaming replication based solutions: 8 | * Fault tolerance and automatic node recovery 9 | * Fast failover 10 | * Both read and write transactions can be executed on any node. 11 | * Read scalability 12 | * Working with temporary tables on each cluster node 13 | * Online minor upgrades 14 | 15 | ## Documentation 16 | 17 | [current documentation](https://postgrespro.github.io/mmts/) 18 | 19 | Documentation for versions released with PostgresPro Enterprise can be found 20 | [here](https://postgrespro.ru/docs/enterprise/current/multimaster?lang=en). 21 | 22 | ## Building from source 23 | 24 | Since multimaster depends on core patches, both Postgres and extension must be compiled. The patched version (based on Postgres 13) is available [here](https://github.com/postgrespro/postgres_cluster/tree/rel13_mm_2). Follow the [documentation](https://www.postgresql.org/docs/current/installation.html) to build it. 
25 | 26 | Then enter the build directory and install the extension with 27 | ```shell 28 | cd contrib 29 | git clone https://github.com/postgrespro/mmts/ 30 | cd mmts 31 | make install 32 | ``` 33 | -------------------------------------------------------------------------------- /doc/multimaster_book.xml: -------------------------------------------------------------------------------- 1 | 5 | ]> 6 | 7 | 8 | 9 | multimaster Documentation 10 | &multimaster; 11 | 12 | 13 | -------------------------------------------------------------------------------- /doc/readme.md: -------------------------------------------------------------------------------- 1 | # Generating documentation 2 | ``` 3 | xmllint --noout --valid multimaster_book.xml 4 | xsltproc stylesheet.xsl multimaster_book.xml >multimaster.html 5 | ``` 6 | 7 | and don't forget to install the result on postgrespro.github.io: 8 | ``` 9 | cp multimaster.html stylesheet.css /mmts/ 10 | ``` -------------------------------------------------------------------------------- /doc/specs/.gitignore: -------------------------------------------------------------------------------- 1 | *.toolbox/ 2 | .ipynb_checkpoints/ 3 | -------------------------------------------------------------------------------- /doc/specs/MtmGenerations.cfg: -------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* SYMMETRY definition 10 | SYMMETRY perms 11 | \* CONSTANT definitions 12 | CONSTANT 13 | max_xacts = 3 14 | CONSTANT 15 | max_gen = 3 16 | \* INIT definition 17 | INIT 18 | Init 19 | \* NEXT definition 20 | NEXT 21 | Next 22 | \* INVARIANT definition 23 | INVARIANT 24 | OrderOk 25 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrent.cfg: 
-------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* CONSTANT definitions 10 | \* INIT definition 11 | INIT 12 | Init 13 | \* NEXT definition 14 | NEXT 15 | Next 16 | \* INVARIANT definition 17 | INVARIANT 18 | OrderOk 19 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrent.tla: -------------------------------------------------------------------------------- 1 | ---- MODULE MtmPrimitiveCurrent ---- 2 | 3 | \* Primitive (meaning immediate PREPARE everywhere and immediate recovery) 4 | \* but pretty close model of current multimaster. 5 | \* - There is an obvious sausage problem, shown by TLC. One of sort of its appearances 6 | \* is that we push xact into node without checking its state at all; xact is 7 | \* just appended to all nodes coordinator regards as 'enabled'. 8 | \* - Also 'works' only on 3 nodes because we recover from single node. 9 | \* - I don't see any reason for RECOVERED->ONLINE transition condition, 10 | \* and associated maintenance of walsenders/walreceivers masks. We can allow 11 | \* our xacts even just after recovery or (simpler for selecting xacts needing 12 | \* resolving) when majority is enabled. 13 | \* - I also don't see the point of recovery phase in RECOVERED|ONLINE: we don't pull 14 | \* all origins and thus it doesn't save us from sausage-like problems, 15 | \* but we still don't confirm xacts and don't allow parallel apply in it. 
16 | 17 | \* model depth constraint is hardcoded in do_tx 18 | 19 | EXTENDS Integers, Sequences, FiniteSets, TLC 20 | VARIABLES state, logs 21 | 22 | CONSTANT nodes 23 | 24 | n_nodes == Cardinality(nodes) 25 | 26 | 27 | \************************************************************************************** 28 | \* Helpers 29 | \************************************************************************************** 30 | 31 | \* is s1 subsequence of s2? 32 | IsSubSeq(s1, s2) == 33 | /\ Len(s1) <= Len(s2) 34 | /\ SubSeq(s2, 1, Len(s1)) = s1 35 | 36 | 37 | quorum(mask) == Cardinality({i \in DOMAIN mask : mask[i] = 1}) >= (n_nodes \div 2 + 1) 38 | 39 | max(set) == IF set = {} THEN 0 ELSE CHOOSE e1 \in set: \A e2 \in set: e1 >= e2 40 | 41 | maxlen(seqs) == max({Len(seqs[n]) : n \in DOMAIN seqs}) 42 | 43 | \* max lsn of given origin in given log 44 | maxlsn(log, origin) == max({log[j].olsn : j \in {i \in DOMAIN log : log[i].origin = origin }}) 45 | 46 | \* how far each node's changes are applied in given log? 
47 | rep_state(log) == [n \in nodes |-> maxlsn(log,n)] 48 | 49 | log_newer_than(log, origin_vec) == SelectSeq(log, LAMBDA e: e.olsn > origin_vec[e.origin]) 50 | 51 | \*is_increasing(s) == IF Len(s) > 1 52 | \* THEN {s[i] < s[i+1] : i \in 1..(Len(s)-1)} = {TRUE} 53 | \* ELSE TRUE 54 | 55 | \* returns not just new status but record with new state because masks might change 56 | \* old status is taken from state[n] 57 | new_state(n, view, enabled, wsndmask, wrcvmask) == 58 | LET 59 | old_status == state[n].status 60 | new_status == CASE 61 | \* This is hardly needed; safety won't be altered if we are in recovery 62 | \* with less than majority in view mask 63 | ~ quorum(view) -> "disabled" 64 | [] quorum(view) /\ old_status = "disabled" -> "recovery" 65 | \* recovery -> recovered done explicitly in do_recovery() 66 | [] quorum(view) /\ old_status = "recovered" /\ view = enabled /\ view = wsndmask /\ view = wrcvmask -> "online" 67 | \* I don't think we need that, nothing should be prepared with minority enabled anyway 68 | [] quorum(view) /\ old_status = "online" /\ ~quorum(enabled) -> "disabled" 69 | [] OTHER -> old_status 70 | \* all zeros but me 71 | zeros == [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 72 | new_enabled == IF new_status = "disabled" THEN zeros ELSE enabled 73 | new_wsndmask == IF new_status = "disabled" THEN zeros ELSE wsndmask 74 | new_wrcvmask == IF new_status = "disabled" THEN zeros ELSE wrcvmask 75 | IN 76 | \* next_lsn goes unchanged 77 | [state[n] EXCEPT !.status = new_status, 78 | !.view = view, 79 | !.enabled = new_enabled, 80 | !.walsenders = new_wsndmask, 81 | !.walreceivers = new_wrcvmask] 82 | 83 | 84 | \************************************************************************************** 85 | \* Initial 86 | \************************************************************************************** 87 | 88 | 89 | Init == /\ state = [n \in nodes |-> [ 90 | next_lsn |-> 1, 91 | status |-> "disabled", 92 | view |-> [[_n \in nodes |-> 0] EXCEPT ![n] 
= 1], 93 | enabled |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 94 | walsenders |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 95 | walreceivers |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 96 | ]] 97 | /\ logs = [n \in nodes |-> << >>] 98 | 99 | \************************************************************************************** 100 | \* Actions 101 | \************************************************************************************** 102 | 103 | 104 | \* n1 disconnects n2 105 | disconnect(n1, n2) == 106 | /\ n1 /= n2 107 | /\ state[n1].view[n2] = 1 108 | 109 | /\ logs' = logs 110 | /\ LET 111 | view == [state[n1].view EXCEPT ![n2] = 0] 112 | enabled == [state[n1].enabled EXCEPT ![n2] = 0] 113 | n1_state == new_state(n1, view, enabled, state[n1].walsenders, state[n2].walreceivers) 114 | IN 115 | state' = [state EXCEPT ![n1] = n1_state] 116 | 117 | 118 | connect(n1, n2) == 119 | /\ n1 /= n2 120 | /\ state[n1].view[n2] = 0 121 | 122 | /\ logs' = logs 123 | /\ LET 124 | view == [state[n1].view EXCEPT ![n2] = 1] 125 | n1_state == new_state(n1, view, state[n1].enabled, state[n1].walsenders, state[n1].walreceivers) 126 | IN 127 | state' = [state EXCEPT ![n1] = n1_state] 128 | 129 | \* n1 recovers from n2 130 | do_recovery(n1, n2) == 131 | /\ n1 /= n2 132 | /\ state[n1].status = "recovery" 133 | /\ state[n1].view[n2] = 1 134 | \* Apparently this ensures we won't keep dead node as enabled 135 | /\ state[n2].view[n1] = 1 136 | 137 | /\ LET 138 | origin_vec == rep_state(logs[n1]) 139 | new_entries == log_newer_than(logs[n2], origin_vec) 140 | \* enable n1 141 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 142 | n2_state == new_state(n2, state[n2].view, n2_enabled, state[n2].walsenders, state[n2].walreceivers) 143 | IN 144 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 145 | /\ state' = [state EXCEPT ![n1].status = "recovered", 146 | ![n2] = n2_state] 147 | 148 | 149 | do_recovered(n1, n2) == 150 | /\ n1 /= n2 151 | /\ (state[n1].status = "recovered" \/ 
state[n1].status = "online") 152 | /\ state[n1].view[n2] = 1 153 | /\ state[n2].view[n1] = 1 154 | 155 | /\ LET 156 | our_last_lsn == maxlsn(logs[n1], n2) 157 | new_entries == SelectSeq(logs[n2], LAMBDA e: e.origin = n2 /\ e.olsn > our_last_lsn ) 158 | IN 159 | logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 160 | /\ LET 161 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 162 | n2_walsenders == [state[n2].walsenders EXCEPT ![n1] = 1] 163 | n2_state == new_state(n2, state[n2].view, n2_enabled, n2_walsenders, state[n2].walreceivers) 164 | n1_walreceivers == [state[n1].walreceivers EXCEPT ![n2] = 1] 165 | n1_state == new_state(n1, state[n1].view, state[n1].enabled, state[n1].walsenders, n1_walreceivers) 166 | IN 167 | state' = [state EXCEPT ![n1] = n1_state, 168 | ![n2] = n2_state] 169 | 170 | 171 | do_tx(node) == 172 | \* model depth constraint 173 | /\ Len(logs[node]) <= 4 174 | /\ state[node].status = "online" 175 | /\ quorum(state[node].enabled) 176 | /\ logs' = [n \in nodes |-> 177 | IF state[node].enabled[n] = 1 178 | THEN Append(logs[n], [origin |-> node, olsn |-> state[node].next_lsn]) 179 | ELSE logs[n]] 180 | /\ state' = [state EXCEPT ![node].next_lsn = state[node].next_lsn + 1] 181 | 182 | 183 | \************************************************************************************** 184 | \* Final spec 185 | \************************************************************************************** 186 | 187 | 188 | Next == \/ \E n1,n2 \in nodes : connect(n1,n2) 189 | \/ \E n1,n2 \in nodes : disconnect(n1,n2) 190 | \/ \E n1,n2 \in nodes : do_recovery(n1,n2) 191 | \/ \E n1,n2 \in nodes : do_recovered(n1,n2) 192 | \/ \E n \in nodes : do_tx(n) 193 | 194 | spec == Init /\ [][Next]_<> 195 | 196 | 197 | \************************************************************************************** 198 | \* Stuff to check 199 | \************************************************************************************** 200 | 201 | \* Make sure every log is sublog of the 
longest one 202 | OrderOk == 203 | LET 204 | most_advanced_node == CHOOSE n1 \in nodes: \A n2 \in nodes: Len(logs[n1]) >= Len(logs[n2]) 205 | IN 206 | \A n \in nodes: IsSubSeq(logs[n], logs[most_advanced_node]) 207 | 208 | ==== -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrentMasks.cfg: -------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* CONSTANT definitions 10 | \* INIT definition 11 | INIT 12 | Init 13 | \* NEXT definition 14 | NEXT 15 | Next 16 | \* INVARIANT definition 17 | INVARIANT 18 | OrderOk 19 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrentMasks.tla: -------------------------------------------------------------------------------- 1 | ---- MODULE MtmPrimitiveCurrentMasks ---- 2 | 3 | \* This just adds to MtmPrimitiveCurrent.tla tracking of enabled masks: while 4 | \* doing xact coordinator stamps it with current enabled mask. Others apply it 5 | \* in normal mode iff their enabled mask is exactly the same. TLC demonstrates 6 | \* here that we still have a problem because in do_recovered we ask to enable us 7 | \* without pulling all origins. 8 | 9 | \* model depth constraint is hardcoded in do_tx 10 | 11 | EXTENDS Integers, Sequences, FiniteSets, TLC 12 | VARIABLES state, logs 13 | 14 | CONSTANT nodes 15 | 16 | n_nodes == Cardinality(nodes) 17 | 18 | 19 | \************************************************************************************** 20 | \* Helpers 21 | \************************************************************************************** 22 | 23 | \* is s1 subsequence of s2? 
24 | IsSubSeq(s1, s2) == 25 | /\ Len(s1) <= Len(s2) 26 | /\ SubSeq(s2, 1, Len(s1)) = s1 27 | 28 | 29 | quorum(mask) == Cardinality({i \in DOMAIN mask : mask[i] = 1}) >= (n_nodes \div 2 + 1) 30 | 31 | max(set) == IF set = {} THEN 0 ELSE CHOOSE e1 \in set: \A e2 \in set: e1 >= e2 32 | 33 | maxlen(seqs) == max({Len(seqs[n]) : n \in DOMAIN seqs}) 34 | 35 | \* max lsn of given origin in given log 36 | maxlsn(log, origin) == max({log[j].olsn : j \in {i \in DOMAIN log : log[i].origin = origin }}) 37 | 38 | \* how far each node's changes are applied in given log? 39 | rep_state(log) == [n \in nodes |-> maxlsn(log,n)] 40 | 41 | log_newer_than(log, origin_vec) == SelectSeq(log, LAMBDA e: e.olsn > origin_vec[e.origin]) 42 | 43 | \*is_increasing(s) == IF Len(s) > 1 44 | \* THEN {s[i] < s[i+1] : i \in 1..(Len(s)-1)} = {TRUE} 45 | \* ELSE TRUE 46 | 47 | \* returns not just new status but record with new state because masks might change 48 | \* old status is taken from state[n] 49 | new_state(n, view, enabled, wsndmask, wrcvmask) == 50 | LET 51 | old_status == state[n].status 52 | new_status == CASE 53 | \* This is hardly needed; safety won't be altered if we are in recovery 54 | \* with less than majority in view mask 55 | ~ quorum(view) -> "disabled" 56 | [] quorum(view) /\ old_status = "disabled" -> "recovery" 57 | \* recovery -> recovered done explicitly in do_recovery() 58 | [] quorum(view) /\ old_status = "recovered" /\ view = enabled /\ view = wsndmask /\ view = wrcvmask -> "online" 59 | \* I don't think we need that, nothing should be prepared with minority enabled anyway 60 | [] quorum(view) /\ old_status = "online" /\ ~quorum(enabled) -> "disabled" 61 | [] OTHER -> old_status 62 | \* all zeros but me 63 | zeros == [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 64 | new_enabled == IF new_status = "disabled" THEN zeros ELSE enabled 65 | new_wsndmask == IF new_status = "disabled" THEN zeros ELSE wsndmask 66 | new_wrcvmask == IF new_status = "disabled" THEN zeros ELSE wrcvmask 67 | 
IN 68 | \* next_lsn goes unchanged 69 | [state[n] EXCEPT !.status = new_status, 70 | !.view = view, 71 | !.enabled = new_enabled, 72 | !.walsenders = new_wsndmask, 73 | !.walreceivers = new_wrcvmask] 74 | 75 | 76 | \************************************************************************************** 77 | \* Initial 78 | \************************************************************************************** 79 | 80 | 81 | Init == /\ state = [n \in nodes |-> [ 82 | next_lsn |-> 1, 83 | status |-> "disabled", 84 | view |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 85 | enabled |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 86 | walsenders |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1], 87 | walreceivers |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1] 88 | ]] 89 | /\ logs = [n \in nodes |-> << >>] 90 | 91 | \************************************************************************************** 92 | \* Actions 93 | \************************************************************************************** 94 | 95 | 96 | \* n1 disconnects n2 97 | disconnect(n1, n2) == 98 | /\ n1 /= n2 99 | /\ state[n1].view[n2] = 1 100 | 101 | /\ logs' = logs 102 | /\ LET 103 | view == [state[n1].view EXCEPT ![n2] = 0] 104 | enabled == [state[n1].enabled EXCEPT ![n2] = 0] 105 | n1_state == new_state(n1, view, enabled, state[n1].walsenders, state[n2].walreceivers) 106 | IN 107 | state' = [state EXCEPT ![n1] = n1_state] 108 | 109 | 110 | connect(n1, n2) == 111 | /\ n1 /= n2 112 | /\ state[n1].view[n2] = 0 113 | 114 | /\ logs' = logs 115 | /\ LET 116 | view == [state[n1].view EXCEPT ![n2] = 1] 117 | n1_state == new_state(n1, view, state[n1].enabled, state[n1].walsenders, state[n1].walreceivers) 118 | IN 119 | state' = [state EXCEPT ![n1] = n1_state] 120 | 121 | \* n1 recovers from n2 122 | do_recovery(n1, n2) == 123 | /\ n1 /= n2 124 | /\ state[n1].status = "recovery" 125 | /\ state[n1].view[n2] = 1 126 | \* Apparently this ensures we won't keep dead node as enabled 127 | /\ state[n2].view[n1] = 1 128 | 
129 | /\ LET 130 | origin_vec == rep_state(logs[n1]) 131 | new_entries == log_newer_than(logs[n2], origin_vec) 132 | \* enable n1 133 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 134 | n2_state == new_state(n2, state[n2].view, n2_enabled, state[n2].walsenders, state[n2].walreceivers) 135 | IN 136 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 137 | /\ state' = [state EXCEPT ![n1].status = "recovered", 138 | ![n2] = n2_state] 139 | 140 | 141 | do_recovered(n1, n2) == 142 | /\ n1 /= n2 143 | /\ (state[n1].status = "recovered" \/ state[n1].status = "online") 144 | /\ state[n1].view[n2] = 1 145 | /\ state[n2].view[n1] = 1 146 | 147 | /\ LET 148 | our_last_lsn == maxlsn(logs[n1], n2) 149 | new_entries == SelectSeq(logs[n2], LAMBDA e: e.origin = n2 /\ e.olsn > our_last_lsn ) 150 | IN 151 | /\ \A k \in DOMAIN new_entries: new_entries[k].participants = state[n1].enabled 152 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries] 153 | /\ LET 154 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1] 155 | n2_walsenders == [state[n2].walsenders EXCEPT ![n1] = 1] 156 | n2_state == new_state(n2, state[n2].view, n2_enabled, n2_walsenders, state[n2].walreceivers) 157 | n1_walreceivers == [state[n1].walreceivers EXCEPT ![n2] = 1] 158 | n1_state == new_state(n1, state[n1].view, state[n1].enabled, state[n1].walsenders, n1_walreceivers) 159 | IN 160 | state' = [state EXCEPT ![n1] = n1_state, 161 | ![n2] = n2_state] 162 | 163 | 164 | do_tx(node) == 165 | \* model depth constraint 166 | /\ Len(logs[node]) <= 4 167 | /\ state[node].status = "online" 168 | /\ quorum(state[node].enabled) 169 | \* make sure set of enabled nodes is the same on all participants 170 | /\ \A n \in nodes: state[node].enabled[n] = 0 \/ state[n].enabled = state[node].enabled 171 | /\ logs' = [n \in nodes |-> 172 | IF state[node].enabled[n] = 1 173 | THEN Append(logs[n], [origin |-> node, olsn |-> state[node].next_lsn, participants |-> state[node].enabled]) 174 | ELSE logs[n]] 175 | /\ 
state' = [state EXCEPT ![node].next_lsn = state[node].next_lsn + 1] 176 | 177 | 178 | \************************************************************************************** 179 | \* Final spec 180 | \************************************************************************************** 181 | 182 | 183 | Next == \/ \E n1,n2 \in nodes : connect(n1,n2) 184 | \/ \E n1,n2 \in nodes : disconnect(n1,n2) 185 | \/ \E n1,n2 \in nodes : do_recovery(n1,n2) 186 | \/ \E n1,n2 \in nodes : do_recovered(n1,n2) 187 | \/ \E n \in nodes : do_tx(n) 188 | 189 | spec == Init /\ [][Next]_<> 190 | 191 | 192 | \************************************************************************************** 193 | \* Stuff to check 194 | \************************************************************************************** 195 | 196 | \* Make sure every log is sublog of the longest one 197 | OrderOk == 198 | LET 199 | most_advanced_node == CHOOSE n1 \in nodes: \A n2 \in nodes: Len(logs[n1]) >= Len(logs[n2]) 200 | IN 201 | \A n \in nodes: IsSubSeq(logs[n], logs[most_advanced_node]) 202 | 203 | ==== -------------------------------------------------------------------------------- /doc/specs/MtmPrimitiveCurrentMasksFixed.cfg: -------------------------------------------------------------------------------- 1 | \* MV CONSTANT declarations 2 | CONSTANTS 3 | n1 = n1 4 | n2 = n2 5 | n3 = n3 6 | \* MV CONSTANT definitions 7 | CONSTANT 8 | nodes = {n1, n2, n3} 9 | \* SYMMETRY definition 10 | SYMMETRY perms 11 | \* CONSTANT definitions 12 | CONSTANT 13 | depth = 3 14 | \* INIT definition 15 | INIT 16 | Init 17 | \* NEXT definition 18 | NEXT 19 | Next 20 | \* INVARIANT definition 21 | INVARIANT 22 | OrderOk 23 | \* Generated on Fri Dec 06 18:48:51 MSK 2019 -------------------------------------------------------------------------------- /doc/specs/commit.cfg: -------------------------------------------------------------------------------- 1 | SPECIFICATION spec 2 | INVARIANTS consistency types_correct1 
types_correct2 -------------------------------------------------------------------------------- /doc/stylesheet.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&subset=cyrillic'); 2 | 3 | body { 4 | font-family: 'Roboto',Arial,sans-serif; 5 | } 6 | 7 | body { 8 | font-size: 18px; 9 | font-weight: 300; 10 | } 11 | 12 | /* ../media/css/docs.css */ 13 | .navheader th { text-align: center; } /* anti-bootstrap */ 14 | 15 | .navheader tbody tr:nth-child(1) th { /* временно убрать ненужную строчку */ 16 | display: none; 17 | } 18 | 19 | /* PostgreSQL.org Documentation Style */ 20 | 21 | .book div.NAVHEADER table { 22 | margin-left: 0; 23 | } 24 | 25 | .book div.NAVHEADER th { 26 | text-align: center; 27 | } 28 | 29 | .book { 30 | font-size: 15px; 31 | line-height: 1.6; 32 | } 33 | 34 | /* Heading Definitions */ 35 | 36 | .book h1, 37 | .book h2, 38 | .book h3 { 39 | font-weight: bold; 40 | margin-top: 2ex; 41 | } 42 | 43 | .book h1 a, 44 | .book h2 a, 45 | .book h3 a, 46 | .book h4 a 47 | { 48 | color: #EC5800; 49 | } 50 | 51 | /* EKa --> */ 52 | .book h1 { 53 | font-size: 1.4em; 54 | } 55 | 56 | .book h2 { 57 | font-size: 1.25em; 58 | } 59 | 60 | .book h3 { 61 | font-size: 1.2em; 62 | } 63 | 64 | .book h4 { 65 | font-size: 1.15em; 66 | } 67 | 68 | .book h5 { 69 | font-size: 1.1em; 70 | } 71 | 72 | .book h6 { 73 | font-size: 1.0em; 74 | } 75 | /* <-- EKa */ 76 | 77 | .book h1 a:hover { 78 | color: #EC5800; 79 | text-decoration: none; 80 | } 81 | 82 | .book h2 a:hover, 83 | .book h3 a:hover, 84 | .book h4 a:hover { 85 | color: #666666; 86 | text-decoration: none; 87 | } 88 | 89 | 90 | 91 | /* Text Styles */ 92 | 93 | .book div.SECT2 { 94 | margin-top: 4ex; 95 | } 96 | 97 | .book div.SECT3 { 98 | margin-top: 3ex; 99 | margin-left: 3ex; 100 | } 101 | 102 | .book .txtCurrentLocation { 103 | font-weight: bold; 104 | } 105 | 106 | .book p, 107 | .book ol, 108 | 
.book ul, 109 | .book li { 110 | line-height: 1.5em; 111 | } 112 | 113 | .book code { 114 | font-size: 1em; 115 | padding: 0px; 116 | color: #525f6c; 117 | background-color: #FFF; 118 | border-radius: 0px; 119 | } 120 | 121 | .book code, kbd, pre, samp { 122 | font-family: monospace,monospace; 123 | } 124 | 125 | .book .txtCommentsWrap { 126 | border: 2px solid #F5F5F5; 127 | width: 100%; 128 | } 129 | 130 | .book .txtCommentsContent { 131 | background: #F5F5F5; 132 | padding: 3px; 133 | } 134 | 135 | .book .txtCommentsPoster { 136 | float: left; 137 | } 138 | 139 | .book .txtCommentsDate { 140 | float: right; 141 | } 142 | 143 | .book .txtCommentsComment { 144 | padding: 3px; 145 | } 146 | 147 | .book #docContainer pre code, 148 | .book #docContainer pre tt, 149 | .book #docContainer pre pre, 150 | .book #docContainer tt tt, 151 | .book #docContainer tt code, 152 | .book #docContainer tt pre { 153 | font-size: 1em; 154 | } 155 | 156 | .book pre.LITERALLAYOUT, 157 | .book .SCREEN, 158 | .book .SYNOPSIS, 159 | .book .PROGRAMLISTING, 160 | .book .REFSYNOPSISDIV p, 161 | .book table.CAUTION, 162 | .book table.WARNING, 163 | .book blockquote.NOTE, 164 | .book blockquote.TIP, 165 | .book div.note, 166 | .book div.tip, 167 | .book table.CALSTABLE { 168 | -moz-box-shadow: 3px 3px 5px #DFDFDF; 169 | -webkit-box-shadow: 3px 3px 5px #DFDFDF; 170 | -khtml-box-shadow: 3px 3px 5px #DFDFDF; 171 | -o-box-shadow: 3px 3px 5px #DFDFDF; 172 | box-shadow: 3px 3px 5px #DFDFDF; 173 | } 174 | 175 | .book pre.LITERALLAYOUT, 176 | .book .SCREEN, 177 | .book .SYNOPSIS, 178 | .book .PROGRAMLISTING, 179 | .book .REFSYNOPSISDIV p, 180 | .book table.CAUTION, 181 | .book table.WARNING, 182 | .book blockquote.NOTE, 183 | .book blockquote.TIP, /* fixed: missing comma made this a descendant selector */ 184 | .book div.note, 185 | .book div.tip { 186 | color: black; 187 | border-width: 1px; 188 | border-style: solid; 189 | padding: 2ex; 190 | margin: 2ex 0 2ex 2ex; 191 | overflow: auto; 192 | -moz-border-radius: 8px; 193 | -webkit-border-radius: 8px; 194 |
-khtml-border-radius: 8px; 195 | border-radius: 8px; 196 | } 197 | 198 | .book div.note, 199 | .book div.tip { 200 | -moz-border-radius: 8px !important; 201 | -webkit-border-radius: 8px !important; 202 | -khtml-border-radius: 8px !important; 203 | border-radius: 8px !important; 204 | } 205 | 206 | 207 | .book pre.LITERALLAYOUT, 208 | .book pre.SYNOPSIS, 209 | .book pre.PROGRAMLISTING, 210 | .book .REFSYNOPSISDIV p, 211 | .book .SCREEN { 212 | border-color: #CFCFCF; 213 | background-color: #F7F7F7; 214 | } 215 | 216 | .book blockquote.NOTE, 217 | .book blockquote.TIP, 218 | .book div.note, 219 | .book div.tip { 220 | border-color: #DBDBCC; 221 | background-color: #EEEEDD; 222 | padding: 14px; 223 | width: 572px; 224 | /* font-size: 12px; */ 225 | } 226 | 227 | .book blockquote.NOTE, 228 | .book blockquote.TIP, 229 | .book table.CAUTION, 230 | .book table.WARNING { 231 | margin: 4ex auto; 232 | } 233 | 234 | .book div.note, 235 | .book div.tip { 236 | margin: 4ex auto !important; 237 | } 238 | 239 | 240 | .book blockquote.NOTE p, 241 | .book blockquote.TIP p, 242 | .book div.note p, 243 | .book div.tip p { 244 | margin: 0; 245 | } 246 | 247 | .book blockquote.NOTE pre, 248 | .book blockquote.NOTE code, 249 | .book div.note pre, 250 | .book div.note code, 251 | .book blockquote.TIP pre, 252 | .book blockquote.TIP code, 253 | .book div.tip pre, 254 | .book div.tip code { /* fixed typo: was div.tio */ 255 | margin-left: 0; 256 | margin-right: 0; 257 | -moz-box-shadow: none; 258 | -webkit-box-shadow: none; 259 | -khtml-box-shadow: none; 260 | -o-box-shadow: none; 261 | box-shadow: none; 262 | } 263 | 264 | .book .emphasis, 265 | .book .c2 { 266 | font-weight: bold; 267 | } 268 | 269 | .book .REPLACEABLE { 270 | font-style: italic; 271 | } 272 | 273 | /* Table Styles */ 274 | 275 | .book table { 276 | margin-left: 2ex; 277 | } 278 | 279 | .book table.CALSTABLE td, 280 | .book table.CALSTABLE th, 281 | .book table.CAUTION td, 282 | .book table.CAUTION th, 283 | .book table.WARNING td, 284 | .book
table.WARNING th { 285 | border-style: solid; 286 | } 287 | 288 | .book table.CALSTABLE, 289 | .book table.CAUTION, 290 | .book table.WARNING { 291 | border-spacing: 0; 292 | border-collapse: collapse; 293 | } 294 | 295 | .book table.CALSTABLE 296 | { 297 | margin: 2ex 0 2ex 2ex; 298 | background-color: #E0ECEF; 299 | border: 2px solid #A7C6DF; 300 | } 301 | 302 | .book table.CALSTABLE tr:hover td 303 | { 304 | background-color: #EFEFEF; 305 | } 306 | 307 | .book table.CALSTABLE td { 308 | background-color: #FFF; 309 | } 310 | 311 | .book table.CALSTABLE td, 312 | .book table.CALSTABLE th { 313 | border: 1px solid #A7C6DF; 314 | padding: 0.5ex 0.5ex; 315 | } 316 | 317 | table.CAUTION, 318 | .book table.WARNING { 319 | border-collapse: separate; 320 | display: block; 321 | padding: 0; 322 | max-width: 600px; 323 | } 324 | 325 | .book table.CAUTION { 326 | background-color: #F5F5DC; 327 | border-color: #DEDFA7; 328 | } 329 | 330 | .book table.WARNING { 331 | background-color: #FFD7D7; 332 | border-color: #DF421E; 333 | } 334 | 335 | .book table.CAUTION td, 336 | .book table.CAUTION th, 337 | .book table.WARNING td, 338 | .book table.WARNING th { 339 | border-width: 0; 340 | padding-left: 2ex; 341 | padding-right: 2ex; 342 | } 343 | 344 | .book table.CAUTION td, 345 | .book table.CAUTION th { 346 | border-color: #F3E4D5 347 | } 348 | 349 | .book table.WARNING td, 350 | .book table.WARNING th { 351 | border-color: #FFD7D7; 352 | } 353 | 354 | .book td.c1, 355 | .book td.c2, 356 | .book td.c3, 357 | .book td.c4, 358 | .book td.c5, 359 | .book td.c6 { 360 | font-size: 1.1em; 361 | font-weight: bold; 362 | border-bottom: 0px solid #FFEFEF; 363 | padding: 1ex 2ex 0; 364 | } 365 | 366 | .book .table thead { 367 | background: #E0ECEF; 368 | border-bottom: 1px solid #000; 369 | } 370 | .book .table > thead > tr > th { 371 | border-bottom: 1px solid #000; 372 | } 373 | 374 | .book td, th { 375 | padding: 0.1ex 0.5ex; 376 | } 377 | 378 | .book .book table tr:hover td { 379 | 
background-color: #EFEFEF; 380 | } 381 | 382 | /* Link Styles */ 383 | 384 | .book #docNav a { 385 | font-weight: bold; 386 | } 387 | 388 | .book code.FUNCTION tt { 389 | font-size: 1em; 390 | } 391 | 392 | .book table.docs-compare { 393 | align: center; 394 | width: 90%; 395 | border: 2px solid #999; 396 | border-collapse: collapse; 397 | } 398 | 399 | .book table.docs-compare td { 400 | padding: 12px; 401 | border: 1px solid #DDD; 402 | } 403 | 404 | .book dd { 405 | margin-left: 40px; 406 | } 407 | 408 | 409 | .book .sidebar { 410 | padding: 8px; 411 | background: #FFF; 412 | width: auto; 413 | } 414 | 415 | .book pre { 416 | background: #f5f5f5; 417 | padding: 10px; 418 | border: 1px solid #ccc; 419 | border-radius: 4px; 420 | } 421 | -------------------------------------------------------------------------------- /doc/stylesheet.xsl: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 1 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | sect1 toc 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /expected/atx.out: -------------------------------------------------------------------------------- 1 | create table atx_test1(a text); 2 | -- check that commit of autonomous tx will not steal locks from parent tx 3 | begin; 4 | insert into atx_test1 values (1); 5 | select count(*) from pg_locks where transactionid=txid_current(); 6 | count 7 | ------- 8 | 1 9 | (1 row) 10 | 11 | begin autonomous; 12 | insert into atx_test1 values (1); 13 | select count(*) from pg_locks where transactionid=txid_current(); 14 | count 15 | ------- 16 | 1 17 | (1 row) 18 | 19 | commit; 20 | -- here we still should see our lock 21 | select count(*) from pg_locks where transactionid=txid_current(); 22 | count 23 | ------- 24 | 1 25 | (1 row) 26 | 27 | commit; 28 | drop table atx_test1; 29 | -------------------------------------------------------------------------------- 
/multimaster.control: -------------------------------------------------------------------------------- 1 | comment = 'Multimaster' 2 | default_version = '1.0' 3 | module_pathname = '$libdir/multimaster' 4 | schema = mtm 5 | relocatable = false 6 | -------------------------------------------------------------------------------- /referee/Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = referee 2 | DATA = referee--1.0.sql 3 | REGRESS = referee 4 | 5 | ifdef USE_PGXS 6 | PG_CONFIG = pg_config 7 | PGXS := $(shell $(PG_CONFIG) --pgxs) 8 | include $(PGXS) 9 | else 10 | subdir = contrib/mmts/referee 11 | top_builddir = ../../../ 12 | include $(top_builddir)/src/Makefile.global 13 | include $(top_srcdir)/contrib/contrib-global.mk 14 | endif 15 | -------------------------------------------------------------------------------- /referee/expected/referee.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION referee; 2 | SELECT * FROM referee.decision; 3 | key | node_id | gen_num 4 | -----+---------+--------- 5 | (0 rows) 6 | 7 | SELECT referee.request_grant(1, 7); 8 | request_grant 9 | --------------- 10 | 11 | (1 row) 12 | 13 | -- node can get its grant reissued 14 | SELECT referee.request_grant(1, 9); 15 | request_grant 16 | --------------- 17 | 18 | (1 row) 19 | 20 | -- but another can't get it while the previous is not cleared 21 | SELECT referee.request_grant(2, 4); 22 | ERROR: grant was already issued to node 1 in generation 9 23 | CONTEXT: PL/pgSQL function request_grant(integer,bigint) line 19 at RAISE 24 | SELECT referee.request_grant(2, 10); 25 | ERROR: grant was already issued to node 1 in generation 9 26 | CONTEXT: PL/pgSQL function request_grant(integer,bigint) line 19 at RAISE 27 | SELECT * FROM referee.decision; 28 | key | node_id | gen_num 29 | --------+---------+--------- 30 | winner | 1 | 9 31 | (1 row) 32 | 33 | DELETE FROM referee.decision WHERE 
gen_num < 8 OR (node_id = 1 AND gen_num <= 9); 34 | -- surely 2 node can acquire the grant after removal of the old one 35 | SELECT referee.request_grant(2, 11); 36 | request_grant 37 | --------------- 38 | 39 | (1 row) 40 | 41 | SELECT * FROM referee.decision; 42 | key | node_id | gen_num 43 | --------+---------+--------- 44 | winner | 2 | 11 45 | (1 row) 46 | 47 | -------------------------------------------------------------------------------- /referee/referee--1.0.sql: -------------------------------------------------------------------------------- 1 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 2 | \echo Use "CREATE EXTENSION referee" to load this file. \quit 3 | 4 | CREATE TABLE IF NOT EXISTS referee.decision( 5 | key text PRIMARY KEY NOT NULL, 6 | node_id int, 7 | -- storing gen_num here guarantees we clear (delete) the grant which is 8 | -- indeed can already be cleared instead of accidently removing newer one 9 | gen_num bigint 10 | ); 11 | 12 | -- returns nothing on success, bails out with ERROR on conflict 13 | CREATE OR REPLACE FUNCTION referee.request_grant(applicant_id int, gen_num bigint) RETURNS void AS 14 | $$ 15 | DECLARE 16 | winner_id int; 17 | winner_gen_num bigint; 18 | BEGIN 19 | INSERT INTO referee.decision AS d VALUES ('winner', applicant_id, gen_num) 20 | ON CONFLICT (key) DO UPDATE SET 21 | node_id=EXCLUDED.node_id, gen_num=EXCLUDED.gen_num 22 | -- reissue grant iff it was previously given to this node, not another 23 | WHERE d.node_id = EXCLUDED.node_id AND 24 | -- this could be assert as well, node never repeats request with the same 25 | -- gen num 26 | d.gen_num < EXCLUDED.gen_num 27 | RETURNING applicant_id INTO winner_id; 28 | -- if insertion hasn't happened, there must have been conflict with existing 29 | -- grant 30 | IF winner_id IS NULL THEN 31 | SELECT d.node_id, d.gen_num INTO winner_id, winner_gen_num FROM referee.decision d; 32 | RAISE EXCEPTION 'grant was already issued to node % in generation 
%', winner_id, winner_gen_num; 33 | END IF; 34 | END 35 | $$ LANGUAGE plpgsql; 36 | 37 | CREATE OR REPLACE FUNCTION referee.clean() RETURNS bool AS 38 | $$ 39 | BEGIN 40 | delete from referee.decision where key = 'winner'; 41 | return 'true'; 42 | END 43 | $$ LANGUAGE plpgsql; 44 | -------------------------------------------------------------------------------- /referee/referee.control: -------------------------------------------------------------------------------- 1 | comment = 'Multimaster referee' 2 | default_version = '1.0' 3 | module_pathname = '$libdir/referee' 4 | schema = referee 5 | relocatable = false 6 | -------------------------------------------------------------------------------- /referee/sql/referee.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION referee; 2 | 3 | SELECT * FROM referee.decision; 4 | 5 | SELECT referee.request_grant(1, 7); 6 | -- node can get its grant reissued 7 | SELECT referee.request_grant(1, 9); 8 | -- but another can't get it while the previous is not cleared 9 | SELECT referee.request_grant(2, 4); 10 | SELECT referee.request_grant(2, 10); 11 | SELECT * FROM referee.decision; 12 | 13 | DELETE FROM referee.decision WHERE gen_num < 8 OR (node_id = 1 AND gen_num <= 9); 14 | -- surely 2 node can acquire the grant after removal of the old one 15 | SELECT referee.request_grant(2, 11); 16 | SELECT * FROM referee.decision; 17 | -------------------------------------------------------------------------------- /run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use File::Basename; 4 | use Getopt::Long; 5 | BEGIN { unshift @INC, '.'; unshift @INC, '../../src/test/perl' } 6 | use Cluster; 7 | 8 | my $n_nodes = 3; 9 | my $referee = 0; 10 | my $action = 'start'; 11 | GetOptions ("nnodes=i" => \$n_nodes, # numeric 12 | "referee" => \$referee, # flag 13 | "action=s" => \$action); # strings 14 | # referee works only 
with 2 nodes 15 | if ($referee) 16 | { 17 | $n_nodes = 2; 18 | } 19 | 20 | if ($action eq "start") 21 | { 22 | $Cluster::last_port_assigned = 65431; 23 | 24 | my $cluster = new Cluster($n_nodes, $referee); 25 | $cluster->init(); 26 | $cluster->start(); 27 | $cluster->create_mm('regression'); 28 | 29 | # prevent PostgresNode.pm from shutting down nodes on exit in END {} 30 | @PostgresNode::all_nodes = (); 31 | } 32 | elsif ($action eq "stop") 33 | { 34 | my @datas = <$TestLib::tmp_check/*data>; 35 | foreach my $data (@datas) { 36 | TestLib::system_log('pg_ctl', 37 | '-D', "$data/pgdata", 38 | '-m', 'fast', 39 | 'stop'); 40 | } 41 | } 42 | else 43 | { 44 | die("Usage: run.pl action= [opts]\n"); 45 | } 46 | -------------------------------------------------------------------------------- /sql/atx.sql: -------------------------------------------------------------------------------- 1 | create table atx_test1(a text); 2 | 3 | 4 | -- check that commit of autonomous tx will not steal locks from parent tx 5 | begin; 6 | insert into atx_test1 values (1); 7 | select count(*) from pg_locks where transactionid=txid_current(); 8 | begin autonomous; 9 | insert into atx_test1 values (1); 10 | select count(*) from pg_locks where transactionid=txid_current(); 11 | commit; 12 | -- here we still should see our lock 13 | select count(*) from pg_locks where transactionid=txid_current(); 14 | commit; 15 | 16 | drop table atx_test1; 17 | -------------------------------------------------------------------------------- /src/bkb.c: -------------------------------------------------------------------------------- 1 | /* 2 | * bkb.c 3 | * 4 | * Bron–Kerbosch algorithm to find maximum clique in a graph. 
5 | * 6 | * Copyright (c) 2017-2021, Postgres Professional 7 | * 8 | */ 9 | #ifndef TEST 10 | #include "bkb.h" 11 | 12 | #else 13 | #include 14 | #include 15 | #define Assert(expr) assert(expr) 16 | typedef uint64_t nodemask_t; 17 | #define MAX_NODES 64 18 | #define BIT_CHECK(mask, bit) (((mask) & ((nodemask_t)1 << (bit))) != 0) 19 | #define BIT_SET(mask, bit) (mask |= ((nodemask_t)1 << (bit))) 20 | #endif 21 | 22 | typedef struct { 23 | int size; 24 | int nodes[MAX_NODES]; 25 | } NodeList; 26 | 27 | static void 28 | _list_append(NodeList* list, int n) 29 | { 30 | list->nodes[list->size++] = n; 31 | } 32 | 33 | static void 34 | _list_copy(NodeList* dst, NodeList const* src) 35 | { 36 | int i; 37 | int n = src->size; 38 | dst->size = n; 39 | for (i = 0; i < n; i++) { 40 | dst->nodes[i] = src->nodes[i]; 41 | } 42 | } 43 | 44 | static nodemask_t 45 | _list_to_nodemask(NodeList *list) 46 | { 47 | nodemask_t res = 0; 48 | int i; 49 | 50 | for (i = 0; i < list->size; i++) 51 | BIT_SET(res, list->nodes[i]); 52 | return res; 53 | } 54 | 55 | /* 56 | * See original paper 57 | * Bron, Coen; Kerbosch, Joep (1973), "Algorithm 457: finding all cliques of 58 | * an undirected graph", Commun. ACM, ACM, 16 (9): 575–577 59 | * or wiki article (I recommend the latter). Var names (and generally the code) 60 | * here closely resemble ones in the original paper and deserve some deciphering: 61 | * - cur is R in wiki 62 | * - oldSet[0; ne) is X in wiki 63 | * - oldSet[ne; ce) is P in wiki 64 | * 65 | * Pristine Bron-Kerbosch algorithm calculates *all* max cliques. In mtm we 66 | * don't need that, so we return in result only one biggest max clique 67 | * (actually, this means we could avoid maintaining X altogether). 68 | * What we do need though is deterministic calculation, so that whenever we 69 | * have a majority of nodes seeing each other, *all* members of some such 70 | * majority calculate *the same* clique. e.g. 
with topology 71 | * 72 | * 2 73 | * /|\ 74 | * 1 | 3 75 | * \|/ 76 | * 4 77 | * 78 | * 2 and 4 must calculate the same clique, or we won't converge. 79 | * To this end, we compare max cliques by nodemask and pick the 80 | * smallest one. 81 | */ 82 | static void 83 | extend(NodeList* cur, NodeList* result, nodemask_t* graph, int* oldSet, int ne, int ce) 84 | { 85 | int nod = 0; 86 | int minnod = ce; 87 | int fixp = -1; /* pivot (u in wiki) */ 88 | /* index in oldSet of next vertice we'll include in R -- vertex v in wiki*/ 89 | int s = -1; 90 | int i, j, k; 91 | int newce, newne; 92 | int sel; /* the vertex moved P->R itself, pointed to by s -- v in wiki */ 93 | int newSet[MAX_NODES]; 94 | 95 | /* Choose the pivot vertex fixp */ 96 | for (i = 0; i < ce && minnod != 0; i++) 97 | { 98 | int p = oldSet[i]; 99 | int cnt = 0; 100 | int pos = -1; 101 | 102 | /* 103 | * Count how many non-neighbours of potential pivot we have in P. 104 | * Counterintuitively, we require input to have self-loops, so node is 105 | * sorta neighbour of itself, though we must also recurse into it and 106 | * thus we miss it here (in cnt) and count it in nod instead. 107 | * This mumbo-jumbo is important as it forces (cnt < minnod) be true 108 | * when P contains only one vertex (minnod=1 initially). 109 | * I'd actually make initial minnod bigger and remove self loops... 
110 | */ 111 | for (j = ne; j < ce && cnt < minnod; j++) 112 | { 113 | if (!BIT_CHECK(graph[p], oldSet[j])) 114 | { 115 | cnt++; 116 | pos = j; 117 | } 118 | } 119 | 120 | if (cnt < minnod) 121 | { 122 | minnod = cnt; 123 | fixp = p; 124 | if (i < ne) 125 | { 126 | /* if pivot is from X, not P, take random non-neighbour */ 127 | s = pos; 128 | } 129 | else 130 | { 131 | /* 132 | * else, process pivot itself first, otherwise we won't find 133 | * it in the loop below as pivot is a neighbour of itself 134 | */ 135 | s = i; 136 | /* don't forget to increment num of nodes to recurse to */ 137 | nod = 1; 138 | } 139 | } 140 | } 141 | 142 | for (k = minnod + nod; k >= 1; k--) 143 | { 144 | Assert(s >= 0); 145 | Assert(s < MAX_NODES); 146 | Assert(ne >= 0); 147 | Assert(ne < MAX_NODES); 148 | Assert(ce >= 0); 149 | Assert(ce < MAX_NODES); 150 | 151 | /* 152 | * put (wiki) v on the border of X and P, we'll move the border to 153 | * relocate the vertex 154 | */ 155 | sel = oldSet[s]; 156 | oldSet[s] = oldSet[ne]; 157 | oldSet[ne] = sel; 158 | 159 | newne = 0; 160 | /* form X for recursive call -- leave only v's neighbours */ 161 | for (i = 0; i < ne; i++) { 162 | if (BIT_CHECK(graph[sel], oldSet[i])) { 163 | newSet[newne++] = oldSet[i]; 164 | } 165 | } 166 | 167 | newce = newne; 168 | /* 169 | * similarly, form P for recursive call -- leave only v's neighbours 170 | * 171 | * + 1 skips v itself, which is moved to R (again the crutch 172 | * introduced by self loops) 173 | */ 174 | for (i = ne + 1; i < ce; i++) { 175 | if (BIT_CHECK(graph[sel], oldSet[i])) { 176 | newSet[newce++] = oldSet[i]; 177 | } 178 | } 179 | /* push v to R */ 180 | _list_append(cur, sel); 181 | if (newce == 0) { /* both P and X are empty => max clique */ 182 | if (result->size < cur->size || 183 | (result->size == cur->size && 184 | _list_to_nodemask(result) > _list_to_nodemask(cur))) { 185 | _list_copy(result, cur); 186 | } 187 | } else if (newne < newce) { /* P is not empty, so recurse */ 188 | if 
(cur->size + newce - newne > result->size) { 189 | extend(cur, result, graph, newSet, newne, newce); 190 | } 191 | } 192 | /* remove v back from R for the next iteration */ 193 | cur->size -= 1; 194 | /* move v from P to X */ 195 | ne += 1; 196 | /* and find in P next non-neighbour of pivot */ 197 | if (k > 1) 198 | { 199 | 200 | for (s = ne; BIT_CHECK(graph[fixp], oldSet[s]); s++) 201 | { 202 | Assert(s < MAX_NODES); 203 | } 204 | } 205 | } 206 | } 207 | 208 | /* 209 | * Deterministically (c.f. extend) calculates biggest max clique of the graph. 210 | * The matrix must be symmetric (undirected graph) and must have 1 on the 211 | * diagonal (self loops). 212 | * 213 | * Note that this API renders impossible to distinguish absent node from node 214 | * without any edges -- absent nodes with ids <= n_nodes must still have 1 215 | * on the diagonal. This is fine as we are not interested much in cliques 216 | * of size 1, they never form majority; well, not as far as we don't support 217 | * cluster of size 1. 218 | */ 219 | nodemask_t 220 | MtmFindMaxClique(nodemask_t* graph, int n_nodes, int* clique_size) 221 | { 222 | NodeList tmp; 223 | NodeList result; 224 | int all[MAX_NODES]; 225 | int i; 226 | int j; 227 | 228 | tmp.size = 0; 229 | result.size = 0; 230 | for (i = 0; i < MAX_NODES; i++) 231 | all[i] = i; 232 | 233 | /* check that matrix is symmetric */ 234 | for (i = 0; i < n_nodes; i++) 235 | for (j = 0; j < n_nodes; j++) 236 | Assert(BIT_CHECK(graph[i], j) == BIT_CHECK(graph[j], i)); 237 | 238 | /* algorithm requires diagonal elements to be set */ 239 | for (i = 0; i < n_nodes; i++) 240 | Assert(BIT_CHECK(graph[i], i)); 241 | 242 | extend(&tmp, &result, graph, all, 0, n_nodes); 243 | 244 | *clique_size = result.size; 245 | return _list_to_nodemask(&result); 246 | } 247 | 248 | #ifdef TEST 249 | #include 250 | 251 | /* 252 | * To run some randomized tests, compile with -DTEST to ./a.out, e.g. 
253 | * gcc -ggdb3 -O0 -DTEST bkb.c 254 | * , install sage and run ./test_bkb.sage.py 255 | */ 256 | 257 | int main() 258 | { 259 | nodemask_t matrix[64] = {0}; 260 | nodemask_t clique; 261 | int clique_size; 262 | int n_nodes; 263 | 264 | n_nodes = 4; 265 | matrix[0] = 15; /* 1111 */ 266 | matrix[1] = 15; /* 1111 */ 267 | matrix[2] = 7; /* 0111 */ 268 | matrix[3] = 11; /* 1011 */ 269 | 270 | scanf("%d", &n_nodes); 271 | for (int i = 0; i < n_nodes; i++) 272 | { 273 | nodemask_t row; 274 | scanf("%ld", &row); 275 | matrix[i] = row; 276 | } 277 | 278 | clique = MtmFindMaxClique(matrix, n_nodes, &clique_size); 279 | printf("%ld %d\n", clique, clique_size); 280 | return 0; 281 | } 282 | #endif 283 | -------------------------------------------------------------------------------- /src/bytebuf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * bytebuf.c 3 | * 4 | * Copyright (c) 2016-2021, Postgres Professional 5 | * 6 | */ 7 | #include "postgres.h" 8 | 9 | #include "bytebuf.h" 10 | 11 | #define INIT_BUF_SIZE 1024 12 | 13 | void 14 | ByteBufferAlloc(ByteBuffer *buf) 15 | { 16 | buf->size = INIT_BUF_SIZE; 17 | buf->data = palloc(buf->size); 18 | buf->used = 0; 19 | } 20 | 21 | void 22 | ByteBufferAppend(ByteBuffer *buf, void *data, int len) 23 | { 24 | if (buf->used + len > buf->size) 25 | { 26 | buf->size = buf->used + len > buf->size * 2 ? 
buf->used + len : buf->size * 2; 27 | buf->data = (char *) repalloc(buf->data, buf->size); 28 | } 29 | memcpy(&buf->data[buf->used], data, len); 30 | buf->used += len; 31 | } 32 | 33 | void 34 | ByteBufferAppendInt32(ByteBuffer *buf, int data) 35 | { 36 | ByteBufferAppend(buf, &data, sizeof data); 37 | } 38 | 39 | void 40 | ByteBufferFree(ByteBuffer *buf) 41 | { 42 | pfree(buf->data); 43 | } 44 | 45 | void 46 | ByteBufferReset(ByteBuffer *buf) 47 | { 48 | buf->used = 0; 49 | } 50 | -------------------------------------------------------------------------------- /src/ddd.c: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * ddd.c 4 | * 5 | * Distributed deadlock detector. 6 | * 7 | * Copyright (c) 2017-2021, Postgres Professional 8 | * 9 | *---------------------------------------------------------------------------- 10 | */ 11 | 12 | #include "postgres.h" 13 | #include "access/clog.h" 14 | #include "access/twophase.h" 15 | #include "access/transam.h" 16 | #include "storage/lwlock.h" 17 | #include "storage/ipc.h" 18 | #include "storage/proc.h" 19 | #include "utils/hsearch.h" 20 | #include "utils/timeout.h" 21 | #include "miscadmin.h" 22 | #include "replication/origin.h" 23 | #include "replication/message.h" 24 | #include "utils/builtins.h" 25 | #include "storage/lmgr.h" 26 | #include "storage/procarray.h" 27 | 28 | #include "multimaster.h" 29 | 30 | #include "ddd.h" 31 | #include "bytebuf.h" 32 | #include "state.h" 33 | #include "logger.h" 34 | #include "commit.h" 35 | 36 | 37 | /* 38 | * This DDD is based on following observations: 39 | * 40 | * Situation when a transaction (say T1) in apply_worker (or receiver 41 | * itself) stucks on some lock created by a transaction in a local backend (say 42 | * T2) will definitely lead to a deadlock since T2 after being prepared and 43 | * replicated will fail to obtain lock that is already held by T1. 
/*
 * This DDD is based on the following observations:
 *
 * A situation when a transaction (say T1) in an apply_worker (or the
 * receiver itself) is stuck on a lock created by a transaction in a local
 * backend (say T2) will definitely lead to a deadlock, since T2, after
 * being prepared and replicated, will fail to obtain the lock that is
 * already held by T1.
 * The same reasoning applies when an apply_worker (or receiver) is waiting
 * for an apply_worker (or receiver) belonging to another origin -- no need
 * to wait for distributed deadlock detection, we may just instantly abort.
 * The only remaining case of distributed deadlock is an apply_worker (or
 * receiver) waiting for another apply_worker from the *same* origin.
 * However, that situation is not possible, since one origin node cannot
 * have two conflicting prepared transactions simultaneously.
 *
 * So we may construct a distributed-deadlock-avoidance mechanism by
 * disallowing such edges.  Now ask the inverse question: which wait graphs
 * with such edges do NOT actually represent a distributed deadlock?  That
 * may happen when the holding transaction is purely local, since it then
 * holds locks only in SHARED mode.  The only lock levels conflicting with
 * that mode are EXCLUSIVE and ACCESS EXCLUSIVE; in all other cases the
 * proposed avoidance scheme should not yield false positives.
 *
 * To cope with false positives in EXCLUSIVE and ACCESS EXCLUSIVE modes we
 * may throw the exception not in WaitOnLock() when we first see a
 * forbidden edge, but later, during the first call to the local deadlock
 * detector.  This way we still have `deadlock_timeout` seconds to grab the
 * lock, and the database user can also increase it per-transaction if
 * there are long-living read-only transactions.
 *
 * As a further optimization it is possible to check whether our lock is
 * EXCLUSIVE or higher, so as not to delay the rollback until the
 * `deadlock_timeout` event.
 */
/*
 * Decide whether the wait of `proc` (must be MyProc) should be treated as a
 * potential distributed deadlock.  Returns true to make the caller abort
 * the waiter; per the reasoning above this may be a false positive, which
 * is deemed acceptable.  NOTE(review): appears to be invoked from the local
 * deadlock-detector path of an apply worker -- confirm at call site.
 */
bool
MtmDetectGlobalDeadLock(PGPROC *proc)
{
	StringInfoData locktagbuf;
	LOCK	   *lock = proc->waitLock;
	bool		is_detected = false;

	Assert(proc == MyProc);

	/*
	 * These locks never participate in deadlocks, ignore them. Without it,
	 * spurious deadlocks might be reported due to concurrency on rel
	 * extension.
	 */
	if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
		(LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
		return false;

	/*
	 * There is no need to check for deadlocks in recovery: all
	 * conflicting transactions must be eventually committed/aborted
	 * by the resolver. It would not be fatal, but restarting due to
	 * deadlock ERRORs might significantly slow down the recovery
	 */
	is_detected = (curr_replication_mode == REPLMODE_NORMAL);

	if (is_detected)
	{
		/* Log what we are blocked on before reporting the deadlock. */
		initStringInfo(&locktagbuf);
		DescribeLockTag(&locktagbuf, &lock->tag);
		mtm_log(LOG, "apply worker %d waits for %s on %s",
				MyProcPid,
				GetLockmodeName(lock->tag.locktag_lockmethodid, proc->waitLockMode),
				locktagbuf.data);
	}

	return is_detected;

}
/*
 * Shared data of BgwPool
 *
 * A circular byte queue feeding dynamic apply workers.  NOTE(review): the
 * single producer appears to be the receiver (see receiver_pid /
 * producerBlocked) -- confirm against bgwpool.c.
 */
typedef struct BgwPool
{
	int			sender_node_id;
	LWLock		lock;			/* protects the queue state below */
	ConditionVariable syncpoint_cv;
	int			n_holders;

	/* Tell workers that queue contains a number of work. */
	ConditionVariable available_cv;

	/*
	 * Queue is full. We can't insert a work data into the queue and wait
	 * while any worker will take over a piece of data from queue and we will
	 * do an attempt to try to add the work data into the queue.
	 */
	ConditionVariable overflow_cv;

	/* Queue state */
	size_t		head;
	size_t		tail;
	size_t		size;			/* Size of queue aligned to INT word */

	bool		producerBlocked;

	char		poolName[MAX_NAME_LEN];
	Oid			db_id;
	Oid			user_id;
	dsm_handle	dsmhandler;		/* DSM descriptor. Workers use it for
								 * attaching */

	size_t		nWorkers;		/* a number of pool workers launched */
	TimestampTz lastDynamicWorkerStartTime;
	/* Handlers of workers at the pool */
	BackgroundWorkerHandle **bgwhandles;
	pid_t		receiver_pid;

	txlist_t	txlist;
} BgwPool;


/* Pool lifecycle and work submission. */
extern void BgwPoolStart(int sender_node_id, char *poolName, Oid db_id, Oid user_id);
extern void BgwPoolExecute(BgwPool *pool, void *work, int size, MtmReceiverWorkerContext *rwctx);
extern void BgwPoolShutdown(BgwPool *poolDesc);
extern void BgwPoolCancel(BgwPool *pool);

/* txlist: ordering of in-flight transactions relative to syncpoints. */
extern int	txl_store(txlist_t *txlist, int value);
extern void txl_remove(txlist_t *txlist, int txlist_pos);
extern void txl_wait_syncpoint(txlist_t *txlist, int txlist_pos);
extern void txl_wait_sphead(txlist_t *txlist, int txlist_pos);
extern void txl_wait_txhead(txlist_t *txlist, int txlist_pos);
extern void txl_wakeup_workers(txlist_t *txlist);

#endif
/*----------------------------------------------------------------------------
 *
 * commit.h
 *	  Multimaster two-phase commit: gid generation/parsing and transaction
 *	  begin/commit hooks.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2021, Postgres Professional
 *
 *----------------------------------------------------------------------------
 */

#ifndef COMMIT_H
#define COMMIT_H

#include "postgres.h"
#include "access/xact.h"

#include "messaging.h"

/*
 * gid starting with MTM is used by internal multimaster 2PC xacts; clients
 * shouldn't use them for their own prepares.
 */
#define IS_EXPLICIT_2PC_GID(gid) (strncmp((gid), "MTM-", 4) != 0)

/* Compose an internal "MTM-" gid from coordinator node, xid and generation. */
extern void MtmGenerateGid(char *gid, int node_id, TransactionId xid,
						   uint64 gen_num);
/* Parse the individual components back out of such a gid. */
extern uint64 MtmGidParseGenNum(const char *gid);
extern int	MtmGidParseNodeId(const char *gid);
extern TransactionId MtmGidParseXid(const char *gid);

/* Commit-path entry points driven from transaction callbacks. */
extern bool MtmTwoPhaseCommit(void);
extern void MtmBeginTransaction(void);
extern void MtmXactCallback(XactEvent event, void *arg);

/* Support for explicit (user-issued) PREPARE / COMMIT|ROLLBACK PREPARED. */
extern bool MtmExplicitPrepare(char *gid);
extern void MtmExplicitFinishPrepared(bool isTopLevel, char *gid, bool isCommit);

#endif							/* COMMIT_H */
*proc); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /src/include/ddl.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * ddl.h 4 | * Statement based replication of DDL commands. 5 | * 6 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 7 | * Portions Copyright (c) 1994, Regents of the University of California 8 | * Portions Copyright (c) 2021, Postgres Professional 9 | * 10 | *---------------------------------------------------------------------------- 11 | */ 12 | 13 | #ifndef DML_H 14 | #define DML_H 15 | 16 | #include "utils/relcache.h" 17 | 18 | /* GUCs */ 19 | extern bool MtmMonotonicSequences; 20 | extern char *MtmRemoteFunctionsList; 21 | extern bool MtmRemoteFunctionsUpdating; 22 | extern bool MtmVolksWagenMode; 23 | extern bool MtmIgnoreTablesWithoutPk; 24 | 25 | typedef enum 26 | { 27 | MTM_DDL_IN_PROGRESS_NOTHING, 28 | MTM_DDL_IN_PROGRESS_TX, 29 | MTM_DDL_IN_PROGRESS_NONTX, 30 | } MtmDDLInProgress; 31 | 32 | extern MtmDDLInProgress DDLApplyInProgress; 33 | 34 | extern void MtmDDLReplicationInit(void); 35 | extern void MtmDDLReplicationShmemStartup(void); 36 | extern void temp_schema_reset_all(int my_node_id); 37 | extern bool MtmIsRelationLocal(Relation rel); 38 | extern void MtmDDLResetStatement(void); 39 | extern void MtmApplyDDLMessage(const char *messageBody, bool transactional); 40 | extern void MtmDDLResetApplyState(void); 41 | extern void MtmSetRemoteFunction(char const *list, void *extra); 42 | extern void MtmToggleDML(void); 43 | extern void MtmMakeTableLocal(char const *schema, char const *name, bool locked); 44 | extern void multimaster_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private); 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/include/dmq.h: 
-------------------------------------------------------------------------------- 1 | #ifndef DMQ_H 2 | #define DMQ_H 3 | 4 | #include "libpq-fe.h" 5 | #include "lib/stringinfo.h" 6 | 7 | typedef int8 DmqDestinationId; 8 | 9 | #define DMQ_NAME_MAXLEN 32 10 | /* mm currently uses xact gid as stream name, so this should be >= GIDSIZE */ 11 | #define DMQ_STREAM_NAME_MAXLEN 200 12 | 13 | extern void dmq_init(int send_timeout, int connect_timeout); 14 | 15 | #define DMQ_N_MASK_POS 16 /* ought to be >= MTM_MAX_NODES */ 16 | extern DmqDestinationId dmq_destination_add(char *connstr, char *sender_name, 17 | char *receiver_name, int8 recv_mask_pos, 18 | int ping_period); 19 | extern void dmq_destination_drop(char *receiver_name); 20 | extern void dmq_destination_reconnect(char *receiver_name); 21 | 22 | extern void dmq_attach_receiver(char *sender_name, int8 mask_pos); 23 | extern void dmq_detach_receiver(char *sender_name); 24 | 25 | extern void dmq_terminate_receiver(char *name); 26 | 27 | extern void dmq_reattach_receivers(void); 28 | extern void dmq_stream_subscribe(char *stream_name); 29 | extern void dmq_stream_unsubscribe(void); 30 | 31 | extern void dmq_get_sendconn_cnt(uint64 participants, int *sconn_cnt); 32 | extern bool dmq_pop(int8 *sender_mask_pos, StringInfo msg, uint64 mask); 33 | extern bool dmq_pop_nb(int8 *sender_mask_pos, StringInfo msg, uint64 mask, bool *wait); 34 | extern uint64 dmq_purge_failed_participants(uint64 participants, int *sconn_cnt); 35 | 36 | extern void dmq_push(DmqDestinationId dest_id, char *stream_name, char *msg); 37 | extern void dmq_push_buffer(DmqDestinationId dest_id, char *stream_name, const void *buffer, size_t len); 38 | 39 | typedef void (*dmq_hook_type) (char *); 40 | extern void *(*dmq_receiver_start_hook)(char *sender_name); 41 | extern dmq_hook_type dmq_receiver_stop_hook; 42 | extern void (*dmq_receiver_heartbeat_hook)(char *sender_name, StringInfo msg, void *extra); 43 | extern dmq_hook_type dmq_sender_connect_hook; 44 | 
extern void (*dmq_sender_heartbeat_hook)(char *receiver_name, StringInfo buf); 45 | extern dmq_hook_type dmq_sender_disconnect_hook; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/include/global_tx.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * global_tx.h 4 | * Persistent and in-memory state necessary for our E3PC-like atomic commit 5 | # protocol. 6 | * 7 | * Copyright (c) 2016-2021, Postgres Professional 8 | * 9 | *---------------------------------------------------------------------------- 10 | */ 11 | #ifndef GLOBAL_TX_H 12 | #define GLOBAL_TX_H 13 | 14 | #include "multimaster.h" 15 | 16 | typedef struct 17 | { 18 | int ballot; 19 | int node_id; 20 | } GlobalTxTerm; 21 | 22 | #define InvalidGTxTerm ((GlobalTxTerm) {0, 0}) 23 | /* 24 | * This term with ballot 1 and fake 0 node id is less than any term generated 25 | * by resolver; it is used by the coordinator itself. 
26 | */ 27 | #define InitialGTxTerm ((GlobalTxTerm) {1, 0}) 28 | 29 | typedef enum 30 | { 31 | GTXInvalid = 0, /* we never gave a vote */ 32 | GTXPreCommitted, /* voted for commit */ 33 | GTXPreAborted, /* voted for abort */ 34 | GTXCommitted, /* definitely know xact is committed */ 35 | GTXAborted /* definitely know xact is aborted */ 36 | } GlobalTxStatus; 37 | 38 | extern char const *const GlobalTxStatusMnem[]; 39 | 40 | typedef enum 41 | { 42 | GTRS_AwaitStatus, /* 1a sent, wait for 1b */ 43 | GTRS_AwaitAcks /* 2a sent, wait for 2b */ 44 | } GlobalTxResolvingStage; 45 | 46 | typedef struct 47 | { 48 | GlobalTxTerm proposal; /* nextBal in terms of The Part-Time Parliament */ 49 | GlobalTxTerm accepted; /* prevBal in terms of The Part-Time Parliament */ 50 | GlobalTxStatus status; /* 51 | * prevDec in terms of The Part-Time Parliament 52 | * (or special never voted | commit | abort) 53 | */ 54 | } GTxState; 55 | 56 | /* 57 | * Constant xact metadata which we encode into state_3pc. We could (and 58 | * previously did) carry that directly in gid, but this intervenes with 59 | * explicit 2PC usage: applier must know generation of the xact, and 60 | * scribbling over user-provided gid is ugly and/or inefficient. 61 | */ 62 | typedef struct 63 | { 64 | int coordinator; /* node id who initiated the transaction */ 65 | TransactionId xid; /* xid at coordinator */ 66 | uint64 gen_num; /* the number of generation xact belongs to */ 67 | nodemask_t configured; /* mask of configured nodes of this generation; 68 | * the idea was to use this by resolver, but it 69 | * wasn't finished. 
We shouldn't have any problems 70 | * with this anyway if all xacts created before 71 | * first node add-rm are resolved before the 72 | * second one is started 73 | */ 74 | } XactInfo; 75 | 76 | typedef struct GlobalTx 77 | { 78 | char gid[GIDSIZE]; 79 | XactInfo xinfo; 80 | XLogRecPtr coordinator_end_lsn; 81 | BackendId acquired_by; 82 | /* paxos voting state for this xact */ 83 | GTxState state; 84 | /* transient thing used to rm shmem entry on error */ 85 | bool prepared; 86 | 87 | /* resolver corner */ 88 | bool orphaned; /* Indication for resolver that current tx needs 89 | * to be picked up. Comes from a failed backend or 90 | * a disabled node. */ 91 | GTxState phase1_acks[MTM_MAX_NODES]; 92 | /* 93 | * Technically phase2 ack contains just one term, which is acked. However, 94 | * we 1) collect decrees (in 'status') to perform sanity checks 95 | * 2) make it GTxState to reuse quorum() function. 96 | */ 97 | GTxState phase2_acks[MTM_MAX_NODES]; 98 | GlobalTxResolvingStage resolver_stage; 99 | } GlobalTx; 100 | 101 | typedef struct 102 | { 103 | LWLock *lock; 104 | HTAB *gid2gtx; 105 | } gtx_shared_data; 106 | 107 | extern gtx_shared_data *gtx_shared; 108 | 109 | void MtmGlobalTxInit(void); 110 | void MtmGlobalTxShmemStartup(void); 111 | void GlobalTxEnsureBeforeShmemExitHook(void); 112 | GlobalTx *GlobalTxAcquire(const char *gid, bool create, bool nowait_own_live, 113 | bool *busy, int coordinator); 114 | void GlobalTxRelease(GlobalTx *gtx); 115 | void GlobalTxAtExit(int code, Datum arg); 116 | void GlobalTxLoadAll(void); 117 | char *serialize_xstate(XactInfo *xinfo, GTxState *gtx_state); 118 | int term_cmp(GlobalTxTerm t1, GlobalTxTerm t2); 119 | int deserialize_xstate(const char *state, XactInfo *xinfo, GTxState *gtx_state, 120 | int elevel); 121 | GlobalTxTerm GlobalTxGetMaxProposal(void); 122 | void GlobalTxSaveInTable(const char *gid, XLogRecPtr coordinator_end_lsn, 123 | GlobalTxStatus status, 124 | GlobalTxTerm term_prop, GlobalTxTerm term_acc); 125 | 
void GlobalTxMarkOrphaned(int node_id); 126 | 127 | char *GlobalTxToString(GlobalTx *gtx); 128 | 129 | #endif /* GLOBAL_TX_H */ 130 | -------------------------------------------------------------------------------- /src/include/logger.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------------------- 2 | * 3 | * logger.h 4 | * GUC-controlled map from application meaningful log tags to actual log 5 | * levels. 6 | * 7 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 8 | * Portions Copyright (c) 1994, Regents of the University of California 9 | * Portions Copyright (c) 2021, Postgres Professional 10 | * 11 | *---------------------------------------------------------------------------- 12 | */ 13 | 14 | #include "postgres.h" 15 | 16 | #include "postmaster/bgworker.h" 17 | #include "utils/elog.h" 18 | #include "utils/memutils.h" 19 | 20 | /* 21 | * this hack allows to use mtm_log with direct log level (e.g. 
ERROR), see 22 | * mtm_log 23 | */ 24 | #define FIRST_UNUSED_ERRCODE (PANIC + 1) 25 | 26 | /* keep it in sync with mtm_log_gucs */ 27 | typedef enum MtmLogTag 28 | { 29 | /* general */ 30 | MtmTxTrace = FIRST_UNUSED_ERRCODE, 31 | MtmTxFinish, 32 | 33 | /* coordinator */ 34 | MtmCoordinatorTrace, 35 | 36 | /* dmq */ 37 | DmqStateIntermediate, 38 | DmqStateFinal, 39 | DmqTraceOutgoing, 40 | DmqTraceIncoming, 41 | DmqTraceShmMq, 42 | DmqPqTiming, 43 | 44 | /* resolver */ 45 | ResolverState, 46 | ResolverTx, 47 | ResolverTasks, 48 | 49 | /* status worker */ 50 | StatusRequest, 51 | 52 | /* pool */ 53 | BgwPoolEvent, 54 | BgwPoolEventDebug, 55 | 56 | /* ddd */ 57 | DeadlockCheck, 58 | DeadlockUpdate, 59 | DeadlockSerialize, 60 | 61 | /* ddl */ 62 | DDLStmtOutgoing, 63 | DDLStmtIncoming, 64 | DDLProcessingTrace, 65 | 66 | /* walsender's proto */ 67 | ProtoTraceFilter, 68 | ProtoTraceSender, 69 | ProtoTraceMessage, 70 | ProtoTraceState, 71 | 72 | /* receiver */ 73 | MtmReceiverState, 74 | MtmReceiverStateDebug, 75 | MtmReceiverFilter, 76 | MtmApplyMessage, 77 | MtmApplyTrace, 78 | MtmApplyError, 79 | MtmApplyBgwFinish, 80 | MtmReceiverFeedback, 81 | 82 | /* state */ 83 | MtmStateMessage, 84 | MtmStateSwitch, 85 | MtmStateDebug, 86 | 87 | /* syncpoints */ 88 | SyncpointCreated, 89 | SyncpointApply, 90 | 91 | /* Node add/drop */ 92 | NodeMgmt 93 | } MtmLogTag; 94 | 95 | typedef struct MtmLogGuc 96 | { 97 | const char *name; 98 | int default_val; 99 | int val; 100 | } MtmLogGuc; 101 | 102 | extern MtmLogGuc mtm_log_gucs[]; 103 | 104 | #define MTM_TAG "[MTM]%s" 105 | 106 | /* 107 | * I tried to use get_ps_display instead of MyBgworkerEntry, but it returns 108 | * only dynamic 'activity' part which doesn't include bgw name. Apparently 109 | * there is no way to retrieve main part. Weird. 
110 | */ 111 | extern bool MtmBackgroundWorker; /* avoid including multimaster.h for this */ 112 | extern char *walsender_name; /* same for pglogical_proto.h */ 113 | static inline char * 114 | am(void) 115 | { 116 | char *res = " "; 117 | char *name = NULL; 118 | 119 | if (MtmBackgroundWorker) 120 | name = MyBgworkerEntry->bgw_name; 121 | else if (walsender_name) 122 | name = walsender_name; 123 | if (name) 124 | { 125 | /* this is for elog, so alloc in ErrorContext where fmt is evaluated */ 126 | MemoryContext old_ctx = MemoryContextSwitchTo(ErrorContext); 127 | res = psprintf(" [%s] ", name); 128 | MemoryContextSwitchTo(old_ctx); 129 | } 130 | return res; 131 | } 132 | 133 | #define MTM_ERRMSG(fmt,...) errmsg(MTM_TAG fmt, am(), ## __VA_ARGS__) 134 | 135 | /* 136 | * tag can either one of MtmLogTag values (in which case corresponding GUC 137 | * defines the actual log level) or direct level like ERROR 138 | */ 139 | #define mtm_log(tag, fmt, ...) ereport( \ 140 | ((tag) >= FIRST_UNUSED_ERRCODE ? \ 141 | mtm_log_gucs[tag - FIRST_UNUSED_ERRCODE].val : (tag)), \ 142 | (errmsg(MTM_TAG fmt, \ 143 | am(), ## __VA_ARGS__), \ 144 | errhidestmt(true), errhidecontext(true))) 145 | -------------------------------------------------------------------------------- /src/include/messaging.h: -------------------------------------------------------------------------------- 1 | 2 | /***************************************************************************** 3 | * 4 | * Messaging 5 | * 6 | *****************************************************************************/ 7 | #ifndef MESSAGING_H 8 | #define MESSAGING_H 9 | 10 | #include "global_tx.h" 11 | #include "state.h" 12 | 13 | /* 14 | * All messages are stamped with MtmMessageTag that should came before the rest 15 | * of the message. That is used upon receival as typecasting criterion. 
16 | */ 17 | typedef enum 18 | { 19 | T_MtmPrepareResponse = 0, 20 | T_Mtm2AResponse, 21 | T_MtmTxRequest, 22 | T_MtmTxStatusResponse, 23 | T_MtmHeartbeat, 24 | T_MtmGenVoteRequest, 25 | T_MtmGenVoteResponse 26 | } MtmMessageTag; 27 | 28 | typedef struct MtmMessage 29 | { 30 | MtmMessageTag tag; 31 | } MtmMessage; 32 | 33 | #define messageTag(msgptr) (((const MtmMessage *)(msgptr))->tag) 34 | 35 | /* Response to PREPARE by apply worker */ 36 | typedef struct 37 | { 38 | MtmMessageTag tag; 39 | int node_id; 40 | /* for PREPARE we care only about, well, prepare success */ 41 | bool prepared; 42 | int32 errcode; 43 | const char *errmsg; 44 | TransactionId xid; /* identifies the message */ 45 | } MtmPrepareResponse; 46 | 47 | /* 48 | * Response to 2A msg by apply worker or by replier (during resolving). 49 | * This could be named just 2B, ha. 50 | * It is also abused for COMMIT PREPARED ack (with .status = GTXCommitted). 51 | */ 52 | typedef struct 53 | { 54 | MtmMessageTag tag; 55 | int node_id; 56 | /* 57 | * Our prevVote in terms of the Part-Time Parliament paper. Actually there 58 | * is no need to carry the decree (status) itself, ballot (term) is 59 | * enough, but it is kept for convenience. 60 | */ 61 | GlobalTxStatus status; 62 | GlobalTxTerm accepted_term; 63 | int32 errcode; 64 | const char *errmsg; 65 | const char *gid; /* identifies the message */ 66 | } Mtm2AResponse; 67 | 68 | /* 69 | * Response on MtmLastTermRequest request, holds last proposal value. 70 | */ 71 | typedef struct 72 | { 73 | MtmMessageTag tag; 74 | GlobalTxTerm term; 75 | } MtmLastTermResponse; 76 | 77 | /* 78 | * Request to change transaction state. This messages are duplicate of 79 | * corresponding WAL records, but we need them during transaction resolution 80 | * upon recovery as WAL receiver may be blocked by a transaction that we 81 | * are actually resolving. 82 | * 83 | * Sent from mtm-resolver to mtm-status worker. 
84 | */ 85 | typedef enum 86 | { 87 | MTReq_Abort = 0, 88 | MTReq_Commit, 89 | MTReq_Precommit, /* 2a with value commit */ 90 | MTReq_Preabort, /* 2a with value abort */ 91 | MTReq_Status /* 1a */ 92 | } MtmTxRequestValue; 93 | 94 | typedef struct 95 | { 96 | MtmMessageTag tag; 97 | MtmTxRequestValue type; 98 | GlobalTxTerm term; 99 | const char *gid; 100 | int coordinator; 101 | uint64 gen_num; 102 | XLogRecPtr coordinator_end_lsn; /* matters for 1a */ 103 | } MtmTxRequest; 104 | 105 | extern char const * const MtmTxRequestValueMnem[]; 106 | 107 | /* 108 | * Status response, phase 1b of paxos on a given transaction result. 109 | * Sent from mtm-status to mtm-resolver worker. 110 | */ 111 | typedef struct 112 | { 113 | MtmMessageTag tag; 114 | int node_id; 115 | GTxState state; 116 | const char *gid; 117 | } MtmTxStatusResponse; 118 | 119 | /* 120 | * Data sent in dmq heartbeats. 121 | */ 122 | typedef struct 123 | { 124 | MtmMessageTag tag; 125 | MtmGeneration current_gen; 126 | uint64 donors; /* xxx nodemask_t */ 127 | uint64 last_online_in; 128 | uint64 connected_mask; /* xxx nodemask_t */ 129 | } MtmHeartbeat; 130 | 131 | /* 132 | * Request to vote for new generation. 133 | */ 134 | typedef struct 135 | { 136 | MtmMessageTag tag; 137 | MtmGeneration gen; 138 | } MtmGenVoteRequest; 139 | 140 | /* 141 | * Reply to new generation vote request. 142 | */ 143 | typedef struct 144 | { 145 | MtmMessageTag tag; 146 | uint64 gen_num; /* identifies the message */ 147 | uint8 vote_ok; 148 | /* last_online_in of replier on the moment of voting, determines donors */ 149 | uint64 last_online_in; 150 | /* 151 | * if vote_ok is false this might be a valid gen number showing that 152 | * replier couldn't vote because its last_vote is higher. 153 | */ 154 | uint64 last_vote_num; 155 | /* 156 | * curr gen donors of the responder and its donors. Sometimes we wish to 157 | * send it along with refusal to vote, see HandleGenVoteRequest. 
158 | */ 159 | MtmGeneration curr_gen; 160 | uint64_t curr_gen_donors; 161 | } MtmGenVoteResponse; 162 | 163 | 164 | StringInfo MtmMessagePack(MtmMessage *anymsg); 165 | MtmMessage *MtmMessageUnpack(StringInfo s); 166 | char *MtmMesageToString(MtmMessage *anymsg); 167 | 168 | #endif /* MESSAGING_H */ 169 | -------------------------------------------------------------------------------- /src/include/mtm_utils.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * mtm_utils.h 4 | * Utility functions: 5 | * - disable global timeouts settings; 6 | * - libpq connect function wrappers. 7 | * 8 | * 9 | * Copyright (c) 2022, Postgres Professional 10 | * 11 | *------------------------------------------------------------------------- 12 | */ 13 | #ifndef MTM_UTILS_H 14 | #define MTM_UTILS_H 15 | 16 | #include "libpq/pqformat.h" 17 | #include "libpq-fe.h" 18 | 19 | extern void MtmDisableTimeouts(void); 20 | 21 | extern PostgresPollingStatusType MtmPQconnectPoll(PGconn *conn); 22 | extern PGconn* MtmPQconnectdb(const char *conninfo); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/pglogical_config.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_LOGICAL_CONFIG_H 2 | #define PG_LOGICAL_CONFIG_H 3 | 4 | #ifndef PG_VERSION_NUM 5 | #error must be included first 6 | #endif 7 | 8 | #include "nodes/pg_list.h" 9 | #include "pglogical_output.h" 10 | 11 | inline static bool 12 | server_float4_byval(void) 13 | { 14 | #ifdef USE_FLOAT4_BYVAL 15 | return true; 16 | #else 17 | return false; 18 | #endif 19 | } 20 | 21 | inline static bool 22 | server_float8_byval(void) 23 | { 24 | #ifdef USE_FLOAT8_BYVAL 25 | return true; 26 | #else 27 | return false; 28 | #endif 29 | } 30 | 31 | inline static bool 32 | server_integer_datetimes(void) 33 | { 34 | #ifdef 
USE_INTEGER_DATETIMES 35 | return true; 36 | #else 37 | return false; 38 | #endif 39 | } 40 | 41 | inline static bool 42 | server_bigendian(void) 43 | { 44 | #ifdef WORDS_BIGENDIAN 45 | return true; 46 | #else 47 | return false; 48 | #endif 49 | } 50 | 51 | extern int process_parameters(List *options, PGLogicalOutputData *data); 52 | 53 | extern List *prepare_startup_message(PGLogicalOutputData *data); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/include/pglogical_hooks.h: -------------------------------------------------------------------------------- 1 | #ifndef PGLOGICAL_HOOKS_H 2 | #define PGLOGICAL_HOOKS_H 3 | 4 | #include "replication/reorderbuffer.h" 5 | 6 | /* public interface for hooks */ 7 | #include "pglogical_output/hooks.h" 8 | #include "pglogical_output.h" 9 | 10 | extern void load_hooks(PGLogicalOutputData *data); 11 | 12 | extern void call_startup_hook(PGLogicalOutputData *data, List *plugin_params); 13 | 14 | extern void call_shutdown_hook(PGLogicalOutputData *data); 15 | 16 | extern bool call_row_filter_hook(PGLogicalOutputData *data, 17 | ReorderBufferTXN *txn, Relation rel, ReorderBufferChange *change); 18 | 19 | extern bool call_txn_filter_hook(PGLogicalOutputData *data, 20 | RepOriginId txn_origin); 21 | 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/include/pglogical_output.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pglogical_output.h 4 | * pglogical output plugin 5 | * 6 | * Copyright (c) 2015, PostgreSQL Global Development Group 7 | * Portions Copyright (c) 2021, Postgres Professional 8 | * 9 | * IDENTIFICATION 10 | * pglogical_output.h 11 | * 12 | *------------------------------------------------------------------------- 13 | */ 14 | #ifndef PG_LOGICAL_OUTPUT_H 15 | #define 
PG_LOGICAL_OUTPUT_H

#include "nodes/parsenodes.h"

#include "replication/logical.h"
#include "replication/output_plugin.h"

#include "storage/lock.h"

#include "pglogical_output/hooks.h"

#include "pglogical_proto.h"

#include "multimaster.h"

/* native protocol version spoken / minimum version accepted by this plugin */
#define PG_LOGICAL_PROTO_VERSION_NUM 1
#define PG_LOGICAL_PROTO_MIN_VERSION_NUM 1

/*
 * The name of a hook function. This is used instead of the usual List*
 * because it can serve as a hash key.
 *
 * Must be zeroed on allocation if used as a hash key since padding is
 * *not* ignored on compare.
 */
typedef struct HookFuncName
{
	/* funcname is more likely to be unique, so goes first */
	char		function[NAMEDATALEN];
	char		schema[NAMEDATALEN];
} HookFuncName;

/* Decoder-private state of the multimaster output plugin. */
typedef struct MtmDecoderPrivate
{
	int			receiver_node_id;	/* id of the receiving node */
	bool		is_recovery;		/* recovery vs normal replication mode */
	MtmConfig  *cfg;				/* cluster configuration snapshot */
} MtmDecoderPrivate;

/* Per-walsender state of the output plugin. */
typedef struct PGLogicalOutputData
{
	MemoryContext context;

	/* protocol writer dispatch table (native or json) */
	PGLogicalProtoAPI *api;

	/* protocol */
	bool		allow_internal_basetypes;
	bool		allow_binary_basetypes;
	bool		forward_changesets;
	bool		forward_changeset_origins;
	int			field_datum_encoding;

	/*
	 * client info
	 *
	 * Lots of this should move to a separate shorter-lived struct used only
	 * during parameter reading, since it contains what the client asked for.
	 * Once we've processed this during startup we don't refer to it again.
	 */
	uint32		client_pg_version;
	uint32		client_max_proto_version;
	uint32		client_min_proto_version;
	const char *client_expected_encoding;
	const char *client_protocol_format;
	uint32		client_binary_basetypes_major_version;
	bool		client_want_internal_basetypes_set;
	bool		client_want_internal_basetypes;
	bool		client_want_binary_basetypes_set;
	bool		client_want_binary_basetypes;
	bool		client_binary_bigendian_set;
	bool		client_binary_bigendian;
	uint32		client_binary_sizeofdatum;
	uint32		client_binary_sizeofint;
	uint32		client_binary_sizeoflong;
	bool		client_binary_float4byval_set;
	bool		client_binary_float4byval;
	bool		client_binary_float8byval_set;
	bool		client_binary_float8byval;
	bool		client_binary_intdatetimes_set;
	bool		client_binary_intdatetimes;
	bool		client_forward_changesets_set;
	bool		client_forward_changesets;
	bool		client_no_txinfo;

	/* hooks */
	List	   *hooks_setup_funcname;
	struct PGLogicalHooks hooks;
	MemoryContext hooks_mctxt;	/* long-lived context hooks run in */

	/* DefElem list populated by startup hook */
	List	   *extra_startup_params;
} PGLogicalOutputData;

/* One decoded tuple: per-attribute values, null flags and changed flags. */
typedef struct PGLogicalTupleData
{
	Datum		values[MaxTupleAttributeNumber];
	bool		nulls[MaxTupleAttributeNumber];
	bool		changed[MaxTupleAttributeNumber];
} PGLogicalTupleData;

extern void MtmOutputPluginWrite(LogicalDecodingContext *ctx, bool last_write, bool flush);
extern void MtmOutputPluginPrepareWrite(LogicalDecodingContext *ctx, bool last_write, bool flush);

#endif							/* PG_LOGICAL_OUTPUT_H */

#ifndef PG_LOGICAL_COMPAT_H
#define PG_LOGICAL_COMPAT_H

#include "pg_config.h"

/* 9.4 lacks
replication origins */
#if PG_VERSION_NUM >= 90500
#define HAVE_REPLICATION_ORIGINS
#else
/* To allow the same signature on hooks in 9.4 */
typedef uint16 RepOriginId;
#define InvalidRepOriginId 0
#endif

/* 9.4 lacks PG_UINT32_MAX */
#ifndef PG_UINT32_MAX
#define PG_UINT32_MAX UINT32_MAX
#endif

#ifndef PG_INT32_MAX
#define PG_INT32_MAX INT32_MAX
#endif

#ifndef PG_INT32_MIN
#define PG_INT32_MIN INT32_MIN
#endif

#endif

#ifndef PGLOGICAL_OUTPUT_HOOKS_H
#define PGLOGICAL_OUTPUT_HOOKS_H

#include "access/xlogdefs.h"
#include "nodes/pg_list.h"
#include "utils/rel.h"
#include "utils/palloc.h"
#include "replication/reorderbuffer.h"

#include "pglogical_output/compat.h"

/*
 * This header is to be included by extensions that implement pglogical output
 * plugin callback hooks for transaction origin and row filtering, etc. It is
 * installed as "pglogical_output/hooks.h"
 *
 * See the README.md and the example in examples/hooks/ for details on hooks.
 */


/* Arguments for the startup hook; in_params is what the client sent. */
struct PGLogicalStartupHookArgs
{
	void	   *private_data;	/* hook-owned opaque state */
	List	   *in_params;
	List	   *out_params;		/* extra params the hook reports back */
};

typedef void (*pglogical_startup_hook_fn) (struct PGLogicalStartupHookArgs *args);


struct PGLogicalTxnFilterArgs
{
	void	   *private_data;
	RepOriginId origin_id;		/* origin of the txn under consideration */
};

typedef bool (*pglogical_txn_filter_hook_fn) (struct PGLogicalTxnFilterArgs *args);


struct PGLogicalRowFilterArgs
{
	void	   *private_data;
	Relation	changed_rel;	/* relation the change applies to */
	enum ReorderBufferChangeType change_type;
	/* detailed row change event from logical decoding */
	ReorderBufferChange *change;
};

typedef bool (*pglogical_row_filter_hook_fn) (struct PGLogicalRowFilterArgs *args);


struct PGLogicalShutdownHookArgs
{
	void	   *private_data;
};

typedef void (*pglogical_shutdown_hook_fn) (struct PGLogicalShutdownHookArgs *args);

/*
 * This struct is passed to the pglogical_get_hooks_fn as the first argument,
 * typed 'internal', and is unwrapped with `DatumGetPointer`.
 */
struct PGLogicalHooks
{
	pglogical_startup_hook_fn startup_hook;
	pglogical_shutdown_hook_fn shutdown_hook;
	pglogical_txn_filter_hook_fn txn_filter_hook;
	pglogical_row_filter_hook_fn row_filter_hook;
	void	   *hooks_private_data;
};


#endif							/* PGLOGICAL_OUTPUT_HOOKS_H */

/*-------------------------------------------------------------------------
 *
 * pglogical_proto.h
 *		pglogical protocol
 *
 * Copyright (c) 2015, PostgreSQL Global Development Group
 * Portions Copyright (c) 2021, Postgres Professional
 *
 * IDENTIFICATION
 *	  pglogical_proto.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef PG_LOGICAL_PROTO_H
#define PG_LOGICAL_PROTO_H

struct PGLogicalOutputData;
struct PGLRelMetaCacheEntry;

extern char *walsender_name;

typedef void (*pglogical_write_rel_fn) (StringInfo out, struct PGLogicalOutputData *data,
										Relation rel /* , struct
													  * PGLRelMetaCacheEntry
													  * *cache_entry */ );

typedef void (*pglogical_write_begin_fn) (StringInfo out, struct PGLogicalOutputData *data,
										  ReorderBufferTXN *txn);
typedef void (*pglogical_write_message_fn) (StringInfo out, LogicalDecodingContext *ctx,
											XLogRecPtr end_lsn,
											const char *prefix, Size sz, const char *message);
typedef void (*pglogical_write_commit_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   ReorderBufferTXN *txn, XLogRecPtr commit_lsn);

typedef void (*pglogical_write_origin_fn) (StringInfo out, const char *origin,
										   XLogRecPtr origin_lsn);

typedef void (*pglogical_write_insert_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   Relation rel, HeapTuple
newtuple);
typedef void (*pglogical_write_update_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   Relation rel, HeapTuple oldtuple,
										   HeapTuple newtuple);
typedef void (*pglogical_write_delete_fn) (StringInfo out, struct PGLogicalOutputData *data,
										   Relation rel, HeapTuple oldtuple);

typedef void (*pglogical_write_caughtup_fn) (StringInfo out, struct PGLogicalOutputData *data,
											 XLogRecPtr wal_end_ptr);

typedef void (*write_startup_message_fn) (StringInfo out, List *msg);

typedef void (*pglogical_setup_hooks_fn) (struct PGLogicalHooks *hooks);

/*
 * Dispatch table of protocol writer callbacks; filled in by
 * pglogical_init_api() for the chosen protocol flavor.
 */
typedef struct PGLogicalProtoAPI
{
	pglogical_write_rel_fn write_rel;
	pglogical_write_begin_fn write_begin;
	pglogical_write_message_fn write_message;
	pglogical_write_commit_fn write_commit;
	pglogical_write_origin_fn write_origin;
	pglogical_write_insert_fn write_insert;
	pglogical_write_update_fn write_update;
	pglogical_write_delete_fn write_delete;
	pglogical_write_caughtup_fn write_caughtup;
	pglogical_setup_hooks_fn setup_hooks;
	write_startup_message_fn write_startup_message;
} PGLogicalProtoAPI;


typedef enum PGLogicalProtoType
{
	PGLogicalProtoNative,
	PGLogicalProtoJson
} PGLogicalProtoType;

extern PGLogicalProtoAPI *pglogical_init_api(PGLogicalProtoType typ);


extern void pglogical_write_abort(StringInfo out,
								  struct PGLogicalOutputData *data,
								  ReorderBufferTXN *txn, XLogRecPtr lsn);
extern void pglogical_write_prepare(StringInfo out,
									struct PGLogicalOutputData *data,
									ReorderBufferTXN *txn, XLogRecPtr lsn);
extern void pglogical_write_commit_prepared(StringInfo out,
											struct PGLogicalOutputData *data,
											ReorderBufferTXN *txn, XLogRecPtr lsn);
extern void pglogical_write_abort_prepared(StringInfo out,
										   struct PGLogicalOutputData *data,
										   ReorderBufferTXN *txn, XLogRecPtr lsn);

#endif							/* PG_LOGICAL_PROTO_H */

#ifndef PGLOGICAL_RELID_MAP
#define PGLOGICAL_RELID_MAP

/* initial bucket count of the remote->local relid hash */
#define PGL_INIT_RELID_MAP_SIZE 256

typedef struct PGLRelidMapEntry
{
	Oid			remote_relid;	/* hash key */
	Oid			local_relid;
} PGLRelidMapEntry;

extern Oid	pglogical_relid_map_get(Oid relid);
extern bool pglogical_relid_map_put(Oid remote_relid, Oid local_relid);
extern void pglogical_relid_map_reset(void);
#endif

#ifndef MTM_RECEIVER_H
#define MTM_RECEIVER_H

#include "libpq-fe.h"

typedef enum
{
	REPLMODE_DISABLED,			/* stop the receiver */
	REPLMODE_RECOVERY,			/* pull changes of all origins */
	REPLMODE_NORMAL				/* pull only sender changes, apply in parallel */
} MtmReplicationMode;

/* ugly exported for the sake of MtmDetectGlobalDeadLock */
extern MtmReplicationMode curr_replication_mode;

#define BGW_POOL_BY_NODE_ID(node_id) (&Mtm->pools[(node_id) - 1])

extern char const *const MtmReplicationModeMnem[];

/* forward decl to avoid including global_tx.h */
struct GlobalTx;

/* same for bgwpool.h */
struct BgwPool;

/*
 * Part of MtmReceiverContext used by both main receiver and parallel workers.
 * Exposed for bgwpool/apply needs.
 */
typedef struct
{
	int			sender_node_id;
	MtmReplicationMode mode;
	/* allows to release gtx on ERROR in apply */
	struct GlobalTx *gtx;
	/*
	 * For parallel workers: position of current job in txlist.
	 */
	int			txlist_pos;
	/*
	 * Info about xact currently being executed
	 */
	TransactionId origin_xid;
	bool		reply_pending;
	/*
	 * true means this is xact with plain commit, so we cannot ignore
	 * apply failure
	 */
	bool		bdr_like;

	struct BgwPool *pool;
} MtmReceiverWorkerContext;

extern void MtmWakeupReceivers(void);

extern void MtmExecutor(void *work, size_t size, MtmReceiverWorkerContext *rwctx);
extern void ApplyCancelHandler(SIGNAL_ARGS);
extern void MtmUpdateLsnMapping(int node_id, XLogRecPtr end_lsn);

extern void MtmBeginSession(int nodeId);
extern void MtmEndSession(int nodeId, bool unlock);

#endif

#ifndef RESOLVER_H
#define RESOLVER_H

#include "postmaster/bgworker.h"

/* transaction resolver background worker entry point */
extern void ResolverMain(Datum main_arg);
void		ResolverWake(void);

#endif							/* RESOLVER_H */

#ifndef __SPILL_H__
#define __SPILL_H__

/* spill-to-disk helpers for oversized transactions (see src/spill.c) */
void		MtmSpillToFile(int fd, char const *data, size_t size);
void		MtmCreateSpillDirectory(int node_id);
int			MtmCreateSpillFile(int node_id, int *file_id);
int			MtmOpenSpillFile(int node_id, int file_id);
void		MtmReadSpillFile(int fd, char *data, size_t size);
void		MtmCloseSpillFile(int fd);

#endif

#ifndef STATE_H
#define STATE_H

/*
 * Generation is a uniquely numbered subset of configured nodes
allowed to
 * commit transactions. Each xact is stamped with the generation it belongs
 * to. A transaction must be PREPAREd on *all* generation members before commit;
 * this provides recovery -> normal work transition without risk of reordering
 * xacts.
 *
 * The two main properties of generations are
 * - At each node all prepares of generation n which might ever be committed
 *   lie strictly before all such prepares of generation n+1.
 * - A node which is MTM_GEN_ONLINE in generation n holds all committable
 *   xacts of all generations < n.
 * See generations2.md and MtmGenerations.tla for details.
 *
 * A normal (xact-making) generation contains at least majority
 * members. However, we allow to elect a generation with fewer members as a sort
 * of mark that its members are recovered enough to be included in the
 * following normal generations. It allows nodes to always add *only myself* (but
 * remove anyone else) when campaigning for new generations; thus only the node
 * itself decides when it is recovered enough to force others to wait for it,
 * which simplifies reasoning about who should be the next gen members.
 *
 * Another reason for minority gens' existence is the usage of generations to
 * directly abort transactions when we know they can't ever be prepared; this
 * allows to participate in normal transaction resolution iff the node has the
 * PREPARE. For that to work, we must be sure the live connectivity clique forming
 * a majority eventually forms its generation regardless of the recovery process.
 * c.f. handle_1a for details.
 */
typedef struct MtmGeneration
{
	uint64		num;			/* logical clock aka term number aka ballot */
	uint64		members;		/* xxx extract nodemask.h and use it here */
	/*
	 * Generation has fixed set of configured nodes, which helps consistent
	 * xact resolving with dynamic add/rm of nodes.
	 */
	uint64		configured;		/* xxx extract nodemask.h and use it here */
} MtmGeneration;

#define MtmInvalidGenNum 0
#define EQUAL_GENS(g1, g2) \
	((g1).num == (g2).num && (g1).members == (g2).members && (g1).configured == (g2).configured)
/*
 * Referee is enabled only with 2 nodes and a single member gen is ever proposed
 * as the referee one (requiring referee vote and allowing this single node to
 * be online), so instead of a separate flag use this check.
 *
 * First condition is important as a single node cluster shouldn't access the
 * referee; also, with > 2 nodes there is at least a theoretical possibility of
 * electing a single-node generation after two consecutive minority gen
 * elections.
 */
#define IS_REFEREE_GEN(members, configured) \
	(popcount(configured) == 2 && popcount(members) == 1)

typedef enum
{
	MTM_GEN_DEAD,				/* can't ever be online in this gen */
	MTM_GEN_RECOVERY,			/* need to pull in recovery latest xacts before */
								/* starting making my own and receiving normally */
	MTM_GEN_ONLINE				/* participating normally */
} MtmStatusInGen;

typedef enum
{
	/*
	 * We were not excluded to the best of our knowledge, but we don't see all
	 * peers from current generation, so commits will likely fail.
	 */
	MTM_ISOLATED,

	/*
	 * We were excluded and definitely need recovery, but not yet sure from
	 * whom as we don't see majority.
	 */
	MTM_DISABLED,

	/*
	 * We are catching up, eating changes committed without us participating.
	 * Other nodes don't wait for us yet, so this doesn't freeze the cluster.
	 */
	MTM_CATCHUP,

	/*
	 * Generation with us was elected and others started waiting for us, but
	 * we need to eat the latest changes in recovery mode to participate
	 * normally.
	 */
	MTM_RECOVERY,

	/*
	 * It's Twelve O'clock and All's Well.
	 */
	MTM_ONLINE,
} MtmNodeStatus;

extern char const *const MtmNodeStatusMnem[];

extern void MtmStateInit(void);
extern void MtmStateShmemStartup(void);
extern void MtmStateStartup(void);

/* generation management */
extern uint64 MtmGetCurrentGenNum(void);
extern MtmGeneration MtmGetCurrentGen(bool locked);
extern void MtmConsiderGenSwitch(MtmGeneration gen, nodemask_t donors);
extern bool MtmHandleParallelSafe(MtmGeneration ps_gen, nodemask_t ps_donors,
								  bool is_recovery, XLogRecPtr end_lsn);
extern MtmStatusInGen MtmGetCurrentStatusInGen(void);
extern MtmStatusInGen MtmGetCurrentStatusInGenNotLocked(void);
extern MtmNodeStatus MtmGetCurrentStatus(bool gen_locked, bool vote_locked);

/* receiver bits */
extern void MtmReportReceiverCaughtup(int node_id);
/* we should recover, but are not yet sure from whom */
#define RECEIVE_MODE_DISABLED (~(uint32)0)
/* all receivers work normally */
#define RECEIVE_MODE_NORMAL 0
#define IS_RECEIVE_MODE_DONOR(rcv_mode) ((rcv_mode) != RECEIVE_MODE_NORMAL && \
										 ((rcv_mode) != RECEIVE_MODE_DISABLED))
extern MtmReplicationMode MtmGetReceiverMode(int nodeId);

/* connectivity */
extern nodemask_t MtmGetDmqReceiversMask(void);
extern nodemask_t MtmGetConnectedMask(bool locked);
extern nodemask_t MtmGetConnectedMaskWithMe(bool locked);
extern void *MtmOnDmqReceiverConnect(char *node_name);
extern void MtmOnDmqReceiverHeartbeat(char *node_name, StringInfo msg, void *extra);
extern void MtmOnDmqReceiverDisconnect(char *node_name);
extern void MtmOnDmqSenderConnect(char *node_name);
extern void MtmOnDmqSenderHeartbeat(char *node_name, StringInfo buf);
extern void MtmOnDmqSenderDisconnect(char *node_name);

extern void AcquirePBByPreparer(bool backend);
extern void AcquirePBByHolder(bool full);
extern void ReleasePB(void);

/* bgws */
extern void CampaignerMain(Datum main_arg);
extern void ReplierMain(Datum main_arg);
extern void MtmMonitor(Datum arg);
extern void MtmMonitorStart(Oid db_id, Oid user_id);

/* not cleaned up yet */
extern void MtmRefreshClusterStatus(void);
extern nodemask_t MtmGetDisabledNodeMask(void);
extern nodemask_t MtmGetEnabledNodeMask(bool ignore_disabled);
extern void CampaignerStop(void);

#endif

/*-------------------------------------------------------------------------
 *
 * syncpoint.h
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2021, Postgres Professional
 *
 *-------------------------------------------------------------------------
 */
#ifndef SYNCPOINT_H
#define SYNCPOINT_H

#include "access/xlogdefs.h"
#include "libpq-fe.h"
#include "utils/hsearch.h"
#include "replication/walsender.h"

/* pair of matching WAL positions: in the origin's WAL and in our local WAL */
typedef struct
{
	XLogRecPtr	origin_lsn;
	XLogRecPtr	local_lsn;
} Syncpoint;

/*
 * Used as a hashkey in recovery filter.
 *
 * NB: make sure to memset this structure to zeroes before using as hashkey
 * because it contains 4-byte padding hole in the middle.
 */
typedef struct
{
	int			node_id;
	XLogRecPtr	origin_lsn;
} FilterEntry;


extern int	MtmSyncpointInterval;


extern void MaybeLogSyncpoint(void);
extern void SyncpointRegister(int origin_node_id, XLogRecPtr origin_lsn,
							  XLogRecPtr receiver_lsn);
extern Syncpoint SyncpointGetLatest(int origin_node_id);
extern Syncpoint *SyncpointGetAllLatest(int sender_node_id);
extern XLogRecPtr GetRecoveryHorizon(int sender_node_id);
extern void UpdateRecoveryHorizons(void);
extern HTAB *RecoveryFilterLoad(int filter_node_id, Syncpoint *spvector, MtmConfig *mtm_cfg);

extern char *pg_lsn_out_c(XLogRecPtr lsn);

#endif							/* SYNCPOINT_H */

/*----------------------------------------------------------------------------
 *
 * mtm_utils.c
 *		Utility functions
 *
 * Copyright (c) 2022, Postgres Professional
 *
 *----------------------------------------------------------------------------
 */

#include "logger.h"
#include "mtm_utils.h"

#include "utils/timeout.h"

/*
 * Disables timeouts on a client side:
 * - statement_timeout;
 * - lock_timeout;
 * - idle_in_transaction_session_timeout;
 * - idle_session_timeout.
 *
 * These timeouts, when set in the postgres config file, affect all processes.
 * The multimaster needs its sessions not to be interrupted, so we disable
 * them.
 *
 * Returns false (after logging a WARNING) if any of the PQexec calls fails.
28 | */ 29 | static bool 30 | disable_client_timeouts(PGconn *conn) 31 | { 32 | PGresult *res; 33 | 34 | res = PQexec(conn, "SET statement_timeout = 0"); 35 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 36 | { 37 | mtm_log(WARNING, "failed to set statement_timeout: %s", 38 | pchomp(PQerrorMessage(conn))); 39 | return false; 40 | } 41 | 42 | res = PQexec(conn, "SET lock_timeout = 0"); 43 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 44 | { 45 | mtm_log(WARNING, "failed to set lock_timeout: %s", 46 | pchomp(PQerrorMessage(conn))); 47 | return false; 48 | } 49 | 50 | res = PQexec(conn, "SET idle_in_transaction_session_timeout = 0"); 51 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 52 | { 53 | mtm_log(WARNING, "failed to set idle_in_transaction_session_timeout: %s", 54 | pchomp(PQerrorMessage(conn))); 55 | return false; 56 | } 57 | 58 | res = PQexec(conn, "SET idle_session_timeout = 0"); 59 | if (PQresultStatus(res) != PGRES_COMMAND_OK) 60 | { 61 | mtm_log(WARNING, "failed to set idle_session_timeout: %s", 62 | pchomp(PQerrorMessage(conn))); 63 | return false; 64 | } 65 | 66 | return true; 67 | } 68 | 69 | /* 70 | * Disable timeouts for a current process 71 | * - statement_timeout; 72 | * - lock_timeout; 73 | * - idle_in_transaction_session_timeout; 74 | * - idle_session_timeout. 
 *
 * We disable these timeouts for the same reason as in disable_client_timeouts().
 */
extern void
MtmDisableTimeouts(void)
{
	/* second argument 'false': do not keep the timeout indicator set */
	if (get_timeout_active(STATEMENT_TIMEOUT))
		disable_timeout(STATEMENT_TIMEOUT, false);
	if (get_timeout_active(LOCK_TIMEOUT))
		disable_timeout(LOCK_TIMEOUT, false);
	if (get_timeout_active(IDLE_IN_TRANSACTION_SESSION_TIMEOUT))
		disable_timeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT, false);
	if (get_timeout_active(IDLE_SESSION_TIMEOUT))
		disable_timeout(IDLE_SESSION_TIMEOUT, false);
}

/*
 * Wrapper on PQconnectPoll
 *
 * Once the connection reaches PGRES_POLLING_OK, disables the client-side
 * timeouts; a failure to disable them is reported as PGRES_POLLING_FAILED.
 */
PostgresPollingStatusType
MtmPQconnectPoll(PGconn *conn)
{
	PostgresPollingStatusType status;

	status = PQconnectPoll(conn);
	if (status != PGRES_POLLING_OK)
		return status;

	if (!disable_client_timeouts(conn))
		status = PGRES_POLLING_FAILED;

	return status;
}

/*
 * Wrapper on PQconnectdb
 *
 * On connect disables timeouts on a client side.
 *
 * NOTE(review): when timeout disabling fails the conn is PQfinish'ed and
 * NULL is returned, whereas plain PQconnectdb returns NULL only on OOM --
 * callers must check for NULL in addition to PQstatus(); confirm they do.
 */
PGconn *
MtmPQconnectdb(const char *conninfo)
{
	PGconn	   *conn;

	conn = PQconnectdb(conninfo);
	if (PQstatus(conn) != CONNECTION_OK)
		return conn;			/* caller inspects the failed connection */

	if (!disable_client_timeouts(conn))
	{
		PQfinish(conn);
		return NULL;
	}

	return conn;
}

/*-------------------------------------------------------------------------
 *
 * pglogical_hooks.c
 *
 * Portions Copyright (c) 2015-2021, Postgres Professional
 * Portions Copyright (c) 2015-2020, PostgreSQL Global Development Group
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/xact.h"

#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"

#include "replication/origin.h"

#include "parser/parse_func.h"

#include "utils/acl.h"
#include "utils/lsyscache.h"

#include "miscadmin.h"

#include "pglogical_hooks.h"
#include "pglogical_output.h"

#include "multimaster.h"
#include "logger.h"

/*
 * Returns Oid of the hooks function specified in funcname.
 *
 * Error is thrown if the function doesn't exist, doesn't return the correct
 * datatype, is VOLATILE, or the current user lacks EXECUTE permission on it.
 */
static Oid
get_hooks_function_oid(List *funcname)
{
	Oid			funcid;
	Oid			funcargtypes[1];

	/* hook setup functions take a single argument of type 'internal' */
	funcargtypes[0] = INTERNALOID;

	/* find the function; missing_ok=false, so this errors out when absent */
	funcid = LookupFuncName(funcname, 1, funcargtypes, false);

	/* Validate that the function returns void */
	if (get_func_rettype(funcid) != VOIDOID)
	{
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 MTM_ERRMSG("function %s must return void",
							NameListToString(funcname))));
	}

	if (func_volatile(funcid) == PROVOLATILE_VOLATILE)
	{
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 MTM_ERRMSG("function %s must not be VOLATILE",
							NameListToString(funcname))));
	}

	if (pg_proc_aclcheck(funcid, GetUserId(), ACL_EXECUTE) != ACLCHECK_OK)
	{
		const char *username;

		/* GetUserNameFromId() gained a noerr argument in 9.5 */
#if PG_VERSION_NUM >= 90500
		username = GetUserNameFromId(GetUserId(), false);
#else
		username = GetUserNameFromId(GetUserId());
#endif
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 MTM_ERRMSG("current user %s does not have permission to call function %s",
							username, NameListToString(funcname))));
	}

	return funcid;
}

/*
 * If a hook setup function was specified in the startup parameters, look it
 * up in the catalogs, check permissions, call it, and store the resulting
 * hook info struct.
 */
void
load_hooks(PGLogicalOutputData *data)
{
	Oid			hooks_func;
	MemoryContext old_ctxt;
	bool		txn_started = false;

	/* the catalog lookups below require a transaction */
	if (!IsTransactionState())
	{
		txn_started = true;
		StartTransactionCommand();
	}

	if (data->hooks_setup_funcname != NIL)
	{
		hooks_func = get_hooks_function_oid(data->hooks_setup_funcname);

		/* run the setup function in the long-lived hooks memory context */
		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(void) OidFunctionCall1(hooks_func, PointerGetDatum(&data->hooks));
		MemoryContextSwitchTo(old_ctxt);

		elog(DEBUG3, "pglogical_output: Loaded hooks from function %u. Hooks are: \n"
			 "\tstartup_hook: %p\n"
			 "\tshutdown_hook: %p\n"
			 "\trow_filter_hook: %p\n"
			 "\ttxn_filter_hook: %p\n"
			 "\thooks_private_data: %p\n",
			 hooks_func,
			 data->hooks.startup_hook,
			 data->hooks.shutdown_hook,
			 data->hooks.row_filter_hook,
			 data->hooks.txn_filter_hook,
			 data->hooks.hooks_private_data);
	}
	else if (data->api->setup_hooks)
	{
		/* no SQL-level setup function: fall back to the compiled-in one */
		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(*data->api->setup_hooks) (&data->hooks);
		MemoryContextSwitchTo(old_ctxt);
	}

	if (txn_started)
		CommitTransactionCommand();
}

/*
 * Invoke the client-supplied startup hook, if any.  The hook may replace its
 * private data pointer and hand back extra startup params via out_params.
 */
void
call_startup_hook(PGLogicalOutputData *data, List *plugin_params)
{
	struct PGLogicalStartupHookArgs args;
	MemoryContext old_ctxt;

	if (data->hooks.startup_hook != NULL)
	{
		bool		tx_started = false;

		args.private_data = data->hooks.hooks_private_data;
		args.in_params = plugin_params;
		args.out_params = NIL;

		elog(DEBUG3, "calling pglogical startup hook");

		/* the hook may perform catalog access, so ensure a transaction */
		if (!IsTransactionState())
		{
			tx_started = true;
			StartTransactionCommand();
		}

		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(void) (*data->hooks.startup_hook) (&args);
		MemoryContextSwitchTo(old_ctxt);

		if (tx_started)
			CommitTransactionCommand();

		data->extra_startup_params = args.out_params;
		/* The startup hook might change the private data seg */
		data->hooks.hooks_private_data = args.private_data;

		elog(DEBUG3, "called pglogical startup hook");
	}
}

/*
 * Invoke the client-supplied shutdown hook, if any.
 */
void
call_shutdown_hook(PGLogicalOutputData *data)
{
	struct PGLogicalShutdownHookArgs args;
	MemoryContext old_ctxt;

	if (data->hooks.shutdown_hook != NULL)
	{
		args.private_data = data->hooks.hooks_private_data;

		elog(DEBUG3, "calling pglogical shutdown hook");

		old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
		(void) (*data->hooks.shutdown_hook) (&args);
		MemoryContextSwitchTo(old_ctxt);

		data->hooks.hooks_private_data = args.private_data;

		elog(DEBUG3, "called pglogical shutdown hook");
	}
}

/*
 * Decide if the individual change should be filtered out by
 * calling a client-provided hook.
195 | */ 196 | bool 197 | call_row_filter_hook(PGLogicalOutputData *data, ReorderBufferTXN *txn, 198 | Relation rel, ReorderBufferChange *change) 199 | { 200 | struct PGLogicalRowFilterArgs hook_args; 201 | MemoryContext old_ctxt; 202 | bool ret = true; 203 | 204 | if (data->hooks.row_filter_hook != NULL) 205 | { 206 | hook_args.change_type = change->action; 207 | hook_args.private_data = data->hooks.hooks_private_data; 208 | hook_args.changed_rel = rel; 209 | 210 | elog(DEBUG3, "calling pglogical row filter hook"); 211 | 212 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt); 213 | ret = (*data->hooks.row_filter_hook) (&hook_args); 214 | MemoryContextSwitchTo(old_ctxt); 215 | 216 | /* Filter hooks shouldn't change the private data ptr */ 217 | Assert(data->hooks.hooks_private_data == hook_args.private_data); 218 | 219 | elog(DEBUG3, "called pglogical row filter hook, returned %d", (int) ret); 220 | } 221 | 222 | return ret; 223 | } 224 | 225 | bool 226 | call_txn_filter_hook(PGLogicalOutputData *data, RepOriginId txn_origin) 227 | { 228 | struct PGLogicalTxnFilterArgs hook_args; 229 | bool ret = true; 230 | MemoryContext old_ctxt; 231 | 232 | if (data->hooks.txn_filter_hook != NULL) 233 | { 234 | hook_args.private_data = data->hooks.hooks_private_data; 235 | hook_args.origin_id = txn_origin; 236 | 237 | elog(DEBUG3, "calling pglogical txn filter hook"); 238 | 239 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt); 240 | ret = (*data->hooks.txn_filter_hook) (&hook_args); 241 | MemoryContextSwitchTo(old_ctxt); 242 | 243 | /* Filter hooks shouldn't change the private data ptr */ 244 | Assert(data->hooks.hooks_private_data == hook_args.private_data); 245 | 246 | elog(DEBUG3, "called pglogical txn filter hook, returned %d", (int) ret); 247 | } 248 | 249 | return ret; 250 | } 251 | -------------------------------------------------------------------------------- /src/pglogical_relid_map.c: 
-------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pglogical_relid_map.c 4 | * Logical Replication map of local Oids to to remote 5 | * 6 | * Portions Copyright (c) 2015-2021, Postgres Professional 7 | * Portions Copyright (c) 2015-2020, PostgreSQL Global Development Group 8 | * 9 | * 10 | * IDENTIFICATION 11 | * pglogical_relid_map.c 12 | * 13 | *------------------------------------------------------------------------- 14 | */ 15 | #include "postgres.h" 16 | #include "utils/hsearch.h" 17 | #include "pglogical_relid_map.h" 18 | 19 | static HTAB *relid_map; 20 | 21 | static void 22 | pglogical_relid_map_init(void) 23 | { 24 | HASHCTL ctl; 25 | 26 | Assert(relid_map == NULL); 27 | 28 | MemSet(&ctl, 0, sizeof(ctl)); 29 | ctl.keysize = sizeof(Oid); 30 | ctl.entrysize = sizeof(PGLRelidMapEntry); 31 | relid_map = hash_create("pglogical_relid_map", PGL_INIT_RELID_MAP_SIZE, &ctl, HASH_ELEM | HASH_BLOBS); 32 | 33 | Assert(relid_map != NULL); 34 | } 35 | 36 | Oid 37 | pglogical_relid_map_get(Oid relid) 38 | { 39 | if (relid_map != NULL) 40 | { 41 | PGLRelidMapEntry *entry = (PGLRelidMapEntry *) hash_search(relid_map, &relid, HASH_FIND, NULL); 42 | 43 | return entry ? 
entry->local_relid : InvalidOid; 44 | } 45 | return InvalidOid; 46 | } 47 | 48 | bool 49 | pglogical_relid_map_put(Oid remote_relid, Oid local_relid) 50 | { 51 | bool found; 52 | PGLRelidMapEntry *entry; 53 | 54 | if (relid_map == NULL) 55 | { 56 | pglogical_relid_map_init(); 57 | } 58 | entry = hash_search(relid_map, &remote_relid, HASH_ENTER, &found); 59 | if (found) 60 | { 61 | entry->local_relid = local_relid; 62 | return false; 63 | } 64 | entry->local_relid = local_relid; 65 | return true; 66 | } 67 | 68 | void 69 | pglogical_relid_map_reset(void) 70 | { 71 | if (relid_map != NULL) 72 | { 73 | hash_destroy(relid_map); 74 | relid_map = NULL; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/spill.c: -------------------------------------------------------------------------------- 1 | /*----------------------------------------------------------------------------- 2 | * spill.c 3 | * 4 | * Copyright (c) 2017-2021, Postgres Professional 5 | * 6 | *----------------------------------------------------------------------------- 7 | */ 8 | #include "postgres.h" 9 | 10 | #include 11 | #include 12 | #include "storage/fd.h" 13 | #include "spill.h" 14 | #include "pgstat.h" 15 | 16 | #include "multimaster.h" 17 | #include "logger.h" 18 | 19 | void 20 | MtmSpillToFile(int fd, char const *data, size_t size) 21 | { 22 | Assert(fd >= 0); 23 | while (size != 0) 24 | { 25 | int written = write(fd, data, size); 26 | 27 | if (written <= 0) 28 | { 29 | close(fd); 30 | ereport(ERROR, 31 | (errcode_for_file_access(), 32 | MTM_ERRMSG("pglogical_recevier failed to spill transaction to file: %m"))); 33 | } 34 | data += written; 35 | size -= written; 36 | } 37 | } 38 | 39 | void 40 | MtmCreateSpillDirectory(int node_id) 41 | { 42 | char path[MAXPGPATH]; 43 | struct dirent *spill_de; 44 | DIR *spill_dir; 45 | 46 | mkdir("pg_mtm", S_IRWXU); 47 | sprintf(path, "pg_mtm/%d", node_id); 48 | mkdir(path, S_IRWXU); 49 | 50 | spill_dir = 
AllocateDir(path); 51 | if (spill_dir == NULL) 52 | { 53 | ereport(PANIC, 54 | (errcode_for_file_access(), 55 | MTM_ERRMSG("pglogical_receiver failed to create spill directory \"%s\": %m", 56 | path))); 57 | } 58 | /* cleanup old files in case of previous crash */ 59 | while ((spill_de = ReadDir(spill_dir, path)) != NULL) 60 | { 61 | if (strncmp(spill_de->d_name, "txn", 3) == 0) 62 | { 63 | sprintf(path, "pg_mtm/%d/%s", node_id, spill_de->d_name); 64 | 65 | if (unlink(path) != 0) 66 | ereport(PANIC, 67 | (errcode_for_file_access(), 68 | MTM_ERRMSG("pglogical_receiver could not remove spill file \"%s\": %m", 69 | path))); 70 | } 71 | } 72 | FreeDir(spill_dir); 73 | } 74 | 75 | 76 | int 77 | MtmCreateSpillFile(int node_id, int *file_id) 78 | { 79 | static int spill_file_id; 80 | char path[MAXPGPATH]; 81 | int fd; 82 | 83 | sprintf(path, "pg_mtm/%d/txn-%d.snap", 84 | node_id, ++spill_file_id); 85 | fd = BasicOpenFile(path, 86 | O_CREAT | O_TRUNC | O_WRONLY | O_APPEND | PG_BINARY); 87 | if (fd < 0) 88 | { 89 | ereport(PANIC, 90 | (errcode_for_file_access(), 91 | MTM_ERRMSG("pglogical_receiver could not create spill file \"%s\": %m", 92 | path))); 93 | } 94 | *file_id = spill_file_id; 95 | return fd; 96 | } 97 | 98 | int 99 | MtmOpenSpillFile(int node_id, int file_id) 100 | { 101 | static char path[MAXPGPATH]; 102 | int fd; 103 | 104 | sprintf(path, "pg_mtm/%d/txn-%d.snap", 105 | node_id, file_id); 106 | fd = OpenTransientFile(path, 107 | O_RDONLY | PG_BINARY); 108 | if (fd < 0) 109 | { 110 | ereport(PANIC, 111 | (errcode_for_file_access(), 112 | MTM_ERRMSG("pglogical_apply could not open spill file \"%s\": %m", 113 | path))); 114 | } 115 | if (unlink(path) < 0) 116 | { /* Should remove file on close */ 117 | ereport(LOG, 118 | (errcode_for_file_access(), 119 | MTM_ERRMSG("pglogical_apply failed to unlink spill file: %m"))); 120 | } 121 | return fd; 122 | } 123 | 124 | void 125 | MtmReadSpillFile(int fd, char *data, size_t size) 126 | { 127 | Assert(fd >= 0); 128 | 
while (size != 0) 129 | { 130 | int rc = read(fd, data, size); 131 | 132 | if (rc <= 0) 133 | { 134 | CloseTransientFile(fd); 135 | ereport(ERROR, 136 | (errcode_for_file_access(), 137 | MTM_ERRMSG("pglogical_apply failed to read spill file: %m"))); 138 | } 139 | data += rc; 140 | size -= rc; 141 | } 142 | } 143 | 144 | void 145 | MtmCloseSpillFile(int fd) 146 | { 147 | if (close(fd) < 0) 148 | ereport(ERROR, 149 | (errcode_for_file_access(), 150 | MTM_ERRMSG("pglogical_recevier failed to close spill file: %m"))); 151 | } 152 | -------------------------------------------------------------------------------- /src/test_bkb.sage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sage 2 | import sys, os 3 | 4 | from sage.all import * 5 | from subprocess import Popen, PIPE, STDOUT 6 | from random import randrange, randint 7 | import unittest 8 | 9 | def run_stdin(input): 10 | mydir = os.path.dirname(os.path.realpath(__file__)) 11 | binfile = mydir + "/../src/a.out" 12 | 13 | p = Popen(binfile, stdout=PIPE, stdin=PIPE, stderr=STDOUT) 14 | grep_stdout = p.communicate(input=input)[0] 15 | return grep_stdout.decode() 16 | 17 | def run_bkb(g): 18 | n = len(g) 19 | params = str(n) + "\n" 20 | for i in range(n): 21 | row = 0 22 | row |= 1 << i 23 | for j in range(n): 24 | if g.has_edge(i, j): 25 | row |= 1 << j 26 | params += str(row) + "\n" 27 | 28 | # print(params) 29 | res = run_stdin(params).strip() 30 | res = [int(n) for n in res.split(' ')] 31 | return res 32 | 33 | 34 | class TestCliqueBKB(unittest.TestCase): 35 | 36 | # test only that max clique size is ok 37 | def test_random_graphs_size(self): 38 | 39 | for _ in range(1000): 40 | n_nodes = randint(1, 60) 41 | n_edges = randrange(1 + (n_nodes * (n_nodes - 1) / 2)) 42 | print("graph |V|={}, |E|={}>".format(n_nodes, n_edges)) 43 | g = graphs.RandomGNM(n_nodes, n_edges) 44 | 45 | clique, clique_size = run_bkb(g) 46 | clique_members = [] 47 | for i in range(n_nodes): 48 
| if (clique & (1 << i)) != 0: 49 | clique_members.append(i) 50 | 51 | sage_clique_maximum = g.clique_maximum() 52 | 53 | print(clique, clique_members, clique_size, sage_clique_maximum, len(sage_clique_maximum)) 54 | self.assertEqual(clique_size, len(sage_clique_maximum)) 55 | 56 | # test that found graph is indeed the clique, much more expensive 57 | def test_random_graphs(self): 58 | 59 | for _ in range(1000): 60 | n_nodes = randint(1, 30) 61 | n_edges = randrange(1 + (n_nodes * (n_nodes - 1) / 2)) 62 | print("graph |V|={}, |E|={}>".format(n_nodes, n_edges)) 63 | g = graphs.RandomGNM(n_nodes, n_edges) 64 | 65 | clique, clique_size = run_bkb(g) 66 | clique_members = [] 67 | for i in range(n_nodes): 68 | if (clique & (1 << i)) != 0: 69 | clique_members.append(i) 70 | 71 | sage_maxcliques = g.cliques_maximal() 72 | print(sage_maxcliques[0]) 73 | 74 | found = False 75 | for sc in sage_maxcliques: 76 | if sc == clique_members: 77 | found = True 78 | self.assertTrue(found) 79 | 80 | print(clique, clique_members, clique_size, sage_maxcliques[0], len(sage_maxcliques[0])) 81 | 82 | 83 | 84 | if __name__ == '__main__': 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /t/000_cross._pl: -------------------------------------------------------------------------------- 1 | # based on 2 | # "Distributed snapshot isolation: global transactions pay globally, 3 | # local transactions pay locally" 4 | # by Binnig et al cross-phenomenon. 
5 | 6 | use strict; 7 | use warnings; 8 | 9 | use Cluster; 10 | use TestLib; 11 | use Test::More tests => 2; 12 | use IPC::Run qw(start finish); 13 | use Cwd; 14 | 15 | my $nnodes = 2; 16 | my $nclients = 2; 17 | my $nkeys = $nnodes * $nclients; 18 | my $cluster = new Cluster($nnodes); 19 | 20 | $cluster->init(); 21 | $cluster->configure(); 22 | $cluster->start(); 23 | 24 | my ($rc, $in, $out, $err); 25 | 26 | $cluster->await_nodes( [0,1] ); 27 | 28 | note("preparing the tables"); 29 | if ($cluster->psql(0, 'postgres', "create table t (k int primary key, v int)")) 30 | { 31 | $cluster->bail_out_with_logs('failed to create t'); 32 | } 33 | 34 | if ($cluster->psql(0, 'postgres', "insert into t (select generate_series(0, $nkeys - 1), 0)")) 35 | { 36 | $cluster->bail_out_with_logs('failed to fill t'); 37 | } 38 | 39 | sub appender 40 | { 41 | my ($appender_id, $clients, $seconds, $node, $inref, $outref) = @_; 42 | 43 | my @argv = ( 44 | 'pgbench', 45 | '-n', 46 | -c => $clients, 47 | -j => $clients, 48 | -T => $seconds, 49 | -h => $node->host(), 50 | -p => $node->port(), 51 | -D => "appender_id=$appender_id", 52 | -D => "clients=$clients", 53 | -f => 'tests/appender.pgb', 54 | 'postgres', 55 | ); 56 | 57 | note("running[" . getcwd() . "]: " . join(' ', @argv)); 58 | 59 | return start(\@argv, $inref, $outref); 60 | } 61 | 62 | sub state_dump 63 | { 64 | my $state = shift; 65 | 66 | note("<<<<<"); 67 | while (my ($key, $value) = each(%{$state})) 68 | { 69 | note("$key -> $value"); 70 | } 71 | note(">>>>>"); 72 | } 73 | 74 | sub state_leq 75 | { 76 | my ($a, $b) = @_; 77 | 78 | while (my ($key, $value) = each(%{$a})) 79 | { 80 | if (!exists($b->{$key})) 81 | { 82 | note("b has no key $key\n"); 83 | return 0; 84 | } 85 | 86 | if ($b->{$key} < $value) 87 | { 88 | note($b->{$key} . 
" < $value\n"); 89 | return 0; 90 | } 91 | } 92 | 93 | return 1; 94 | } 95 | 96 | sub parse_state 97 | { 98 | my $str = shift; 99 | my $state = {}; 100 | 101 | while ($str =~ /(\d+)\|(\d+)/g) 102 | { 103 | $state->{$1} = $2; 104 | } 105 | 106 | return $state; 107 | } 108 | 109 | note("starting appenders"); 110 | note("starting benches"); 111 | $in = ''; 112 | $out = ''; 113 | my @appenders = (); 114 | my $appender_id = 0; 115 | my $seconds = 30; 116 | foreach my $node (@{$cluster->{nodes}}) 117 | { 118 | push(@appenders, appender($appender_id, $nclients, $seconds, $node, \$in, \$out)); 119 | $appender_id++; 120 | } 121 | 122 | my $selects = 0; 123 | my $anomalies = 0; 124 | my $started = time(); 125 | my $node_id = 0; 126 | my $state_a = undef; 127 | my $state_b = undef; 128 | my $out_a = ''; 129 | my $out_b = ''; 130 | while (time() - $started < $seconds) 131 | { 132 | $node_id = ($node_id + 1) % $nnodes; 133 | $state_a = $state_b; 134 | $out_a = $out_b; 135 | ($rc, $out, $err) = $cluster->psql($node_id, 'postgres', "select * from t;"); 136 | $selects++; 137 | $state_b = parse_state($out); 138 | $out_b = $out; 139 | if (defined $state_a) 140 | { 141 | if (!state_leq($state_a, $state_b) && !state_leq($state_a, $state_b)) 142 | { 143 | note("cross anomaly detected:\n===a\n$out_a\n+++b\n$out_b\n---\n"); 144 | $anomalies++; 145 | } 146 | } 147 | } 148 | 149 | note("finishing benches"); 150 | foreach my $appender (@appenders) 151 | { 152 | if (!finish($appender)) 153 | { 154 | $cluster->dumplogs(); 155 | $cluster->bail_out_with_logs("pgbench exited with $?"); 156 | } 157 | } 158 | 159 | is($anomalies, 0, "no cross anomalies after $selects selects"); 160 | 161 | ok($cluster->stop('fast'), "cluster stops"); 162 | 1; 163 | -------------------------------------------------------------------------------- /t/000_deadlock.pl: -------------------------------------------------------------------------------- 1 | # simple deadlock test 2 | 3 | use strict; 4 | use warnings; 5 | 6 
| use Cluster; 7 | use TestLib; 8 | 9 | # Test whether we have both DBI and DBD::pg 10 | my $dbdpg_rc = eval 11 | { 12 | require DBI; 13 | require DBD::Pg; 14 | DBD::Pg->import(':async'); 15 | 1; 16 | }; 17 | 18 | # And tell Test::More to skip the test entirely if not 19 | require Test::More; 20 | if (not $dbdpg_rc) 21 | { 22 | Test::More->import(skip_all => 'DBI and DBD::Pg are not available'); 23 | } 24 | else 25 | { 26 | Test::More->import(tests => 1); 27 | } 28 | 29 | sub query_row 30 | { 31 | my ($dbi, $sql, @keys) = @_; 32 | my $sth = $dbi->prepare($sql) || die; 33 | $sth->execute(@keys) || die; 34 | my $ret = $sth->fetchrow_array || undef; 35 | return $ret; 36 | } 37 | 38 | sub query_exec 39 | { 40 | my ($dbi, $sql) = @_; 41 | my $rv = $dbi->do($sql) || die; 42 | return $rv; 43 | } 44 | 45 | sub query_exec_async 46 | { 47 | my ($dbi, $sql) = @_; 48 | # Since we are not importing DBD::Pg at compilation time, we can't use 49 | # constants from it. 50 | my $DBD_PG_PG_ASYNC = 1; 51 | my $rv = $dbi->do($sql, {pg_async => $DBD_PG_PG_ASYNC}) || die; 52 | return $rv; 53 | } 54 | 55 | my $cluster = new Cluster(2); 56 | 57 | $cluster->init(); 58 | $cluster->start(); 59 | $cluster->create_mm('regression'); 60 | 61 | my ($rc, $out, $err); 62 | sleep(10); 63 | 64 | $cluster->safe_psql(0, "create table t(k int primary key, v text)"); 65 | $cluster->safe_psql(0, "insert into t values (1, 'hello'), (2, 'world')"); 66 | 67 | my @conns = map { DBI->connect('DBI:Pg:' . 
$cluster->connstr($_)) } 0..1; 68 | 69 | query_exec($conns[0], "begin"); 70 | query_exec($conns[1], "begin"); 71 | 72 | query_exec($conns[0], "update t set v = 'asd' where k = 1"); 73 | query_exec($conns[1], "update t set v = 'bsd'"); 74 | 75 | query_exec($conns[0], "update t set v = 'bar' where k = 2"); 76 | query_exec($conns[1], "update t set v = 'foo'"); 77 | 78 | query_exec_async($conns[0], "commit"); 79 | query_exec_async($conns[1], "commit"); 80 | 81 | my $timeout = 16; 82 | while (--$timeout > 0) 83 | { 84 | my $r0 = $conns[0]->pg_ready(); 85 | my $r1 = $conns[1]->pg_ready(); 86 | if ($r0 && $r1) { 87 | last; 88 | } 89 | sleep(1); 90 | } 91 | 92 | if ($timeout > 0) 93 | { 94 | my $succeeded = 0; 95 | $succeeded++ if $conns[0]->pg_result(); 96 | $succeeded++ if $conns[1]->pg_result(); 97 | 98 | pass("queries finished"); 99 | } 100 | else 101 | { 102 | $conns[0]->pg_cancel() unless $conns[0]->pg_ready(); 103 | $conns[1]->pg_cancel() unless $conns[1]->pg_ready(); 104 | 105 | fail("queries timed out"); 106 | } 107 | 108 | query_row($conns[0], "select * from t where k = 1"); 109 | 110 | $cluster->stop('fast'); 111 | -------------------------------------------------------------------------------- /t/000_init._pl: -------------------------------------------------------------------------------- 1 | # test that after create_mm awaited nodes we won't get non-online state 2 | # immediately later. Catches races in MtmGetCurrentStatus logic. 3 | # It is expensive, so not run in the regular suite. 
4 | 5 | use Cluster; 6 | use Test::More tests => 1; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(q{ 10 | }); 11 | $cluster->start(); 12 | $cluster->create_mm('regression'); 13 | 14 | foreach(0..1000) # hopefully enough to catch all related races 15 | { 16 | foreach (0..2) 17 | { 18 | $cluster->safe_psql($_, "select 42"); 19 | } 20 | } 21 | 22 | is(0, 0, "dummy"); # Test::More doesn't like 0 tests, ha 23 | -------------------------------------------------------------------------------- /t/001_regress.pl: -------------------------------------------------------------------------------- 1 | # run core regression tests on multimaster 2 | 3 | # tests known to fail currently and failure reasons: 4 | # - create_index (CREATE INDEX CONCURRENTLY not supported due to deadlock 5 | # issues, see ddl.c) 6 | # - same for index_including, index_including_gist 7 | # - create_table (due to CTAS prepared statement) 8 | # - sanity check (due to pg_publication/subscription masking and other mtm tables) 9 | # - transactions (lack of COMMIT AND CHAIN support) 10 | # - rowsecurity 11 | # - atx, atx5 12 | # - rules (_pg_prepared_xacts and similar) 13 | # - publication, subscription (_pg_publication/subscription masking) 14 | # - prepare (CTAS prepared statement) 15 | # - indexing (again CIC). 16 | # 17 | # original test output/diffs are at $ENV{TESTDIR}/tmp_check/regress_outdir; 18 | # (in normal build TESTDIR is just mmts/; in vpath it is 'external' mmts/) 19 | # then diff is censored and copied to $ENV{TESTDIR}/results. 20 | 21 | use Cluster; 22 | use File::Basename; 23 | use IPC::Run 'run'; 24 | use Test::More; 25 | 26 | # With PGXS the sources are unavailable, so we can't obtain schedules and core 27 | # test themselves. 
28 | if ($ENV{'PGXS'}) 29 | { 30 | # Test::More doesn't like no tests at all 31 | is(0, 0, "dummy"); 32 | done_testing(); 33 | exit(0); 34 | } 35 | 36 | # determenistic ports for expected files 37 | $PostgresNode::last_port_assigned = 55431; 38 | 39 | my $cluster = new Cluster(3); 40 | $cluster->init(q{ 41 | multimaster.volkswagen_mode = on 42 | # allow to spoof pg_prepared_xacts view 43 | allow_system_table_mods = on 44 | }); 45 | $cluster->start(); 46 | $cluster->create_mm('regression'); 47 | 48 | ############################################################################### 49 | # postgres regression tests 50 | ############################################################################### 51 | 52 | # configure db output format like pg_regress 53 | # In particular, pg_regress explicitly sets PGTZ=PST8PDT, and it turns out some 54 | # tests (including DDL! (see volatile_partbound_test)) depend on current_time, 55 | # so mtm receiver ought to use the same timezone to pass them. 56 | $cluster->{nodes}->[0]->safe_psql('regression', q{ 57 | ALTER DATABASE "regression" SET lc_messages TO 'C'; 58 | ALTER DATABASE "regression" SET lc_monetary TO 'C'; 59 | ALTER DATABASE "regression" SET lc_numeric TO 'C'; 60 | ALTER DATABASE "regression" SET lc_time TO 'C'; 61 | ALTER DATABASE "regression" SET timezone_abbreviations TO 'Default'; 62 | ALTER DATABASE "regression" SET TimeZone TO 'PST8PDT'; 63 | }); 64 | 65 | # do not show transaction from concurrent backends in pg_prepared_xacts 66 | $cluster->{nodes}->[0]->safe_psql('regression', q{ 67 | ALTER VIEW pg_prepared_xacts RENAME TO _pg_prepared_xacts; 68 | CREATE VIEW pg_prepared_xacts AS 69 | select * from _pg_prepared_xacts where gid not like 'MTM-%' 70 | ORDER BY transaction::text::bigint; 71 | ALTER TABLE pg_publication RENAME TO _pg_publication; 72 | CREATE VIEW pg_catalog.pg_publication AS SELECT * FROM pg_catalog._pg_publication WHERE pubname<>'multimaster'; 73 | ALTER TABLE pg_subscription RENAME TO _pg_subscription; 
74 | CREATE VIEW pg_catalog.pg_subscription AS SELECT * FROM pg_catalog._pg_subscription WHERE subname NOT LIKE 'mtm_sub_%'; 75 | }); 76 | 77 | $cluster->{nodes}->[0]->safe_psql('regression', q{ 78 | ALTER SYSTEM SET allow_system_table_mods = 'off'; 79 | }); 80 | foreach my $node (@{$cluster->{nodes}}){ 81 | $node->restart; 82 | } 83 | $cluster->await_nodes( [0,1,2] ); 84 | 85 | # load schedule without tablespace test which is not expected 86 | # to work with several postgreses on a single node 87 | my $schedule = TestLib::slurp_file('../../src/test/regress/parallel_schedule'); 88 | $schedule =~ s/test: tablespace/#test: tablespace/g; 89 | $schedule =~ s/test: cfs/#test: cfs/g; 90 | $schedule =~ s/test: largeobject//; # serial schedule 91 | $schedule =~ s/largeobject//; # parallel schedule 92 | $schedule =~ s/atx0//; # parallel schedule 93 | unlink('parallel_schedule'); 94 | TestLib::append_to_file('parallel_schedule', $schedule); 95 | 96 | my $regress_shlib = $ENV{REGRESS_SHLIB}; 97 | my $regress_libdir = dirname($regress_shlib); 98 | my $regress_outdir = "$ENV{TESTDIR}/tmp_check/regress_outdir"; 99 | mkdir($regress_outdir); 100 | # REMOVEME: not needed in 14+, pg_regress fixed in upstream 101 | mkdir("${regress_outdir}/sql"); 102 | mkdir("${regress_outdir}/expected"); 103 | TestLib::system_log($ENV{'PG_REGRESS'}, 104 | '--host=' . $cluster->{nodes}->[0]->host, '--port=' . $cluster->{nodes}->[0]->port, 105 | '--use-existing', '--bindir=', 106 | '--schedule=parallel_schedule', 107 | "--dlpath=${regress_libdir}", 108 | '--inputdir=../../src/test/regress', 109 | "--outputdir=${regress_outdir}"); 110 | unlink('parallel_schedule'); 111 | 112 | # rename s/diffs/diff as some upper level testing systems are searching for all 113 | # *.diffs files. 
114 | rename "${regress_outdir}/regression.diffs", "${regress_outdir}/regression.diff" 115 | or die "cannot rename file: $!"; 116 | 117 | # strip absolute paths and dates out of resulted regression.diffs 118 | my $res_diff = TestLib::slurp_file("${regress_outdir}/regression.diff"); 119 | # In <= 11 default diff format was context, since 12 unified; handing lines 120 | # starting with ---|+++|*** covers both. 121 | # To make someone's life easier, we prepend .. to make relative paths correct. 122 | # (it allows goto file comparison in editors) 123 | # This of course unfortunately doesn't work for VPATH. 124 | $res_diff =~ s/(--- |\+\+\+ |\*\*\* ).+contrib\/mmts(.+\.out)\t.+\n/$1..$2\tCENSORED\n/g; 125 | # Since 12 header like 126 | # diff -U3 /blabla/contrib/mmts/../../src/test/regress/expected/opr_sanity.out /blabla/mmts/../../src/test/regress/results/opr_sanity.out 127 | # was added to each file diff 128 | $res_diff =~ s/(diff ).+contrib\/mmts(.+\.out).+contrib\/mmts(.+\.out\n)/$1..$2 ..$3/g; 129 | $res_diff =~ s/(lo_import[ \(]')\/[^']+\//$1\/CENSORED\//g; 130 | #SELECT lo_export(loid, '/home/alex/projects/ppro/postgrespro/contrib/mmts/../../src/test/regress/results/lotest.txt') FROM lotest_stash_values; 131 | $res_diff =~ s/(lo_export.*\'\/).+\//$1CENSORED\//g; 132 | mkdir("$ENV{TESTDIR}/results"); 133 | unlink("$ENV{TESTDIR}/results/regression.diff"); 134 | 135 | # finally compare regression.diffs with our version 136 | # Do not use diffs extension as some upper level testing systems are searching for all 137 | # *.diffs files. 
138 | TestLib::append_to_file("$ENV{TESTDIR}/results/regression.diff", $res_diff); 139 | # TODO: work with diffs on per-test basis 140 | my $expected_file; 141 | if (Cluster::is_ee()) 142 | { 143 | $expected_file = "expected/regression_ee.diff" 144 | } 145 | else 146 | { 147 | $expected_file = "expected/regression_vanilla.diff" 148 | } 149 | $diff = TestLib::system_log("diff -U3 ${expected_file} $ENV{TESTDIR}/results/regression.diff"); 150 | run [ "diff", "-U3", "${expected_file}", "$ENV{TESTDIR}/results/regression.diff" ], ">", "$ENV{TESTDIR}/regression.diff.diff"; 151 | my $res = $?; 152 | 153 | is($res, 0, "postgres regress"); 154 | 155 | done_testing(); 156 | -------------------------------------------------------------------------------- /t/002_regressmm.pl: -------------------------------------------------------------------------------- 1 | # run sql/multimaster.sql tests 2 | use Cluster; 3 | use Test::More tests => 1; 4 | 5 | # determenistic ports for expected files 6 | $PostgresNode::last_port_assigned = 55431; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(q{ 10 | multimaster.volkswagen_mode = off 11 | }); 12 | $cluster->start(); 13 | $cluster->create_mm('regression'); 14 | 15 | ############################################################################### 16 | # multimaster regression tests 17 | ############################################################################### 18 | 19 | my @tests = ('multimaster'); 20 | # run atx test only on ee 21 | if (Cluster::is_ee()) 22 | { 23 | push @tests, 'atx'; 24 | } 25 | 26 | my $ret = TestLib::system_log($ENV{'PG_REGRESS'}, 27 | '--host=' . $cluster->{nodes}->[0]->host, '--port=' . $cluster->{nodes}->[0]->port, 28 | '--use-existing', '--bindir=', @tests); 29 | if ($ret != 0) 30 | { 31 | print "### Got regression! 
\n", TestLib::slurp_file('regression.diffs'); 32 | } 33 | is($ret, 0, "multimaster regress"); 34 | -------------------------------------------------------------------------------- /t/003_basic_recovery.pl: -------------------------------------------------------------------------------- 1 | # Basic recovery: some inserts, get node down, some inserts, get node up, some 2 | # inserts. There is no failures with concurrent load, so an easy variant. 3 | 4 | use strict; 5 | use warnings; 6 | use Cluster; 7 | use TestLib; 8 | use Test::More tests => 4; 9 | 10 | my $cluster = new Cluster(3); 11 | $cluster->init(); 12 | $cluster->start(); 13 | $cluster->create_mm(); 14 | 15 | my $ret; 16 | my $psql_out; 17 | 18 | ############################################################################### 19 | # Replication check 20 | ############################################################################### 21 | 22 | $cluster->{nodes}->[0]->safe_psql('postgres', q{ 23 | create table if not exists t(k int primary key, v int); 24 | insert into t values(1, 10); 25 | }); 26 | $psql_out = $cluster->{nodes}->[2]->safe_psql('postgres', q{ 27 | select v from t where k=1; 28 | }); 29 | is($psql_out, '10', "Check replication while all nodes are up."); 30 | 31 | ############################################################################### 32 | # Isolation regress checks 33 | ############################################################################### 34 | 35 | # we can call pg_regress here 36 | 37 | ############################################################################### 38 | # Work after node stop 39 | ############################################################################### 40 | 41 | note("stopping node 2"); 42 | $cluster->{nodes}->[2]->stop; 43 | 44 | $cluster->await_nodes_after_stop( [0,1] ); 45 | 46 | $cluster->safe_psql(0, "insert into t values(2, 20);"); 47 | $cluster->safe_psql(1, "insert into t values(3, 30);"); 48 | $cluster->safe_psql(0, "insert into t values(4, 
40);"); 49 | $cluster->safe_psql(1, "insert into t values(5, 50);"); 50 | 51 | $psql_out = $cluster->safe_psql(0, "select v from t where k=4;"); 52 | is($psql_out, '40', "Check replication after node failure."); 53 | 54 | ############################################################################### 55 | # Work after node start 56 | ############################################################################### 57 | 58 | note("starting node 2"); 59 | $cluster->{nodes}->[2]->start; 60 | 61 | # intentionally start from 2 62 | $cluster->await_nodes( [2,0,1] ); 63 | 64 | $cluster->safe_psql(0, "insert into t values(6, 60);"); 65 | $cluster->safe_psql(1, "insert into t values(7, 70);"); 66 | $cluster->safe_psql(0, "insert into t values(8, 80);"); 67 | $cluster->safe_psql(1, "insert into t values(9, 90);"); 68 | 69 | $psql_out = $cluster->safe_psql(2, "select v from t where k=8;"); 70 | is($psql_out, '80', "Check replication after failed node recovery."); 71 | 72 | $psql_out = $cluster->safe_psql(2, "select v from t where k=5;"); 73 | is($psql_out, '50', "Check replication after failed node recovery."); 74 | 75 | $cluster->stop(); 76 | 77 | 1; 78 | -------------------------------------------------------------------------------- /t/004_recovery.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Cluster; 5 | use TestLib; 6 | use Test::More tests => 6; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(); 10 | $cluster->start(); 11 | $cluster->create_mm(); 12 | 13 | ######################################################## 14 | # Check data integrity before and after recovery of single node. 15 | # Easy variant: sequential pgbenches, recovery without concurrent load. 
16 | ######################################################## 17 | 18 | my $hash0; my $hash1; my $hash2; my $oldhash; 19 | my $hash_query = q{ 20 | select 21 | md5('(' || string_agg(aid::text || ', ' || abalance::text , '),(') || ')') 22 | from 23 | (select * from pgbench_accounts order by aid) t; 24 | }; 25 | 26 | $cluster->pgbench(1, ('-i', -s => '10') ); 27 | $cluster->pgbench(0, ('-n','-N', -T => '4') ); 28 | $cluster->pgbench(1, ('-n','-N', -T => '4') ); 29 | $cluster->pgbench(2, ('-n','-N', -T => '4') ); 30 | 31 | $cluster->{nodes}->[2]->stop('fast'); 32 | $cluster->await_nodes_after_stop( [0,1] ); 33 | 34 | $cluster->pgbench(0, ('-n','-N', -T => '4') ); 35 | $cluster->pgbench(1, ('-n','-N', -T => '4') ); 36 | 37 | $cluster->await_nodes( [0,1] ); # just in case we've faced random timeout before 38 | $hash0 = $cluster->safe_psql(0, $hash_query); 39 | $hash1 = $cluster->safe_psql(1, $hash_query); 40 | is($hash0, $hash1, "Check that hash is the same before recovery"); 41 | 42 | $cluster->{nodes}->[2]->start; 43 | $cluster->await_nodes( [2,0,1] ); 44 | 45 | $oldhash = $hash0; 46 | $hash0 = $cluster->safe_psql(0, $hash_query); 47 | $hash1 = $cluster->safe_psql(1, $hash_query); 48 | $hash2 = $cluster->safe_psql(2, $hash_query); 49 | 50 | note("$oldhash, $hash0, $hash1, $hash2"); 51 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2) and ($oldhash eq $hash0)) , 1, 52 | "Check that hash is the same after recovery"); 53 | 54 | ######################################################## 55 | # Check start after all nodes were disconnected 56 | ######################################################## 57 | 58 | $cluster->safe_psql(0, "create table if not exists t(k int primary key, v int);"); 59 | 60 | $cluster->safe_psql(0, "insert into t values(1, 10);"); 61 | $cluster->safe_psql(1, "insert into t values(2, 20);"); 62 | $cluster->safe_psql(2, "insert into t values(3, 30);"); 63 | 64 | my $sum0; my $sum1; my $sum2; 65 | 66 | $cluster->{nodes}->[1]->stop('fast'); 67 | 
$cluster->{nodes}->[2]->stop('fast'); 68 | 69 | $cluster->{nodes}->[1]->start; 70 | $cluster->{nodes}->[2]->start; 71 | 72 | $cluster->await_nodes( [1,2,0] ); 73 | 74 | $sum0 = $cluster->safe_psql(0, "select sum(v) from t;"); 75 | $sum1 = $cluster->safe_psql(1, "select sum(v) from t;"); 76 | $sum2 = $cluster->safe_psql(2, "select sum(v) from t;"); 77 | is( (($sum0 == 60) and ($sum1 == $sum0) and ($sum2 == $sum0)) , 1, 78 | "Check that nodes are working and sync"); 79 | 80 | ######################################################## 81 | # Check recovery during some load 82 | ######################################################## 83 | 84 | $cluster->pgbench(0, ('-i', -s => '10') ); 85 | $cluster->pgbench(0, ('-N', -T => '1') ); 86 | $cluster->pgbench(1, ('-N', -T => '1') ); 87 | $cluster->pgbench(2, ('-N', -T => '1') ); 88 | 89 | # kill node while neighbour is under load 90 | my $pgb_handle = $cluster->pgbench_async(1, ('-N', -T => '20', -c => '5') ); 91 | sleep(5); 92 | $cluster->{nodes}->[2]->stop('fast'); 93 | $cluster->pgbench_await($pgb_handle); 94 | 95 | # start node while neighbour is under load 96 | $pgb_handle = $cluster->pgbench_async(0, ('-N', -T => '20', -c => '5') ); 97 | sleep(5); 98 | $cluster->{nodes}->[2]->start; 99 | $cluster->pgbench_await($pgb_handle); 100 | 101 | # await recovery 102 | $cluster->await_nodes( [2,0,1] ); 103 | 104 | # check data identity 105 | $hash0 = $cluster->safe_psql(0, $hash_query); 106 | $hash1 = $cluster->safe_psql(1, $hash_query); 107 | $hash2 = $cluster->safe_psql(2, $hash_query); 108 | note("$hash0, $hash1, $hash2"); 109 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, "Check that hash is the same"); 110 | 111 | $sum0 = $cluster->safe_psql(0, "select sum(abalance) from pgbench_accounts;"); 112 | $sum1 = $cluster->safe_psql(1, "select sum(abalance) from pgbench_accounts;"); 113 | $sum2 = $cluster->safe_psql(2, "select sum(abalance) from pgbench_accounts;"); 114 | 115 | note("Sums: $sum0, $sum1, $sum2"); 116 | 
is($sum2, $sum0, "Check that sum_2 == sum_0"); 117 | is($sum2, $sum1, "Check that sum_2 == sum_1"); 118 | 119 | $sum0 = $cluster->safe_psql(0, "select count(*) from pg_prepared_xacts;"); 120 | $sum1 = $cluster->safe_psql(1, "select count(*) from pg_prepared_xacts;"); 121 | $sum2 = $cluster->safe_psql(2, "select count(*) from pg_prepared_xacts;"); 122 | 123 | note("Number of prepared tx: $sum0, $sum1, $sum2"); 124 | 125 | $cluster->stop; 126 | -------------------------------------------------------------------------------- /t/005_pgbench.pl: -------------------------------------------------------------------------------- 1 | # Kinda bank test: on each node multiple clients transfer money from one acc to 2 | # another, another bunch of clients make sure sum is constant always. 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use Cluster; 8 | use TestLib; 9 | use Test::More tests => 2; 10 | 11 | my $cluster = new Cluster(2); 12 | $cluster->init(); 13 | $cluster->start(); 14 | $cluster->create_mm(); 15 | 16 | $cluster->safe_psql(0, q{ 17 | create table t (k int primary key, v int); 18 | insert into t (select generate_series(0, 999), 0); 19 | create table reader_log (v int); 20 | }); 21 | 22 | my $clients = 5; 23 | my $seconds = 30; 24 | my @benches = (); 25 | foreach (0..$#{$cluster->{nodes}}) 26 | { 27 | push @benches, $cluster->pgbench_async($_, 28 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/reader.pgb')); 29 | push @benches, $cluster->pgbench_async($_, 30 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/writer.pgb', -R => 10)); 31 | } 32 | 33 | $cluster->pgbench_await($_) foreach @benches; 34 | 35 | my $out; 36 | 37 | $out = $cluster->safe_psql(0, 38 | "select count(*) from reader_log where v != 0"); 39 | is($out, 0, "there is nothing except zeros in reader_log"); 40 | 41 | $out = $cluster->safe_psql(0, 42 | "select count(*) from reader_log where v = 0"); 43 | isnt($out, 0, "reader_log is not empty"); 44 | 45 | $cluster->stop; 46 | 
-------------------------------------------------------------------------------- /t/006_pgbenchdl.pl: -------------------------------------------------------------------------------- 1 | # Like pgbench.pl, but the probability of deadlocks is much higher; check that 2 | # they get detected. 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use Cluster; 8 | use TestLib; 9 | use Test::More tests => 1; 10 | use Data::Dumper; 11 | 12 | use POSIX ":sys_wait_h"; 13 | 14 | my $cluster = new Cluster(3); 15 | $cluster->init(); 16 | $cluster->start(); 17 | $cluster->create_mm(); 18 | 19 | $cluster->safe_psql(0, q{ 20 | create table transactions (id SERIAL primary key, dt timestamp default now(), uid int, amount int); 21 | create index on transactions using btree(uid); 22 | create table users (uid int primary key, sum bigint); 23 | }); 24 | 25 | my $clients = 10; 26 | my $seconds = 90; 27 | my @benches = (); 28 | foreach (0..$#{$cluster->{nodes}}) 29 | { 30 | push @benches, $cluster->pgbench_async($_, 31 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/deadl.pgb')); 32 | } 33 | 34 | sub isalive { 35 | my $benches = $_[0]; 36 | my $any_alive = 0; 37 | waitpid(-1, WNOHANG); 38 | $any_alive = ($any_alive or (kill 0,$_->{'KIDS'}->[0]->{'PID'})) foreach @{$benches}; 39 | return $any_alive; 40 | } 41 | 42 | # ensure num of successfull xacts steadily goes up, i.e. deadlocks are detected 43 | # in time. 
44 | my $ptrans = 0; 45 | my $dead_count = 0; 46 | while (isalive(\@benches)) { 47 | my $trans = $cluster->safe_psql(0, 48 | "select count(*) from transactions"); 49 | if ($ptrans == 0) { 50 | $ptrans = $trans; 51 | } elsif ($ptrans == $trans) { 52 | $dead_count++; 53 | } else { 54 | $dead_count = 0; 55 | $ptrans = $trans; 56 | } 57 | if ($dead_count >=3) { 58 | last; 59 | } 60 | sleep 2; 61 | } 62 | 63 | ok($dead_count < 3, 'at least one xact was committed during 6 seconds'); 64 | $cluster->stop; 65 | -------------------------------------------------------------------------------- /t/007_add_stop_node.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Carp; 5 | use PostgresNode; 6 | use Cluster; 7 | use TestLib; 8 | use Test::More tests => 8; 9 | 10 | # Generally add node with concurrent load (and failures) is not supported 11 | # because of at least 12 | # 1) it is not clear why non-donor nodes should properly keep WAL for new node; 13 | # 2) if donor fails, it is not clear whether new node will obtain suitable 14 | # syncpoints to pull from non-donors; 15 | # 3) A problem with slot creation and receiver start deadlocking each other, 16 | # see PGPRO-3618. 17 | # 18 | # drop_node with concurrent load is not safe at least because once it is done we 19 | # can't determine origin node properly, so no its xacts would be replicated. 20 | # 21 | # An option is left for experiments/future work. 22 | my $concurrent_load = 0; 23 | 24 | my $cluster = new Cluster(3); 25 | $cluster->init(); 26 | $cluster->start(); 27 | 28 | # XXXX: delete all '-n' ? 
29 | 30 | ################################################################################ 31 | # manually setup nodes with sparse node_id's 32 | ################################################################################ 33 | 34 | foreach (0..$#{$cluster->{nodes}}) 35 | { 36 | my $node = $cluster->{nodes}->[$_]; 37 | $node->{dbname} = 'postgres'; 38 | } 39 | 40 | foreach (0..$#{$cluster->{nodes}}) 41 | { 42 | my $node = $cluster->{nodes}->[$_]; 43 | 44 | note($cluster->connstr($_)); 45 | 46 | $cluster->safe_psql($_, qq{ 47 | create extension multimaster; 48 | select mtm.state_create('{2, 4, 5}'); 49 | insert into mtm.cluster_nodes values 50 | (2, \$\$@{[ $cluster->connstr(0) ]}\$\$, '@{[ $_ == 0 ? 't' : 'f' ]}'), 51 | (4, \$\$@{[ $cluster->connstr(1) ]}\$\$, '@{[ $_ == 1 ? 't' : 'f' ]}'), 52 | (5, \$\$@{[ $cluster->connstr(2) ]}\$\$, '@{[ $_ == 2 ? 't' : 'f' ]}'); 53 | }); 54 | } 55 | 56 | $cluster->await_nodes( [0..$#{$cluster->{nodes}}] ); 57 | 58 | $cluster->pgbench(0, ('-i', '-n', -s => '10') ); 59 | $cluster->pgbench(0, ('-N', '-n', -t => '100') ); 60 | $cluster->pgbench(1, ('-N', '-n', -t => '100') ); # XXX: pgbench stucks here for quite a long time 61 | $cluster->pgbench(2, ('-N', '-n', -t => '100') ); 62 | 63 | ################################################################################ 64 | # auto recovery 65 | ################################################################################ 66 | 67 | $cluster->{nodes}->[2]->stop('fast'); 68 | $cluster->await_nodes_after_stop( [0,1] ); 69 | $cluster->pgbench(0, ('-N', '-n', -T => '1') ); 70 | $cluster->{nodes}->[2]->start; 71 | 72 | $cluster->await_nodes( [2,0,1] ); 73 | is($cluster->is_data_identic( (0,1,2) ), 1, "check auto recovery"); 74 | 75 | ################################################################################ 76 | # add basebackuped node 77 | ################################################################################ 78 | 79 | # add table with sequence to check sequences 
after n_nodes change 80 | $cluster->safe_psql(0, "create table test_seq(id serial primary key)"); 81 | $cluster->safe_psql(0, "insert into test_seq values(DEFAULT)"); 82 | $cluster->safe_psql(1, "insert into test_seq values(DEFAULT)"); 83 | $cluster->safe_psql(2, "insert into test_seq values(DEFAULT)"); 84 | 85 | my $pgb1; 86 | my $pgb2; 87 | if ($concurrent_load) 88 | { 89 | $pgb1= $cluster->pgbench_async(0, ('-N', '-n', -T => '3600', -c => '2') ); 90 | $pgb2= $cluster->pgbench_async(1, ('-N', '-n', -T => '3600', -c => '2') ); 91 | } 92 | 93 | my $new_node_off = $cluster->add_node(); 94 | $cluster->{nodes}->[$new_node_off]->{dbname} = 'postgres'; 95 | my $connstr = $cluster->connstr($new_node_off); 96 | my $new_node_id = $cluster->safe_psql(0, "SELECT mtm.add_node(\$\$$connstr\$\$)"); 97 | 98 | is($new_node_id, 1, "sparse id assignment"); 99 | is($new_node_off, 3, "sparse id assignment"); 100 | if ($concurrent_load) 101 | { 102 | $cluster->pgbench(0, ('-N', '-n', -t => '100') ); 103 | } 104 | # Ensure monitor creates slot for new node on donor. We don't use it for 105 | # basebackup anymore, but this is still a good idea (it would be even better to 106 | # wait for logical slot creation too). 107 | $cluster->poll_query_until(0, "select exists(select * from pg_replication_slots where slot_name = 'mtm_filter_slot_${new_node_id}');") 108 | or croak "timed out waiting for slot creation"; 109 | my $end_lsn = $cluster->backup_and_init(0, $new_node_off, $new_node_id); 110 | 111 | # Prevent recovery of new node further than the end point returned by 112 | # basebackup as streaming will be requested since it, so not doing this might 113 | # result in attempting to receive already existing data. This realistically 114 | # happens with syncpoint rows, leading to insertion conflict. 
115 | # 116 | # It would be much nicer to learn the correct (end of recovery) LSN at the new 117 | # node itself and not burden user with carrying it around, but there seems no 118 | # easy way to do that without core changes. 119 | $cluster->{nodes}->[$new_node_off]->append_conf( 120 | "postgresql.conf", qq( 121 | restore_command = 'false' 122 | recovery_target = 'immediate' 123 | recovery_target_action = 'promote' 124 | )); 125 | # create recovery.signal 126 | $cluster->{nodes}->[$new_node_off]->set_recovery_mode(); 127 | $cluster->{nodes}->[$new_node_off]->start; 128 | $cluster->await_nodes([3,0,1,2], 0); 129 | $cluster->safe_psql(0, "SELECT mtm.join_node('$new_node_id', '$end_lsn')"); 130 | note("join_node done"); 131 | 132 | if ($concurrent_load) 133 | { 134 | sleep(5); 135 | IPC::Run::kill_kill($pgb1); 136 | IPC::Run::kill_kill($pgb2); 137 | } 138 | 139 | $cluster->await_nodes( [3,0,1,2] ); 140 | $cluster->pgbench(0, ('-N', '-n', -t => '100') ); 141 | $cluster->pgbench(3, ('-N', '-n', -t => '100') ); 142 | 143 | is($cluster->is_data_identic( (0,1,2,3) ), 1, "add basebackuped node"); 144 | 145 | my $bb_keycount = $cluster->safe_psql(3, q{ 146 | select count(*) from mtm.config where key='basebackup' 147 | }); 148 | 149 | is($bb_keycount, 0, "basebackup key was deleted"); 150 | 151 | # check that sequences in proper state 152 | $cluster->safe_psql(0, "insert into test_seq values(DEFAULT)"); 153 | $cluster->safe_psql(1, "insert into test_seq values(DEFAULT)"); 154 | $cluster->safe_psql(2, "insert into test_seq values(DEFAULT)"); 155 | $cluster->safe_psql(3, "insert into test_seq values(DEFAULT)"); 156 | 157 | ################################################################################ 158 | # basic check of recovery after add node succeeded 159 | ################################################################################ 160 | 161 | $cluster->{nodes}->[0]->stop('fast'); 162 | $cluster->await_nodes_after_stop( [1,2,3] ); 163 | $cluster->pgbench(3, ('-N', 
'-n', -T => '1') ); 164 | $cluster->{nodes}->[0]->start; 165 | 166 | $cluster->await_nodes( [2,0,1] ); 167 | is($cluster->is_data_identic((0,1,2,3)), 1, "check recovery after add_node"); 168 | 169 | ################################################################################ 170 | # drop one of the initial nodes 171 | ################################################################################ 172 | 173 | $cluster->{nodes}->[0]->stop('fast'); 174 | $cluster->await_nodes_after_stop( [1,2,3] ); 175 | $cluster->safe_psql(1, "select mtm.drop_node(2)"); 176 | 177 | # check basic recovery after drop_node 178 | $cluster->{nodes}->[1]->stop('fast'); 179 | $cluster->await_nodes_after_stop( [2,3] ); 180 | $cluster->pgbench(3, ('-N', '-n', -T => '1') ); 181 | $cluster->pgbench(2, ('-N', '-n', -T => '1') ); 182 | $cluster->{nodes}->[1]->start; 183 | $cluster->await_nodes( [3,2,1] ); 184 | is($cluster->is_data_identic((1,2,3)), 1, "check recovery after drop_node"); 185 | 186 | 187 | # TODO: check that WALs are not kept for dropped node anymore 188 | 189 | ################################################################################ 190 | # XXX: check remove/add of same node 191 | ################################################################################ 192 | 193 | ################################################################################ 194 | # XXX: check self remove 195 | ################################################################################ 196 | -------------------------------------------------------------------------------- /t/008_bugfixes.pl: -------------------------------------------------------------------------------- 1 | use Carp; 2 | use POSIX; 3 | use strict; 4 | use Test::More; 5 | use TestLib; 6 | use Time::HiRes qw(usleep); 7 | use warnings; 8 | 9 | use PostgresNode; 10 | use Cluster; 11 | 12 | use Test::More tests => Cluster::is_ee() ? 
6 : 5; 13 | 14 | my $cluster = new Cluster(3); 15 | $cluster->init(); 16 | $cluster->start(); 17 | $cluster->create_mm(); 18 | 19 | my $hash0; my $hash1; my $hash2; my $hash_query; 20 | 21 | # run pathman test only on ee 22 | if (Cluster::is_ee()) 23 | { 24 | $cluster->safe_psql(0, q{ 25 | CREATE EXTENSION pg_pathman; 26 | CREATE SCHEMA test_update_node; 27 | SET pg_pathman.enable_partitionrouter = ON; 28 | 29 | CREATE TABLE test_update_node.test_range(val NUMERIC NOT NULL, comment TEXT); 30 | CREATE INDEX val_idx ON test_update_node.test_range (val); 31 | INSERT INTO test_update_node.test_range SELECT i, i FROM generate_series(1, 100) i; 32 | SELECT create_range_partitions('test_update_node.test_range', 'val', 1, 10); 33 | 34 | ALTER TABLE test_update_node.test_range DROP COLUMN comment CASCADE; 35 | 36 | UPDATE test_update_node.test_range SET val = 115 WHERE val = 55; 37 | }); 38 | 39 | $hash_query = q{ 40 | select 41 | md5('(' || string_agg(val::text, '),(') || ')') 42 | from 43 | (select * from test_update_node.test_range order by val) t; 44 | }; 45 | $hash0 = $cluster->safe_psql(0, $hash_query); 46 | $hash1 = $cluster->safe_psql(1, $hash_query); 47 | $hash2 = $cluster->safe_psql(2, $hash_query); 48 | note("$hash0, $hash1, $hash2"); 49 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, 50 | "Check that hash is the same after query"); 51 | } 52 | 53 | $cluster->safe_psql(0, q{ 54 | CREATE TABLE unique_tbl (i int UNIQUE DEFERRABLE, t text); 55 | INSERT INTO unique_tbl VALUES (0, 'one'); 56 | INSERT INTO unique_tbl VALUES (1, 'two'); 57 | INSERT INTO unique_tbl VALUES (2, 'tree'); 58 | INSERT INTO unique_tbl VALUES (3, 'four'); 59 | INSERT INTO unique_tbl VALUES (4, 'five'); 60 | }); 61 | $cluster->{nodes}->[1]->psql($cluster->{nodes}->[1]->{dbname}, q{ 62 | -- default is immediate so this should fail right away 63 | UPDATE unique_tbl SET i = 1 WHERE i = 0; 64 | }); 65 | $cluster->safe_psql(0, q{ 66 | UPDATE unique_tbl SET i = i+1; 67 | }); 68 | 69 | 
$hash_query = q{ 70 | select 71 | md5('(' || string_agg(i::text || ', ' || t::text , '),(') || ')') 72 | from 73 | (select * from unique_tbl order by i) t; 74 | }; 75 | $hash0 = $cluster->safe_psql(0, $hash_query); 76 | $hash1 = $cluster->safe_psql(1, $hash_query); 77 | $hash2 = $cluster->safe_psql(2, $hash_query); 78 | note("$hash0, $hash1, $hash2"); 79 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, 80 | "Check that hash is the same after query"); 81 | 82 | # ############################################################################## 83 | # 84 | # Check the PGPRO-3146 bug. Hard crash of backend causes restart of all postgres 85 | # processes. Multimaster node must be survived after the crash and included into 86 | # the multimaster after recovery. 87 | # 88 | # ############################################################################## 89 | 90 | # Set GUC restart_after_crash in 'on' value 91 | $cluster->stop(); 92 | foreach (0..$#{$cluster->{nodes}}) 93 | { 94 | $cluster->{nodes}->[$_]->append_conf('postgresql.conf', q{restart_after_crash = on}); 95 | } 96 | $cluster->start(); 97 | $cluster->await_nodes( [0,1,2] ); 98 | 99 | # Simulate payload 100 | $cluster->pgbench(0, ('-i', '-n', -s => '1') ); 101 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": starting async pgbench" ); 102 | my $pgb1 = $cluster->pgbench_async(0, ('-n', -T => '25', -j => '1', -c => '5') ); 103 | 104 | my $pid0; 105 | my $attempts = 0; 106 | 107 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": starting polling of backend pid" ); 108 | while (1) 109 | { 110 | $pid0 = $cluster->safe_psql(0, "SELECT pid FROM pg_stat_activity 111 | WHERE backend_type LIKE 'client backend' 112 | AND query LIKE 'UPDATE%' LIMIT 1;"); 113 | 114 | # bf says we might be really unlucky to find no backend doing update 115 | # It does not make much sense to try longer than pgbench run lasts, 116 | # since we need an active backend to kill. 
So let it be 25 seconds 117 | # both for pgbench_async() and this pg_stat_activity polling. 118 | if ( ($pid0 ne "") || $attempts >= 25*10 ) 119 | { 120 | last; 121 | } 122 | 123 | # Wait 0.1 second before retrying. 124 | usleep(100_000); 125 | $attempts++; 126 | } 127 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": finished polling of backend pid" ); 128 | is( ($pid0 ne ""), 1, 129 | "found an active backend doing UPDATE" ); 130 | 131 | # Simulate hard crash 132 | note("Simulate hard crash of a backend by SIGKILL to $pid0"); 133 | kill -9, $pid0; 134 | 135 | $cluster->pgbench_await($pgb1); 136 | $cluster->await_nodes( [0,1,2] ); 137 | is($cluster->is_data_identic( (0,1,2) ), 1, "check consistency after crash"); 138 | 139 | 140 | # ############################################################################## 141 | # 142 | # [PGPRO-3047] Test ALTER DOMAIN .. CONSTRAINT .. NOT VALID 143 | # 144 | # ############################################################################## 145 | 146 | $hash0 = $cluster->safe_psql(0, " 147 | CREATE DOMAIN things AS INT; 148 | CREATE TABLE thethings (stuff things); 149 | INSERT INTO thethings (stuff) VALUES (55); 150 | ALTER DOMAIN things ADD CONSTRAINT meow CHECK (VALUE < 11) NOT VALID; 151 | UPDATE thethings SET stuff = 10; 152 | ALTER DOMAIN things VALIDATE CONSTRAINT meow; 153 | "); 154 | my $result0 = $cluster->safe_psql(0, "SELECT * FROM thethings"); 155 | my $result1 = $cluster->safe_psql(1, "SELECT * FROM thethings"); 156 | my $result2 = $cluster->safe_psql(2, "SELECT * FROM thethings"); 157 | note("Value in the stuff column of thethings table is $result0 at the node1 and match to corresponding values from another nodes: 2 - $result1 and 3 - $result2 "); 158 | is( (($result0 eq 10) and ($result0 eq $result1) and ($result1 eq $result2)), 1, 159 | "Check that update not aborted by violation of constraint on old tuple value"); 160 | 161 | # ############################################################################## 
162 | # 163 | # [PGPRO-3047] Check for problems with different OIDs on multimaster nodes 164 | # during logical replication of tuples contained attribute with domain over 165 | # arrays of composite. 166 | # 167 | # ############################################################################## 168 | 169 | # Check that OIDs are different. 170 | $result0 = $cluster->safe_psql(0, 171 | "select oid from pg_class where relname like 'thethings';"); 172 | $result1 = $cluster->safe_psql(1, 173 | "select oid from pg_class where relname like 'thethings';"); 174 | $result2 = $cluster->safe_psql(2, 175 | "select oid from pg_class where relname like 'thethings';"); 176 | note("OIDS of the thethings relation: node1 - $result0, node2 - $result1, node3 - $result2"); 177 | is( ( ($result0 ne $result1) and ($result0 ne $result2) and ($result1 ne $result2) ), 1, 178 | "Check that oid of the thethings relation are different on each node"); 179 | 180 | # Do the test. Insertion of array type must be passed successfully. 
181 | # Source: regression test domain.sql 182 | $cluster->safe_psql(0, " 183 | CREATE TYPE comptype AS (r float8, i float8); 184 | CREATE domain dcomptypea AS comptype[]; 185 | CREATE table dcomptable (d1 dcomptypea UNIQUE); 186 | INSERT INTO dcomptable VALUES (array[row(1,2)]::dcomptypea); 187 | "); 188 | 189 | $cluster->stop(); 190 | 191 | done_testing(); 192 | -------------------------------------------------------------------------------- /t/009_identity_func.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use PostgresNode; 4 | use Cluster; 5 | use TestLib; 6 | use Test::More tests => 29; 7 | 8 | my $cluster = new Cluster(3); 9 | $cluster->init(); 10 | $cluster->start(); 11 | $cluster->create_mm(undef); 12 | 13 | my $dbname = $cluster->{nodes}->[0]->{dbname}; 14 | my $nodes = $cluster->{nodes}; 15 | my $output; 16 | my $err_out; 17 | 18 | # ############################################################################## 19 | # 20 | # Incorrect query 21 | # 22 | # ############################################################################## 23 | my $invalid_expr_pattern = 24 | ".*failed to run query on node[0-9]+, snapshot .*: " 25 | . 
"ERROR: relation \"t1\" does not exist\n"; 26 | 27 | # test node 1 28 | $nodes->[0]->psql($dbname, 29 | "SELECT mtm.check_query('SELECT * FROM t1')", 30 | stdout => \$output, stderr => \$err_out); 31 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error"); 32 | like($err_out, qr{$invalid_expr_pattern}, "node1: check error output correctness"); 33 | 34 | # test node 2 35 | $nodes->[1]->psql($dbname, 36 | "SELECT mtm.check_query('SELECT * FROM t1')", 37 | stdout => \$output, stderr => \$err_out); 38 | is ( (($output eq '') and ($err_out ne '')), 1, "node2: check zero out on error"); 39 | like($err_out, qr{$invalid_expr_pattern}, "node2: check error output correctness"); 40 | 41 | # test node 3 42 | $nodes->[2]->psql($dbname, 43 | "SELECT mtm.check_query('SELECT * FROM t1')", 44 | stdout => \$output, stderr => \$err_out); 45 | is ( (($output eq '') and ($err_out ne '')), 1, "node3: check zero out on error"); 46 | like($err_out, qr{$invalid_expr_pattern}, "node3: check error output correctness"); 47 | 48 | # Substep: check no problems without one node 49 | $nodes->[2]->stop(); 50 | $cluster->await_nodes_after_stop( [0,1] ); 51 | $nodes->[0]->psql($dbname, 52 | "SELECT mtm.check_query('SELECT * FROM t1')", 53 | stdout => \$output, stderr => \$err_out); 54 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error"); 55 | like($err_out, qr{$invalid_expr_pattern}, "node1: check error output correctness"); 56 | 57 | $nodes->[1]->psql($dbname, 58 | "SELECT mtm.check_query('SELECT * FROM t1')", 59 | stdout => \$output, stderr => \$err_out); 60 | is ( (($output eq '') and ($err_out ne '')), 1, "node2: check zero out on error"); 61 | like($err_out, qr{$invalid_expr_pattern}, "node2: check error output correctness"); 62 | 63 | # Substep: node1 will be isolated 64 | my $isolation_pattern = ".*node is not online\: current status .*"; 65 | $nodes->[1]->stop(); 66 | $nodes->[0]->psql($dbname, 67 | "SELECT mtm.check_query('SELECT * 
FROM t1')", 68 | stdout => \$output, stderr => \$err_out); 69 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error"); 70 | like($err_out, qr{$isolation_pattern}, "Check access to isolated node"); 71 | 72 | $nodes->[1]->start(); 73 | $nodes->[2]->start(); 74 | $cluster->await_nodes( [2,0,1] ); 75 | 76 | # ############################################################################## 77 | # 78 | # Interface functions protection. 79 | # 80 | # ############################################################################## 81 | my $protection_pattern = "this function should only be called by mtm.check_query()"; 82 | $nodes->[0]->psql($dbname, 83 | "SELECT mtm.hold_backends();", 84 | stdout => \$output, stderr => \$err_out); 85 | is ( (($output eq '') and ($err_out ne '')), 1, "hold_all() protection"); 86 | like($err_out, qr{$protection_pattern}, "Check error output"); 87 | 88 | $nodes->[0]->psql($dbname, 89 | "SELECT mtm.release_backends();", 90 | stdout => \$output, stderr => \$err_out); 91 | is ( (($output eq '') and ($err_out ne '')), 1, "release_all() protection"); 92 | like($err_out, qr{$protection_pattern}, "Check error output"); 93 | 94 | $cluster->safe_psql(0, "CREATE TABLE t1 (a int PRIMARY KEY, b text);"); 95 | $nodes->[0]->psql($dbname, 96 | "SELECT mtm.check_query('SELECT * FROM t1')", 97 | stdout => \$output); 98 | is( (($output eq 't')) , 1, "Check tables equivalence with no tuples"); 99 | 100 | # Check consistency in the case of two nodes 101 | $nodes->[1]->stop(); 102 | $cluster->await_nodes_after_stop( [0,2] ); 103 | $nodes->[0]->psql($dbname, 104 | "SELECT mtm.check_query('SELECT * FROM t1')", 105 | stdout => \$output); 106 | is( (($output eq 't')) , 1, "Check tables equivalence with one off node"); 107 | 108 | $cluster->safe_psql(0, "INSERT INTO t1 (a, b) VALUES (1, NULL);"); 109 | $nodes->[0]->psql($dbname, 110 | "SELECT mtm.check_query('SELECT * FROM t1')", 111 | stdout => \$output); 112 | 113 | is( (($output eq 't')) , 
1, "Check primitive table"); 114 | $nodes->[1]->start(); 115 | $cluster->await_nodes( [2,0,1] ); 116 | 117 | $cluster->safe_psql(0, 118 | "INSERT INTO t1 (a,b) (SELECT *, 'test' FROM generate_series(2,100) AS x1); 119 | "); 120 | $nodes->[0]->psql($dbname, 121 | "SELECT mtm.check_query('SELECT * FROM t1 ORDER BY a')", 122 | stdout => \$output); 123 | is( (($output eq 't')) , 1, "Check big table"); 124 | $nodes->[0]->psql($dbname, 125 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,'''')) 126 | FROM (SELECT * FROM t1 ORDER BY a) AS x1');", 127 | stdout => \$output); 128 | is( (($output eq 't')) , 1, "Another approach to check big table"); 129 | 130 | $nodes->[0]->psql($dbname, 131 | "SELECT mtm.check_query('SELECT mtm.status();');", 132 | stdout => \$output); 133 | note("Check result: $output"); 134 | is( (($output eq 'f')) , 1, "Unsuccessful check"); 135 | 136 | $nodes->[2]->stop(); 137 | $cluster->await_nodes_after_stop( [0,1] ); 138 | $nodes->[0]->psql($dbname, 139 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,'''')) 140 | FROM (SELECT * FROM t1 ORDER BY a) AS x1');", 141 | stdout => \$output); 142 | is( (($output eq 't')) , 1, "Check tables identity after one node was down"); 143 | 144 | $nodes->[2]->start(); 145 | $cluster->await_nodes( [2,0,1] ); 146 | $nodes->[0]->psql($dbname, 147 | "SELECT mtm.check_query('SELECT my_node_id FROM mtm.status();');", 148 | stdout => \$output); 149 | is( (($output eq 'f')) , 1, "Check warning message on mismatch"); 150 | 151 | $nodes->[2]->psql($dbname, 152 | "SELECT mtm.check_query('SELECT a,b FROM t1, mtm.status() AS ms WHERE a > ms.my_node_id');", 153 | stdout => \$output, stderr => \$err_out); 154 | note("Check result: $output"); 155 | is( (($output eq 'f')) , 1, "Check warning message on difference in rows number"); 156 | like($err_out, 157 | qr{.*query results mismatch\: 99 rows and 2 columns on node1\, 98 rows and 2 columns on node2}, 158 | "Check format of the error message"); 159 | 160 | 
$nodes->[2]->psql($dbname, 161 | "SELECT mtm.check_query('SELECT b FROM t1 WHERE a = 1');", 162 | stdout => \$output); 163 | note("Check result: $output"); 164 | is( (($output eq 't')) , 1, "Check equivalence of nulls"); 165 | 166 | $nodes->[0]->psql($dbname, 167 | "SELECT mtm.check_query('SELECT b FROM t1, mtm.status() AS ms WHERE a = ms.my_node_id');", 168 | stdout => \$output, stderr => \$err_out); 169 | note("Check result: $output"); 170 | is( (($output eq 'f')) , 1, "Check warning message on difference in null and not null values"); 171 | like($err_out, 172 | qr{.*mismatch in column \'b\' of row 0\: null on node1\, test on node2}, 173 | "Check format of the error message"); 174 | 175 | exit(0); 176 | 177 | # Full pgbench test 178 | $cluster->pgbench(0, ('-i', -s => '10') ); 179 | my $pgb0 = $cluster->pgbench_async(0, ('-N', -T => '30', -c => '5') ); 180 | my $pgb1 = $cluster->pgbench_async(1, ('-N', -T => '30', -c => '5') ); 181 | my $pgb2 = $cluster->pgbench_async(2, ('-N', -T => '30', -c => '5') ); 182 | 183 | $output='t'; 184 | for (my $i = 0; ($i < 3) and ($output eq 't'); $i++) 185 | { 186 | $nodes->[0]->psql($dbname, 187 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,'''')) 188 | FROM (SELECT * FROM pgbench_accounts ORDER BY aid) AS x1');", 189 | stdout => \$output); 190 | note("check iteration $i, result: $output"); 191 | is( (($output eq 't')) , 1, "Data on nodes are identic"); 192 | sleep(6); 193 | } 194 | 195 | $cluster->pgbench_await($pgb0); 196 | $cluster->pgbench_await($pgb1); 197 | $cluster->pgbench_await($pgb2); 198 | 199 | $cluster->stop(); 200 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .blockade 2 | .vagrant 3 | *.swp 4 | *.pyc 5 | -------------------------------------------------------------------------------- /tests/Pipfile: 
-------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | aiopg = "==1.0.0" 10 | aioprocessing = "==1.0.1" 11 | docker-compose = "==1.26.2" 12 | docker = "*" 13 | 14 | [requires] 15 | python_version = "3.7" 16 | -------------------------------------------------------------------------------- /tests/deadl.pgb: -------------------------------------------------------------------------------- 1 | \set fromuser random(1,64) 2 | \set touser random(1,64) 3 | \set amount random(1,10000) 4 | BEGIN; 5 | INSERT INTO transactions (uid,amount) VALUES (:fromuser, -:amount); 6 | INSERT INTO transactions (uid,amount) VALUES (:touser, :amount); 7 | INSERT INTO users (uid,sum) VALUES (:fromuser, -:amount) ON CONFLICT(uid) DO UPDATE SET sum=users.sum-:amount WHERE users.uid=:fromuser; 8 | INSERT INTO users (uid,sum) VALUES (:touser, :amount) ON CONFLICT(uid) DO UPDATE SET sum=users.sum+:amount WHERE users.uid=:touser; 9 | END; 10 | 11 | -------------------------------------------------------------------------------- /tests/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.2' 2 | 3 | services: 4 | 5 | node1: 6 | container_name: node1 7 | build: .. 8 | shm_size: '1024mb' 9 | privileged: true 10 | ulimits: 11 | core: 14294967296 12 | environment: 13 | POSTGRES_USER: 'pg' 14 | POSTGRES_DB: 'regression' 15 | NODE_ID: 1 16 | CONNSTRS: >- 17 | dbname=regression user=pg host=node1, 18 | dbname=regression user=pg host=node2, 19 | dbname=regression user=pg host=node3 20 | ports: 21 | - "15432:5432" 22 | networks: 23 | mtm_bridge: 24 | ipv4_address: 192.168.253.1 25 | 26 | node2: 27 | container_name: node2 28 | build: .. 
29 | shm_size: '1024mb' 30 | privileged: true 31 | ulimits: 32 | core: 14294967296 33 | environment: 34 | POSTGRES_USER: 'pg' 35 | POSTGRES_DB: 'regression' 36 | NODE_ID: 2 37 | CONNSTRS: >- 38 | dbname=regression user=pg host=node1, 39 | dbname=regression user=pg host=node2, 40 | dbname=regression user=pg host=node3 41 | ports: 42 | - "15433:5432" 43 | networks: 44 | mtm_bridge: 45 | ipv4_address: 192.168.253.2 46 | 47 | node3: 48 | container_name: node3 49 | build: .. 50 | shm_size: '1024mb' 51 | privileged: true 52 | ulimits: 53 | core: 14294967296 54 | environment: 55 | POSTGRES_USER: 'pg' 56 | POSTGRES_DB: 'regression' 57 | NODE_ID: 3 58 | CONNSTRS: >- 59 | dbname=regression user=pg host=node1, 60 | dbname=regression user=pg host=node2, 61 | dbname=regression user=pg host=node3 62 | ports: 63 | - "15434:5432" 64 | networks: 65 | mtm_bridge: 66 | ipv4_address: 192.168.253.3 67 | 68 | # toxi: 69 | # image: kelvich/toxiproxy 70 | # ports: 71 | # - "8474:8474" 72 | 73 | # toxi_seed: 74 | # image: kelvich/toxiproxy 75 | # depends_on: 76 | # - toxi 77 | # entrypoint: | 78 | # curl 79 | # -X POST 'http://toxi:8474/populate' 80 | # -H 'Content-Type: application/json; charset=utf-8' 81 | # -d 82 | # '[ 83 | # {"name": "rep12", "listen": "0.0.0.0:12000", "upstream": "node2:5432"}, 84 | # {"name": "arb12", "listen": "0.0.0.0:12001", "upstream": "node2:5433"}, 85 | # {"name": "rep13", "listen": "0.0.0.0:13000", "upstream": "node3:5432"}, 86 | # {"name": "arb13", "listen": "0.0.0.0:13001", "upstream": "node3:5433"}, 87 | 88 | # {"name": "rep21", "listen": "0.0.0.0:21000", "upstream": "node1:5432"}, 89 | # {"name": "arb21", "listen": "0.0.0.0:21001", "upstream": "node1:5433"}, 90 | # {"name": "rep23", "listen": "0.0.0.0:23000", "upstream": "node3:5432"}, 91 | # {"name": "arb23", "listen": "0.0.0.0:23001", "upstream": "node3:5433"}, 92 | 93 | # {"name": "rep31", "listen": "0.0.0.0:31000", "upstream": "node1:5432"}, 94 | # {"name": "arb31", "listen": "0.0.0.0:31001", 
"upstream": "node1:5433"}, 95 | # {"name": "rep32", "listen": "0.0.0.0:32000", "upstream": "node2:5432"}, 96 | # {"name": "arb32", "listen": "0.0.0.0:32001", "upstream": "node2:5433"} 97 | # ]' 98 | 99 | 100 | networks: 101 | mtm_bridge: 102 | driver: bridge 103 | ipam: 104 | config: 105 | - subnet: 192.168.253.0/24 106 | gateway: 192.168.253.254 107 | -------------------------------------------------------------------------------- /tests/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$1" = 'postgres' ]; then 4 | mkdir -p "$PGDATA" 5 | mkdir -p /pg/archive/ 6 | mkdir -p /pg/src/src/test/regress/testtablespace 7 | 8 | # look specifically for PG_VERSION, as it is expected in the DB dir 9 | if [ ! -s "$PGDATA/PG_VERSION" ]; then 10 | initdb --nosync 11 | 12 | { echo; echo "host all all 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf" 13 | { echo; echo "host replication all 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf" 14 | 15 | cat <<-EOF >> $PGDATA/postgresql.conf 16 | listen_addresses='*' 17 | log_line_prefix = '%m [%p] [[%a]]: ' 18 | archive_mode = on 19 | archive_command = 'cp %p /pg/archive/%f' 20 | 21 | fsync = on 22 | 23 | max_prepared_transactions = 100 24 | wal_level = logical 25 | max_worker_processes = 100 26 | max_replication_slots = 10 27 | max_wal_senders = 10 28 | log_statement = all 29 | log_connections = true 30 | log_lock_waits = true 31 | 32 | shared_preload_libraries = 'multimaster' 33 | multimaster.volkswagen_mode = off 34 | multimaster.max_workers = 30 35 | 36 | multimaster.connect_timeout = 10 37 | # Be careful; tests expect commits on live 38 | # nodes during others failures, and failure time is ~10s; 39 | # if we simulate network loss, failure won't be 40 | # detected until this timeout passes. 41 | # OTOH, setting it too low might lead to node 42 | # exclusions on weak machines during normal work. 
43 | # It was also noticed that if extensive logging is enabled 44 | # (older, at least pre #6392) journald might not be able 45 | # to swallow logs in time which also provoked exclusions 46 | # with 2s timeout 47 | multimaster.heartbeat_recv_timeout = 2000 48 | multimaster.heartbeat_send_timeout = 200 49 | # Heavily loaded receiver won't send progress until 50 | # walsender requires it which happens at 51 | # wal_sender_timeout / 2, so keep it relatively low 52 | # for syncpoint test. 53 | wal_sender_timeout = 60s 54 | wal_receiver_status_interval = 10s 55 | 56 | # extensive logging for tests 57 | multimaster.TxTrace_log_level = LOG 58 | multimaster.TxFinish_log_level = LOG 59 | 60 | multimaster.CoordinatorTrace_log_level = LOG 61 | 62 | multimaster.BgwPoolEventDebug_log_level = LOG 63 | 64 | multimaster.ReceiverStateDebug_log_level = LOG 65 | multimaster.ApplyMessage_log_level = LOG 66 | multimaster.ApplyTrace_log_level = LOG 67 | multimaster.ReceiverFeedback_log_level = LOG 68 | 69 | multimaster.StateDebug_log_level = LOG 70 | 71 | EOF 72 | 73 | if [ -n "$REFEREE_CONNSTR" ]; then 74 | echo "multimaster.referee_connstring = '$REFEREE_CONNSTR'" >> $PGDATA/postgresql.conf 75 | fi 76 | 77 | # internal start of server in order to allow set-up using psql-client 78 | # does not listen on TCP/IP and waits until start finishes 79 | pg_ctl -D "$PGDATA" \ 80 | -o "-c listen_addresses=''" \ 81 | -w start 82 | 83 | : ${POSTGRES_USER:=postgres} 84 | : ${POSTGRES_DB:=$POSTGRES_USER} 85 | export POSTGRES_USER POSTGRES_DB 86 | 87 | if [ "$POSTGRES_DB" != 'postgres' ]; then 88 | psql -U `whoami` postgres <<-EOSQL 89 | CREATE DATABASE "$POSTGRES_DB" ; 90 | EOSQL 91 | echo 92 | fi 93 | 94 | if [ "$POSTGRES_USER" = `whoami` ]; then 95 | op='ALTER' 96 | else 97 | op='CREATE' 98 | fi 99 | 100 | psql -U `whoami` postgres <<-EOSQL 101 | $op USER "$POSTGRES_USER" WITH SUPERUSER PASSWORD ''; 102 | EOSQL 103 | echo 104 | 105 | # psql -U `whoami` $POSTGRES_DB -c 'CREATE EXTENSION 
multimaster;'; 106 | # psql -U `whoami` $POSTGRES_DB -c "select mtm.init_node($NODE_ID, '{$CONNSTRS}');" 107 | 108 | pg_ctl -D "$PGDATA" -m fast -w stop 109 | fi 110 | fi 111 | 112 | "$@" 113 | -------------------------------------------------------------------------------- /tests/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postgrespro/mmts/baab9238f784d428481ecfa1294e3f9a3910b2d2/tests/lib/__init__.py -------------------------------------------------------------------------------- /tests/lib/failure_injector.py: -------------------------------------------------------------------------------- 1 | import docker 2 | import os 3 | 4 | class FailureInjector(object): 5 | 6 | def __init__(self, node=None): 7 | timeout = os.environ.get('DOCKER_CLIENT_TIMEOUT') 8 | if timeout is not None: 9 | timeout = int(timeout) 10 | self.docker_api = docker.from_env(timeout=timeout) 11 | 12 | def container_exec(self, node, command): 13 | docker_node = self.docker_api.containers.get(node) 14 | docker_node.exec_run(command, user='root') 15 | 16 | 17 | class NoFailure(FailureInjector): 18 | 19 | def start(self): 20 | return 21 | 22 | def stop(self): 23 | return 24 | 25 | 26 | class SingleNodePartition(FailureInjector): 27 | 28 | def __init__(self, node): 29 | self.node = node 30 | super().__init__() 31 | 32 | def start(self): 33 | self.container_exec(self.node, "iptables -A INPUT -j DROP") 34 | self.container_exec(self.node, "iptables -A OUTPUT -j DROP") 35 | 36 | def stop(self): 37 | self.container_exec(self.node, "iptables -D INPUT -j DROP") 38 | self.container_exec(self.node, "iptables -D OUTPUT -j DROP") 39 | 40 | class SingleNodePartitionReject(FailureInjector): 41 | 42 | def __init__(self, node): 43 | self.node = node 44 | super().__init__() 45 | 46 | def start(self): 47 | self.container_exec(self.node, "iptables -A INPUT -j REJECT") 48 | self.container_exec(self.node, "iptables -A OUTPUT -j 
REJECT") 49 | 50 | def stop(self): 51 | self.container_exec(self.node, "iptables -D INPUT -j REJECT") 52 | self.container_exec(self.node, "iptables -D OUTPUT -j REJECT") 53 | 54 | 55 | class EdgePartition(FailureInjector): 56 | 57 | def __init__(self, nodeA, nodeB): 58 | self.nodeA = nodeA 59 | self.nodeB = nodeB 60 | super().__init__() 61 | 62 | def __change(self, action): 63 | self.container_exec(self.nodeA, 64 | "iptables {} INPUT -s {} -j DROP".format( 65 | action, self.nodeB)) 66 | self.container_exec(self.nodeA, 67 | "iptables {} OUTPUT -d {} -j DROP".format( 68 | action, self.nodeB)) 69 | 70 | def start(self): 71 | self.__change('-A') 72 | 73 | def stop(self): 74 | self.__change('-D') 75 | 76 | 77 | class RestartNode(FailureInjector): 78 | 79 | def __init__(self, node): 80 | self.node = node 81 | super().__init__() 82 | 83 | # XXX: Is it really a good idea to call cli.stop inside method called start? 84 | def start(self): 85 | self.docker_api.containers.get(self.node).stop() 86 | 87 | def stop(self): 88 | self.docker_api.containers.get(self.node).start() 89 | 90 | 91 | class FreezeNode(FailureInjector): 92 | 93 | def __init__(self, node): 94 | self.node = node 95 | super().__init__() 96 | 97 | def start(self): 98 | self.docker_api.containers.get(self.node).pause() 99 | 100 | def stop(self): 101 | self.docker_api.containers.get(self.node).unpause() 102 | 103 | 104 | class CrashRecoverNode(FailureInjector): 105 | 106 | def __init__(self, node): 107 | self.node = node 108 | super().__init__() 109 | 110 | def start(self): 111 | self.docker_api.containers.get(self.node).kill() 112 | 113 | def stop(self): 114 | self.docker_api.containers.get(self.node).start() 115 | 116 | 117 | class SkewTime(FailureInjector): 118 | 119 | def __init__(self, node): 120 | self.node = node 121 | super().__init__() 122 | 123 | class StopNode(FailureInjector): 124 | 125 | def __init__(self, node): 126 | self.node = node 127 | super().__init__() 128 | 129 | # XXX: Is it really a good 
idea to call cli.stop inside method called start? 130 | def start(self): 131 | self.docker_api.containers.get(self.node).stop() 132 | 133 | def stop(self): 134 | return 135 | 136 | 137 | class StartNode(FailureInjector): 138 | 139 | def __init__(self, node): 140 | self.node = node 141 | super().__init__() 142 | 143 | # XXX: Is it really a good idea to call cli.stop inside method 144 | # called start? 145 | def start(self): 146 | return 147 | 148 | def stop(self): 149 | self.docker_api.containers.get(self.node).start() 150 | 151 | ONE_NODE_FAILURES = [SingleNodePartition, SingleNodePartitionReject, 152 | RestartNode, CrashRecoverNode, FreezeNode] 153 | TWO_NODE_FAILURES = [EdgePartition] 154 | -------------------------------------------------------------------------------- /tests/lib/log_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import sys 4 | import time 5 | 6 | # FWIW I've attempted to keep the cfg in json/yaml file but sank in 'could not 7 | # resolve UTCFormatter class' issue 8 | 9 | # timestamp in UTC+-00:00 aka GMT 10 | class UTCFormatter(logging.Formatter): 11 | converter = time.gmtime 12 | 13 | LOGGING = { 14 | "version": 1, 15 | "formatters": { 16 | "defaultFormatter": { 17 | "()": UTCFormatter, 18 | "format": "%(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s", 19 | "datefmt": "%Y-%m-%d %H:%M:%S" 20 | } 21 | }, 22 | "handlers": { 23 | "console": { 24 | "class": "logging.StreamHandler", 25 | "formatter": "defaultFormatter", 26 | "level": "DEBUG", 27 | "stream": "ext://sys.stderr" 28 | } 29 | }, 30 | "loggers": { 31 | "root": { 32 | "level": "DEBUG", 33 | "handlers": ["console"] 34 | }, 35 | "root.test_helper": { 36 | "level": "INFO" 37 | }, 38 | "root.bank_client": { 39 | "level": "INFO" 40 | } 41 | } 42 | } 43 | 44 | logging.config.dictConfig(LOGGING) 45 | 
-------------------------------------------------------------------------------- /tests/reader.pgb: -------------------------------------------------------------------------------- 1 | begin; 2 | insert into reader_log select sum(v) from t; 3 | commit; -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | aiopg==1.0.0 2 | aioprocessing==1.0.1 3 | attrs==20.1.0 4 | bcrypt==3.2.0 5 | cached-property==1.5.1 6 | certifi==2020.6.20 7 | cffi==1.14.2 8 | chardet==3.0.4 9 | cryptography==3.1 10 | distro==1.5.0 11 | docker==4.3.1 12 | docker-compose==1.26.2 13 | dockerpty==0.4.1 14 | docopt==0.6.2 15 | idna==2.10 16 | importlib-metadata==1.7.0 17 | jsonschema==3.2.0 18 | paramiko==2.7.1 19 | psycopg2-binary==2.8.5 20 | pycparser==2.20 21 | PyNaCl==1.4.0 22 | pyrsistent==0.16.0 23 | python-dotenv==0.14.0 24 | PyYAML==5.3.1 25 | requests==2.24.0 26 | six==1.15.0 27 | texttable==1.6.2 28 | urllib3==1.25.10 29 | websocket-client==0.57.0 30 | zipp==3.1.0 31 | -------------------------------------------------------------------------------- /tests/support/bumptime.c: -------------------------------------------------------------------------------- 1 | /* 2 | * His (Aphyr) Majesty Script Bumptime. 
3 | * 4 | * https://raw.githubusercontent.com/jepsen-io/jepsen/master/cockroachdb/resources/bumptime.c 5 | * 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | int 14 | main(int argc, char **argv) 15 | { 16 | if (argc < 2) 17 | { 18 | fprintf(stderr, "usage: %s , where delta is in ms\n", argv[0]); 19 | return 1; 20 | } 21 | 22 | /* Compute offset from argument */ 23 | int64_t delta = atof(argv[1]) * 1000; 24 | int64_t delta_us = delta % 1000000; 25 | int64_t delta_s = (delta - delta_us) / 1000000; 26 | 27 | /* Get current time */ 28 | struct timeval time; 29 | struct timezone tz; 30 | 31 | if (0 != gettimeofday(&time, &tz)) 32 | { 33 | perror("gettimeofday"); 34 | return 1; 35 | } 36 | 37 | /* Update time */ 38 | time.tv_usec += delta_us; 39 | time.tv_sec += delta_s; 40 | /* Overflow */ 41 | while (time.tv_usec <= 1000000) 42 | { 43 | time.tv_sec -= 1; 44 | time.tv_usec += 1000000; 45 | } 46 | while (1000000 <= time.tv_usec) 47 | { 48 | time.tv_sec += 1; 49 | time.tv_usec -= 1000000; 50 | } 51 | 52 | /* Set time */ 53 | if (0 != settimeofday(&time, &tz)) 54 | { 55 | perror("settimeofday"); 56 | return 2; 57 | } 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /tests/support/docker-regress.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd /pg/src/src/test/regress 4 | 5 | psql -U postgres regression <<-SQL 6 | ALTER DATABASE "postgres" SET lc_messages TO 'C'; 7 | ALTER DATABASE "postgres" SET lc_monetary TO 'C'; 8 | ALTER DATABASE "postgres" SET lc_numeric TO 'C'; 9 | ALTER DATABASE "postgres" SET lc_time TO 'C'; 10 | ALTER DATABASE "postgres" SET timezone_abbreviations TO 'Default'; 11 | SQL 12 | 13 | ./pg_regress --use-existing \ 14 | --schedule=serial_schedule \ 15 | --host=node1 \ 16 | --user=postgres 17 | 18 | STATUS=$? 
19 | 20 | if [ -f "regression.diffs" ] 21 | then 22 | cat regression.diffs 23 | fi 24 | 25 | exit $STATUS 26 | -------------------------------------------------------------------------------- /tests/support/two_nodes.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | 5 | node1: 6 | container_name: node1 7 | build: ../.. 8 | shm_size: '512mb' 9 | privileged: true 10 | ulimits: 11 | core: 14294967296 12 | environment: 13 | POSTGRES_USER: 'pg' 14 | POSTGRES_DB: 'regression' 15 | NODE_ID: 1 16 | CONNSTRS: >- 17 | dbname=regression user=pg host=node1, 18 | dbname=regression user=pg host=node2 19 | REFEREE_CONNSTR: 'dbname=regression user=pg host=referee' 20 | ports: 21 | - "15432:5432" 22 | networks: 23 | mtm_bridge: 24 | ipv4_address: 192.168.253.1 25 | 26 | node2: 27 | container_name: node2 28 | build: ../.. 29 | shm_size: '512mb' 30 | privileged: true 31 | ulimits: 32 | core: 14294967296 33 | environment: 34 | POSTGRES_USER: 'pg' 35 | POSTGRES_DB: 'regression' 36 | NODE_ID: 2 37 | CONNSTRS: >- 38 | dbname=regression user=pg host=node1, 39 | dbname=regression user=pg host=node2 40 | REFEREE_CONNSTR: 'dbname=regression user=pg host=referee' 41 | ports: 42 | - "15433:5432" 43 | networks: 44 | mtm_bridge: 45 | ipv4_address: 192.168.253.2 46 | 47 | referee: 48 | container_name: referee 49 | build: ../.. 
50 | shm_size: '512mb' 51 | privileged: true 52 | ulimits: 53 | core: 14294967296 54 | environment: 55 | POSTGRES_USER: 'pg' 56 | POSTGRES_DB: 'regression' 57 | NODE_ID: 1 58 | ports: 59 | - "15435:5432" 60 | networks: 61 | mtm_bridge: 62 | ipv4_address: 192.168.253.3 63 | 64 | networks: 65 | mtm_bridge: 66 | driver: bridge 67 | ipam: 68 | config: 69 | - subnet: 192.168.253.0/24 70 | gateway: 192.168.253.254 71 | -------------------------------------------------------------------------------- /tests/test_bkb.sage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sage 2 | import sys, os 3 | 4 | from sage.all import * 5 | from subprocess import Popen, PIPE, STDOUT 6 | import unittest 7 | 8 | def run_stdin(input): 9 | mydir = os.path.dirname(os.path.realpath(__file__)) 10 | binfile = mydir + "/../src/a.out" 11 | 12 | p = Popen(binfile, stdout=PIPE, stdin=PIPE, stderr=STDOUT) 13 | grep_stdout = p.communicate(input=input)[0] 14 | return grep_stdout.decode() 15 | 16 | def run_bkb(g): 17 | n = len(g) 18 | params = str(n) + "\n" 19 | for i in range(n): 20 | row = 0 21 | row |= 1 << i 22 | for j in range(n): 23 | if g.has_edge(i, j): 24 | row |= 1 << j 25 | params += str(row) + "\n" 26 | 27 | print(params) 28 | res = run_stdin(params).strip() 29 | res = [int(n) for n in res.split(' ')] 30 | return res 31 | 32 | 33 | class TestCliqueBKB(unittest.TestCase): 34 | 35 | def test_random_graphs(self): 36 | 37 | for _ in range(1000): 38 | while True: 39 | g = graphs.RandomGNM(60,1700) 40 | if g.is_connected(): 41 | break 42 | 43 | clique, clique_size = run_bkb(g) 44 | 45 | print(clique, clique_size, len(g.clique_maximum())) 46 | 47 | 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /tests/test_recovery_random.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # 
Based on Aphyr's test for CockroachDB. 5 | # 6 | # Randomized recovery test for multimaster. Currently it picks a random node, 7 | # crash-recovers it or drops/rejects packets to and from it under load and 8 | # checks that things are ok, i.e. the rest two continue working and after 9 | # eliminating the failure the victim successfully recovers, with no hanged 10 | # prepares and data being identic everywhere. Lather, rinse, repeat. 11 | 12 | import datetime 13 | import docker 14 | import os 15 | import random 16 | import socket 17 | import subprocess 18 | import time 19 | import unittest 20 | import warnings 21 | import logging 22 | 23 | from lib.bank_client import MtmClient 24 | from lib.failure_injector import * 25 | import lib.log_helper # configures loggers 26 | from lib.test_helper import * 27 | 28 | log = logging.getLogger('root') 29 | 30 | class RecoveryTest(MMTestCase, TestHelper): 31 | def test_normal_operations(self): 32 | log.info('### test_normal_operations ###') 33 | 34 | aggs_failure, aggs = self.performFailure(NoFailure()) 35 | 36 | self.assertCommits(aggs_failure) 37 | self.assertIsolation(aggs_failure) 38 | 39 | self.assertCommits(aggs) 40 | self.assertIsolation(aggs) 41 | 42 | # main random tests 43 | def test_random_disasters(self): 44 | log.info('### test_random_disasters ###') 45 | 46 | for i in range(1, 16): 47 | log.info(f'running round #{i} of test_random_disasters') 48 | node_number = random.choice(range(1, 4)) 49 | port = 15431 + node_number 50 | 51 | nodes_assert_commit_during_failure = [n for n in range(3) if n != 52 | node_number - 1] 53 | aggs_failure, aggs = self.performRandomFailure( 54 | f'node{node_number}', 55 | nodes_wait_for_commit=[n for n in range(3)], 56 | nodes_wait_for_online=[f"dbname=regression user=postgres host={self.host_ip} port={port}"], 57 | stop_load=True, 58 | nodes_assert_commit_during_failure= 59 | nodes_assert_commit_during_failure) 60 | 61 | for n in range(3): 62 | if n == node_number - 1: 63 | 
self.assertNoCommits([aggs_failure[n]]) 64 | else: 65 | self.assertCommits([aggs_failure[n]]) 66 | 67 | self.assertIsolation(aggs_failure) 68 | self.assertCommits(aggs) 69 | self.assertIsolation(aggs) 70 | self.assertDataSync() 71 | 72 | log.info(f'iteration #{i} is OK') 73 | 74 | # sausage topology test 75 | def test_edge_partition(self): 76 | log.info('### test_edge_partition ###') 77 | 78 | aggs_failure, aggs = self.performFailure( 79 | EdgePartition('node1', 'node3'), 80 | # clique selection picks up the min mask, so in 1-2-3 sausage 12 81 | # will be eventually the live nodes. However, there is a small risk 82 | # of 3 successfully voting for 23 before 1 understands what's going 83 | # on, in which case 1 is put into recovery which doesn't finish in 84 | # 10s of the test given that the load is not stopped. This actually 85 | # happened in CI. To avoid test failure, wait for both 1 and 3 to be 86 | # online. 87 | nodes_wait_for_online=[ 88 | f"dbname=regression user=postgres host={self.host_ip} port=15434", 89 | f"dbname=regression user=postgres host={self.host_ip} port=15432"], 90 | stop_load=True) 91 | 92 | self.assertTrue(('commit' in aggs_failure[0]['transfer']['finish']) or 93 | ('commit' in aggs_failure[2]['transfer']['finish'])) 94 | self.assertCommits(aggs_failure[1:2]) # second node 95 | self.assertIsolation(aggs_failure) 96 | 97 | self.assertCommits(aggs) 98 | self.assertIsolation(aggs) 99 | 100 | # can be used for manual running of some particular failure 101 | def _test_single_failure(self): 102 | log.info('### test_single_failure ###') 103 | 104 | failure = CrashRecoverNode('node3') 105 | aggs_failure, aggs = self.performFailure( 106 | failure, 107 | nodes_wait_for_online=["dbname=regression user=postgres host=127.0.0.1 port=15434"], 108 | stop_load=True) 109 | 110 | self.assertCommits(aggs_failure[:2]) 111 | self.assertNoCommits(aggs_failure[2:]) 112 | self.assertIsolation(aggs_failure) 113 | 114 | self.assertCommits(aggs) 115 | 
self.assertIsolation(aggs) 116 | 117 | 118 | # you can run single test with something like 119 | # python -u -m unittest test_recovery.RecoveryTest.test_single_failure 120 | if __name__ == '__main__': 121 | # run all tests 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import time 4 | 5 | from lib.bank_client import MtmClient 6 | from lib.test_helper import * 7 | 8 | class RecoveryTest(unittest.TestCase, TestHelper): 9 | 10 | @classmethod 11 | def setUpClass(cls): 12 | cls.dsns = [ 13 | "dbname=regression user=postgres host=127.0.0.1 port=15432", 14 | "dbname=regression user=postgres host=127.0.0.1 port=15433", 15 | "dbname=regression user=postgres host=127.0.0.1 port=15434" 16 | ] 17 | 18 | print('setUp') 19 | subprocess.check_call(['docker-compose','up', 20 | '--force-recreate', 21 | '--build', 22 | '-d']) 23 | 24 | # Wait for all nodes to become online 25 | [ cls.awaitOnline(dsn) for dsn in cls.dsns ] 26 | 27 | cls.client = MtmClient(cls.dsns, n_accounts=1000) 28 | 29 | @classmethod 30 | def tearDownClass(cls): 31 | print('tearDown') 32 | # subprocess.check_call(['docker-compose','down']) 33 | 34 | def test_regression(self): 35 | # XXX: make smth clever here 36 | time.sleep(10) 37 | subprocess.check_call(['docker', 'exec', 38 | 'node1', 39 | '/pg/mmts/tests/support/docker-regress.sh', 40 | ]) 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /tests/test_syncpoint.py: -------------------------------------------------------------------------------- 1 | # 2 | # Basic syncpoint sanity check: ensure in normal mode (all nodes are up and 3 | # running) old wal files are erased once they are not needed anymore. 
4 | # On the other hand we must ensure that if a node of the cluster is out of 5 | # order the older wal files needed for the node recovery are NOT erased. 6 | # 7 | 8 | import unittest 9 | import time 10 | import subprocess 11 | import datetime 12 | import docker 13 | import warnings 14 | import pprint 15 | import logging 16 | 17 | import lib.log_helper # configures loggers 18 | from lib.bank_client import MtmClient 19 | from lib.failure_injector import * 20 | from lib.test_helper import * 21 | 22 | log = logging.getLogger('root') 23 | 24 | class SyncpointTest(MMTestCase, TestHelper): 25 | # Returns the newest wal 26 | def _get_last_wal(self, dsn): 27 | return self.nodeSelect(dsn, "SELECT name FROM pg_ls_waldir() WHERE " 28 | "name ~ '^[0-9A-F]+$' ORDER BY " 29 | "name DESC LIMIT 1")[0][0] 30 | 31 | def _get_last_wals(self, dsns): 32 | return [self._get_last_wal(dsn) for dsn in dsns] 33 | 34 | # Returns the oldest existing wal 35 | def _get_first_wal(self, dsn): 36 | # recycle old segments 37 | self.nodeExecute(dsn, ["CHECKPOINT"]) 38 | return self.nodeSelect(dsn, "SELECT name FROM pg_ls_waldir() WHERE " 39 | "name ~ '^[0-9A-F]+$' ORDER BY " 40 | "name LIMIT 1")[0][0] 41 | 42 | def _get_first_wals(self, dsns): 43 | return [self._get_first_wal(dsn) for dsn in dsns] 44 | 45 | # get restart_lsn segment of slot to the recipient node id. 
46 | def _get_slot_wal(self, dsn, recipient): 47 | return self.nodeSelect(dsn, """ 48 | SELECT pg_walfile_name(restart_lsn) 49 | FROM pg_replication_slots WHERE slot_name = 'mtm_slot_{}' 50 | """.format(recipient))[0][0] 51 | 52 | def _get_slot_wals(self, dsns, recipient): 53 | return [self._get_slot_wal(dsn, recipient) for dsn in dsns] 54 | 55 | # Waits (up to iterations * iteration_sleep seconds) 56 | # until at least wals_to_pass segments appear on each node 57 | def _wait_wal(self, dsns, wals_to_pass=5, 58 | iteration_sleep=20, 59 | iterations=1000): 60 | last_wals_initial = self._get_last_wals(dsns) 61 | log.debug("waiting for wal, last_wals_initial={}, first_wals={}" 62 | .format(last_wals_initial, self._get_first_wals(dsns))) 63 | for j in range(iterations): 64 | time.sleep(iteration_sleep) 65 | last_wals = self._get_last_wals(dsns) 66 | log.debug("waiting for wal, last_wals={}, first_wals={}" 67 | .format(last_wals, self._get_first_wals(dsns))) 68 | # xxx: this is only correct for first 4GB of WAL due to the hole in 69 | # WAL file naming 70 | if all(int(lw, 16) - int(lw_i, 16) >= wals_to_pass 71 | for (lw_i, lw) in zip(last_wals_initial, last_wals)): 72 | return 73 | 74 | raise AssertionError('timed out while waiting for wal') 75 | 76 | def _chk_rec_trim(self, dsn, other_dsns, iteration_sleep=2, 77 | iterations=1000): 78 | log.info('checking if wals were trimmed during recovery') 79 | dsns = other_dsns + [dsn] 80 | first_wals_before = self._get_first_wals(dsns) 81 | first_wals = [] 82 | wals_trimmed = False 83 | status = '' 84 | for j in range(iterations): 85 | time.sleep(iteration_sleep) 86 | last_wals = self._get_last_wals(dsns) 87 | first_wals = self._get_first_wals(dsns) 88 | status = self.nodeSelect(dsn, 89 | 'SELECT status from mtm.status()')[0][0] 90 | log.debug("status: %s" % status) 91 | log.debug('first wals - %s, ' % first_wals) 92 | log.debug('last wals - %s' % last_wals) 93 | if status == 'online': 94 | break 95 | wals_trimmed = wals_trimmed 
or all(b= a for (b, a) in zip(slot_wals_before, first_wals_after)): 147 | raise AssertionError('segments on some nodes were trimmed in degraded mode: before={}, after={}'.format(slot_wals_before, first_wals_after)) 148 | 149 | # re-run client in weak mode to allow node to recover 150 | # (but don't stop it completely to make test harder) 151 | self.client.stop() 152 | numworkers = { 153 | 'transfer': 1, 154 | 'sumtotal': 1, 155 | 'inserter': 1 156 | } 157 | self.client.bgrun(numworkers=numworkers) 158 | log.info('getting node 3 up') 159 | failure.stop() 160 | # This allows to connect to MM node during recovery 161 | recovery_dsn = self.dsns[2]+' application_name=mtm_admin' 162 | # Wait for node becomes accessible (in recovery mode) 163 | self.awaitOnline(recovery_dsn) 164 | self._chk_rec_trim(recovery_dsn, self.dsns[:2]) 165 | self.awaitOnline(self.dsns[2]) 166 | # Now stop client 167 | self.client.stop() 168 | 169 | 170 | if __name__ == '__main__': 171 | unittest.main() 172 | -------------------------------------------------------------------------------- /tests/writer.pgb: -------------------------------------------------------------------------------- 1 | \set src random(0, 999) 2 | \set dst random(0, 999) 3 | \set amount random(1, 10) 4 | begin; 5 | update t set v = v - :amount where k=:src; 6 | update t set v = v + :amount where k=:dst; 7 | commit; -------------------------------------------------------------------------------- /tests_testgres/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | venv 3 | __pycache__/ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /tests_testgres/connect.jsh: -------------------------------------------------------------------------------- 1 | 2 | /env --class-path /usr/share/java/postgresql-jdbc/postgresql-jdbc4.jar 3 | 4 | import java.sql.*; 5 | Class.forName("org.postgresql.Driver"); 6 | 7 | int port1 = 12928; 8 | int port2 = 
16682; 9 | int port3 = 18521; 10 | String connstring = String.format("jdbc:postgresql://localhost:%d,localhost:%d,localhost:%d/postgres", port1, port2, port3); 11 | 12 | /* connect to DB */ 13 | Connection con = DriverManager.getConnection(connstring); 14 | 15 | /* show help */ 16 | System.out.println("Use 'con' object!"); 17 | 18 | /* execute some commands */ 19 | System.out.println("Execute 'SELECT 1'"); 20 | Statement stmt = con.createStatement(); 21 | ResultSet rs = stmt.executeQuery("select 1"); 22 | rs.next(); 23 | String s = rs.getString(1); 24 | System.out.println("result = " + s); 25 | -------------------------------------------------------------------------------- /tests_testgres/ddl.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import time 4 | 5 | from mm_cluster import Cluster 6 | 7 | NUM_NODES = 3 8 | 9 | class TestDDL(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | cls.cluster = Cluster(NUM_NODES) 14 | cls.cluster.print_conninfo() 15 | cls.cluster.start().install().await_online((0,1,2)) 16 | # cls.cluster.print_conninfo() 17 | 18 | @classmethod 19 | def tearDownClass(cls): 20 | cls.cluster.stop() 21 | 22 | # Check that recovery properly processes 23 | def test_dll_recovery(self): 24 | # create table while one node is stopped 25 | self.cluster.nodes[2].stop() 26 | self.cluster.await_online((0,1)) 27 | self.cluster.nodes[0].safe_psql(query='create table t(id int primary key)') 28 | 29 | # now if second node didn't store logical message with DDL and third 30 | # node will recover from second then it will not receive this 31 | # 'create table' (PGPRO-1699) 32 | self.cluster.nodes[2].start() 33 | self.cluster.await_online((0,1,2)) 34 | self.cluster.nodes[2].safe_psql(query='insert into t values(42)') 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() -------------------------------------------------------------------------------- 
/tests_testgres/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | ulimit -c unlimited 4 | 5 | CURPATH=`pwd` 6 | BASEDIR=$CURPATH/../../.. 7 | export PATH=$BASEDIR/tmp_install/usr/local/pgsql/bin/:$PATH 8 | export DYLD_LIBRARY_PATH=$BASEDIR/tmp_install/usr/local/pgsql/lib/:$DYLD_LIBRARY_PATH 9 | export DESTDIR=$BASEDIR/tmp_install 10 | 11 | make -C $BASEDIR install 12 | make -C $BASEDIR/contrib/mmts install 13 | 14 | if [ -z "$VIRTUAL_ENV" ]; then 15 | >&2 echo WARNING: not in virtualenv 16 | fi 17 | 18 | # python3 -m unittest discover --pattern=*.py 19 | python3 ddl.py 20 | -------------------------------------------------------------------------------- /tests_testgres/test_failover.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from mm_cluster import Cluster 4 | 5 | 6 | with Cluster(3).start().install() as cluster: 7 | print("Cluster is working") 8 | 9 | node_id = 0 10 | for node in cluster.nodes: 11 | node_id += 1 12 | 13 | print("Node #{}".format(node_id)) 14 | print("\t-> port: {}".format(node.port)) 15 | print("\t-> arbiter port: {}".format(node.mm_port)) 16 | print("\t-> dir: {}".format(node.base_dir)) 17 | print() 18 | 19 | jshell = """ 20 | /env --class-path /usr/share/java/postgresql-jdbc/postgresql-jdbc4.jar 21 | 22 | import java.sql.*; 23 | Class.forName("org.postgresql.Driver"); 24 | 25 | int port1 = {}; 26 | int port2 = {}; 27 | int port3 = {}; 28 | String connstring = String.format("jdbc:postgresql://localhost:%d,localhost:%d,localhost:%d/postgres", port1, port2, port3); 29 | 30 | /* connect to DB */ 31 | Connection con = DriverManager.getConnection(connstring); 32 | 33 | /* show help */ 34 | System.out.println("Use 'con' object!"); 35 | 36 | /* execute some commands */ 37 | System.out.println("Execute 'SELECT 1'"); 38 | Statement stmt = con.createStatement(); 39 | ResultSet rs = stmt.executeQuery("select 
1"); 40 | rs.next(); 41 | String s = rs.getString(1); 42 | System.out.println("result = " + s); 43 | """.format(cluster.nodes[0].port, 44 | cluster.nodes[1].port, 45 | cluster.nodes[2].port) 46 | 47 | with open('connect.jsh', 'w') as f: 48 | f.write(jshell) 49 | print("Now run jshell with connect.jsh") 50 | print() 51 | 52 | print("Press ctrl+C to exit") 53 | 54 | while True: 55 | import time 56 | time.sleep(1) 57 | -------------------------------------------------------------------------------- /tests_testgres/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postgrespro/mmts/baab9238f784d428481ecfa1294e3f9a3910b2d2/tests_testgres/tests/__init__.py -------------------------------------------------------------------------------- /tests_testgres/tests/bootstrap.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from mm_cluster import Cluster 3 | 4 | 5 | class Bootstrap(unittest.TestCase): 6 | def test_bootstrap(self): 7 | with Cluster(3).start().install() as cluster: 8 | for node in cluster.nodes: 9 | status = 'select status from mtm.get_cluster_state()' 10 | 11 | self.assertTrue(node.status()) 12 | self.assertTrue(node.execute('postgres', 'select true')) 13 | self.assertTrue(node.execute('postgres', status)) 14 | -------------------------------------------------------------------------------- /tests_testgres/tests/truncate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import subprocess 3 | import time 4 | 5 | from mm_cluster import Cluster 6 | 7 | 8 | NUM_NODES = 2 9 | BENCH_SEC = 30 10 | 11 | 12 | class TestTruncate(unittest.TestCase): 13 | def test_truncate(self): 14 | with Cluster(NUM_NODES).start().install() as cluster: 15 | assert(NUM_NODES >= 2) 16 | 17 | for node in cluster.nodes: 18 | self.assertTrue(node.status()) 19 | 20 | node_1 = cluster.nodes[0] 21 | 
node_1.pgbench_init(dbname=cluster.dbname) 22 | 23 | pgbench = node_1.pgbench(dbname=cluster.dbname, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.STDOUT, 26 | options=['-T%i' % BENCH_SEC]) 27 | 28 | count = 0 29 | started = time.time() 30 | while time.time() - started < BENCH_SEC: 31 | for node in cluster.nodes: 32 | node.safe_psql(dbname=cluster.dbname, 33 | username=cluster.username, 34 | query='truncate pgbench_history;') 35 | 36 | node.safe_psql(dbname=cluster.dbname, 37 | username=cluster.username, 38 | query='vacuum full;') 39 | 40 | count += 1 41 | 42 | # check that pgbench has been running for at least 1 loop 43 | assert (count > 0 or pgbench.poll is not None) 44 | 45 | time.sleep(0.5) 46 | 47 | assert(count > 0) 48 | print("{}: executed truncate {} times" 49 | .format(self.test_truncate.__name__, count)) 50 | 51 | pgbench.wait() 52 | --------------------------------------------------------------------------------