├── .gitignore
├── .gitlab-ci.yml
├── Cluster.pm
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── doc
│   ├── multimaster.xml
│   ├── multimaster_book.xml
│   ├── readme.md
│   ├── specs
│   │   ├── .gitignore
│   │   ├── MtmGenerations.cfg
│   │   ├── MtmGenerations.tla
│   │   ├── MtmPrimitiveCurrent.cfg
│   │   ├── MtmPrimitiveCurrent.tla
│   │   ├── MtmPrimitiveCurrentMasks.cfg
│   │   ├── MtmPrimitiveCurrentMasks.tla
│   │   ├── MtmPrimitiveCurrentMasksFixed.cfg
│   │   ├── MtmPrimitiveCurrentMasksFixed.tla
│   │   ├── commit.cfg
│   │   ├── commit.md
│   │   ├── commit.tla
│   │   ├── generations2.md
│   │   └── mm_recovery.ipynb
│   ├── stylesheet.css
│   └── stylesheet.xsl
├── expected
│   ├── atx.out
│   ├── multimaster.out
│   ├── regression_ee.diff
│   └── regression_vanilla.diff
├── multimaster--1.0.sql
├── multimaster.control
├── referee
│   ├── Makefile
│   ├── expected
│   │   └── referee.out
│   ├── referee--1.0.sql
│   ├── referee.control
│   └── sql
│       └── referee.sql
├── run.pl
├── sql
│   ├── atx.sql
│   └── multimaster.sql
├── src
│   ├── bgwpool.c
│   ├── bkb.c
│   ├── bytebuf.c
│   ├── commit.c
│   ├── ddd.c
│   ├── ddl.c
│   ├── dmq.c
│   ├── global_tx.c
│   ├── include
│   │   ├── bgwpool.h
│   │   ├── bkb.h
│   │   ├── bytebuf.h
│   │   ├── commit.h
│   │   ├── compat.h
│   │   ├── ddd.h
│   │   ├── ddl.h
│   │   ├── dmq.h
│   │   ├── global_tx.h
│   │   ├── logger.h
│   │   ├── messaging.h
│   │   ├── mtm_utils.h
│   │   ├── multimaster.h
│   │   ├── pglogical_config.h
│   │   ├── pglogical_hooks.h
│   │   ├── pglogical_output.h
│   │   ├── pglogical_output
│   │   │   ├── compat.h
│   │   │   └── hooks.h
│   │   ├── pglogical_proto.h
│   │   ├── pglogical_relid_map.h
│   │   ├── receiver.h
│   │   ├── resolver.h
│   │   ├── spill.h
│   │   ├── state.h
│   │   └── syncpoint.h
│   ├── mtm_utils.c
│   ├── multimaster.c
│   ├── pglogical_apply.c
│   ├── pglogical_config.c
│   ├── pglogical_hooks.c
│   ├── pglogical_output.c
│   ├── pglogical_proto.c
│   ├── pglogical_receiver.c
│   ├── pglogical_relid_map.c
│   ├── resolver.c
│   ├── spill.c
│   ├── state.c
│   ├── syncpoint.c
│   └── test_bkb.sage.py
├── t
│   ├── 000_cross._pl
│   ├── 000_deadlock.pl
│   ├── 000_init._pl
│   ├── 001_regress.pl
│   ├── 002_regressmm.pl
│   ├── 003_basic_recovery.pl
│   ├── 004_recovery.pl
│   ├── 005_pgbench.pl
│   ├── 006_pgbenchdl.pl
│   ├── 007_add_stop_node.pl
│   ├── 008_bugfixes.pl
│   └── 009_identity_func.pl
├── tests
│   ├── .gitignore
│   ├── Pipfile
│   ├── Pipfile.lock
│   ├── deadl.pgb
│   ├── docker-compose.yml
│   ├── docker-entrypoint.sh
│   ├── lib
│   │   ├── __init__.py
│   │   ├── bank_client.py
│   │   ├── failure_injector.py
│   │   ├── log_helper.py
│   │   └── test_helper.py
│   ├── reader.pgb
│   ├── requirements.txt
│   ├── support
│   │   ├── bumptime.c
│   │   ├── docker-regress.sh
│   │   └── two_nodes.yml
│   ├── test_bkb.sage.py
│   ├── test_recovery_random.py
│   ├── test_referee.py
│   ├── test_regression.py
│   ├── test_syncpoint.py
│   └── writer.pgb
└── tests_testgres
    ├── .gitignore
    ├── connect.jsh
    ├── ddl.py
    ├── mm_cluster.py
    ├── run_tests.sh
    ├── test_failover.py
    └── tests
        ├── __init__.py
        ├── bootstrap.py
        └── truncate.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /log/
2 | /results/
3 | /tmp_check/
4 | regression.diff.diff
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | # Run python tests.
2 |
3 | image: pgpgpro/dev:stretch
4 |
5 | .only-default: &only-default
6 | only:
7 | refs:
8 | - merge_requests
9 | - tags
10 | - schedules
11 | - branches
12 | - pushes
13 | - web
14 | - triggers
15 | changes:
16 | - '**/*'
17 |
18 | # Tests are docker-based, and so is gitlab executor itself. We are using a bit
19 | # monstrous (and recommended) approach of running dind 'service' container
20 | # alongside main executor; it runs docker and exposes its socket:
21 | # https://docs.gitlab.com/ee/ci/docker/using_docker_build.html#use-the-docker-executor-with-the-docker-image-docker-in-docker
22 | # These variables tell the executor how to reach the socket.
23 | #
24 | # The 'docker' hostname is the alias of the service container as described at
25 | # https://docs.gitlab.com/ee/ci/docker/using_docker_images.html#accessing-the-services
26 | .docker_variables: &docker_variables
27 | DOCKER_HOST: tcp://docker:2375/
28 | # When using dind, it's wise to use the overlayfs driver for
29 | # improved performance.
30 | DOCKER_DRIVER: overlay2
31 | DOCKER_TLS_CERTDIR: ""
32 |
33 | stages:
34 | - build_core_image
35 | - make_check
36 | # hardcoded stuff in python tests doesn't allow to run them in parallel
37 | - recovery random
38 | - referee
39 | - syncpoint
40 |
41 | # builds image with ee core and saves it as an artifact
42 | build_core_image:
43 | <<: *only-default
44 | stage: build_core_image
45 | retry: 1
46 | image: pgpgpro/dev:alpine
47 | # run container providing docker alongside
48 | services:
49 | - docker:dind
50 | variables:
51 | <<: *docker_variables
52 | branch: ee13_mm
53 | artifacts:
54 | expire_in: 24 hours
55 | when: always
56 | paths:
57 | - docker-image/pgmm.tar.gz
58 | - postgrespro.tar.gz
59 | script:
60 | # Add mm_gitlab_ci_ed25519 env var of type 'file' with the key in gitlab
61 | - ssh-agent sh -c 'ssh-add ${mm_gitlab_ci_ed25519}; GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone --depth=1 --branch "${branch}" git@git.postgrespro.ru:pgpro-dev/postgrespro.git'
62 | - cd postgrespro
63 | - docker build -t pgmm .
64 | - cd ..
65 | - mkdir docker-image
66 | - docker save pgmm > docker-image/pgmm.tar
67 | - tar czf docker-image/pgmm.tar.gz docker-image/pgmm.tar
68 | - ls -lah docker-image/
69 | - rm docker-image/pgmm.tar
70 | # also save archived sources of core
71 | - tar -czf postgrespro.tar.gz postgrespro
72 |
73 | # make check. We build core from sources again which is a bit ugly as we already
74 | # built the image, but let's not wobble here with yet another docker
75 | make_check:
76 | <<: *only-default
77 | stage: make_check
78 | # gives us the archive with core sources
79 | dependencies:
80 | - build_core_image
81 | artifacts:
82 | when: always
83 | paths:
84 | - postgrespro/contrib/mmts/tmp_check/log
85 | - postgrespro/contrib/mmts/tmp_check/regress_outdir
86 | script:
87 | - ls
88 | - tar -xzf postgrespro.tar.gz
89 | - shopt -s extglob
90 | - rm -rf postgrespro/contrib/mmts; mkdir postgrespro/contrib/mmts
91 | - mv !(postgrespro) postgrespro/contrib/mmts
92 | - cd postgrespro
93 | - CFLAGS="-ggdb3 -O0" ./configure --enable-cassert --enable-debug --with-perl --enable-tap-tests
94 | - make -j8
95 | - cd contrib/mmts && make check
96 |
97 | recovery_random:
98 | <<: *only-default
99 | stage: recovery random
100 | image: pgpgpro/dev:alpine
101 | services:
102 | - docker:dind
103 | dependencies:
104 | - build_core_image
105 | artifacts:
106 | when: on_failure
107 | paths:
108 | - tests/logs1
109 | - tests/logs2
110 | - tests/logs3
111 | variables:
112 | <<: *docker_variables
113 | before_script:
114 | - docker info
115 | script:
116 | - tar -xzvf docker-image/pgmm.tar.gz
117 | - docker load -i docker-image/pgmm.tar
118 | - cd tests/
119 | - env CI=1 python3 -u test_recovery_random.py --failfast
120 |
121 | referee:
122 | extends: recovery_random
123 | stage: referee
124 | artifacts:
125 | paths:
126 | - tests/logs1
127 | - tests/logs2
128 | - tests/logs_referee
129 | script:
130 | - tar -xzvf docker-image/pgmm.tar.gz
131 | - docker load -i docker-image/pgmm.tar
132 | - cd tests/
133 | - env CI=1 python3 -u test_referee.py --failfast
134 |
135 | syncpoint:
136 | extends: recovery_random
137 | stage: syncpoint
138 | script:
139 | - tar -xzvf docker-image/pgmm.tar.gz
140 | - docker load -i docker-image/pgmm.tar
141 | - cd tests/
142 | - env CI=1 python3 -u test_syncpoint.py --failfast
143 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pgmm
2 |
3 | RUN mkdir /pg/mmts
4 | COPY ./ /pg/mmts/
5 |
6 | RUN export USE_PGXS=1 && \
7 | cd /pg/mmts && make clean && make install
8 |
9 | # pg_regress client assumes such dir exists on server
10 | RUN cp /pg/src/src/test/regress/*.so /pg/install/lib/postgresql/
11 | USER postgres
12 | ENV PGDATA /pg/data
13 | ENTRYPOINT ["/pg/mmts/tests/docker-entrypoint.sh"]
14 |
15 | EXPOSE 5432
16 | CMD ["postgres"]
17 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | EXTENSION = multimaster
2 | DATA = multimaster--1.0.sql
3 | OBJS = src/multimaster.o src/dmq.o src/commit.o src/bytebuf.o src/bgwpool.o \
4 | src/pglogical_output.o src/pglogical_proto.o src/pglogical_receiver.o \
5 | src/pglogical_apply.o src/pglogical_hooks.o src/pglogical_config.o \
6 | src/pglogical_relid_map.o src/ddd.o src/bkb.o src/spill.o src/state.o \
7 | src/resolver.o src/ddl.o src/syncpoint.o src/global_tx.o src/mtm_utils.o
8 | MODULE_big = multimaster
9 |
10 | ifndef USE_PGXS # hmm, user didn't request to use pgxs
11 | # relative path to this makefile
12 | mkfile_path := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
13 | # relative path to dir with this makefile
14 | mkfile_dir := $(dir $(mkfile_path))
15 | # abs path to dir with this makefile
16 | mkfile_abspath := $(shell cd $(mkfile_dir) && pwd -P)
17 | # parent dir name of directory with makefile
18 | parent_dir_name := $(shell basename $(shell dirname $(mkfile_abspath)))
19 | ifneq ($(parent_dir_name),contrib) # a-ha, but the extension is not inside 'contrib' dir
20 | USE_PGXS := 1 # so use it anyway, most probably that's what the user wants
21 | endif
22 | endif
23 | # $(info) is introduced in 3.81, and PG doesn't support makes older than 3.80
24 | # ifeq ($(MAKE_VERSION),3.80)
25 | # $(warning $$USE_PGXS is [${USE_PGXS}] (we use it automatically if not in contrib dir))
26 | # else
27 | # $(info $$USE_PGXS is [${USE_PGXS}] (we use it automatically if not in contrib dir))
28 | # endif
29 |
30 | ifdef USE_PGXS # use pgxs
31 | # You can specify path to pg_config in PG_CONFIG var
32 | ifndef PG_CONFIG
33 | PG_CONFIG := pg_config
34 | endif
35 | PG_CPPFLAGS += -I$(CURDIR)/src/include
36 | # add installation top include directory for libpq header
37 | # (seems like server/ dir is added by pgxs)
38 | PG_CPPFLAGS += -I$(shell $(PG_CONFIG) --includedir)
39 | SHLIB_LINK += -lpq # add libpq
40 | PGXS := $(shell $(PG_CONFIG) --pgxs)
41 | include $(PGXS)
42 |
43 | else # assume the extension is in contrib/ dir of pg distribution
44 | PG_CPPFLAGS += -I$(top_srcdir)/$(subdir)/src/include
45 | PG_CPPFLAGS += -I$(libpq_srcdir) # include libpq-fe, defined in Makefile.global.in
46 | SHLIB_LINK = $(libpq) # defined in Makefile.global.in
47 | subdir = contrib/mmts
48 | top_builddir = ../..
49 | include $(top_builddir)/src/Makefile.global
50 | # in ee, install pathman as well
51 | ifeq (${PGPRO_EDITION}, enterprise)
52 | EXTRA_INSTALL=contrib/pg_pathman
53 | endif
54 | include $(top_srcdir)/contrib/contrib-global.mk
55 | endif # USE_PGXS
56 |
57 | REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
58 | export REGRESS_SHLIB
59 |
60 | .PHONY: all
61 |
62 | # recurse down to referee/ on install.
63 | # (I'd use $(call recurse...), but how can we pass USE_PGXS there?
64 | referee-install:
65 | USE_PGXS=$(USE_PGXS) $(MAKE) -C referee install
66 | install: referee-install
67 |
68 | all: multimaster.so
69 |
70 | submake-regress:
71 | $(MAKE) -C $(top_builddir)/src/test/regress all
72 | $(MAKE) -C $(top_builddir)/src/test/regress tablespace-setup
73 |
74 | # all .pl tests should pass now, but let's see what the buildfarm says
75 | # ifndef MTM_ALL
76 | # PROVE_TESTS ?=
77 | # endif
78 | PROVE_FLAGS += --timer
79 | ifndef USE_PGXS
80 | check: temp-install submake-regress
81 | $(prove_check)
82 | else # pgxs build
83 | # Note that for PGXS build we override here bail-out recipe defined in pgxs.mk,
84 | # but well, why should we choose another name?
85 | # submake-regress won't work as we have no access to the source; we assume
86 | # regress is already installed
87 | # final spell is inspired by
88 | # https://www.2ndquadrant.com/en/blog/using-postgresql-tap-framework-extensions/
89 | # and Makefile.global.in which is obviously the original source
90 | check:
91 | rm -rf '$(CURDIR)'/tmp_check
92 | $(MKDIR_P) '$(CURDIR)'/tmp_check
93 | PGXS=$(PGXS) TESTDIR='$(CURDIR)' PATH="$(bindir):$$PATH" PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),t/*.pl)
94 | endif
95 |
96 | # PG_PROVE_FLAGS adds PostgresNode and friends include dir
97 | start: temp-install
98 | rm -rf '$(CURDIR)'/tmp_check
99 | $(MKDIR_P) '$(CURDIR)'/tmp_check
100 | cd $(srcdir) && TESTDIR='$(CURDIR)' \
101 | $(with_temp_install) \
102 | PG_REGRESS='$(CURDIR)/$(top_builddir)/src/test/regress/pg_regress' \
103 | perl $(PG_PROVE_FLAGS) run.pl --action=start $(RUN_OPTS)
104 |
105 | stop:
106 | cd $(srcdir) && TESTDIR='$(CURDIR)' \
107 | PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' \
108 | perl $(PG_PROVE_FLAGS) run.pl --action=stop $(RUN_OPTS)
109 |
110 | # for manual testing: runs core regress tests on 'make start'ed cluster
111 | run-pg-regress: submake-regress
112 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \
113 | $(with_temp_install) \
114 | PGPORT='65432' \
115 | PGHOST='127.0.0.1' \
116 | PGUSER='$(USER)' \
117 | ./pg_regress \
118 | --bindir='' \
119 | --use-existing \
120 | --schedule=$(abs_top_srcdir)/src/test/regress/parallel_schedule \
121 | --dlpath=$(CURDIR)/$(top_builddir)/src/test/regress \
122 | --inputdir=$(abs_top_srcdir)/src/test/regress
123 |
124 | # for manual testing: runs contrib/test_partition on 'make start'ed cluster
125 | run-pathman-regress:
126 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \
127 | $(with_temp_install) \
128 | PGPORT='65432' \
129 | PGHOST='127.0.0.1' \
130 | PGUSER='$(USER)' \
131 | ./pg_regress \
132 | --bindir='' \
133 | --use-existing \
134 | --temp-config=$(abs_top_srcdir)/contrib/test_partition/pg_pathman.add \
135 | --inputdir=$(abs_top_srcdir)/contrib/test_partition/ \
136 | partition
137 |
138 |
139 | # bgw-based partition spawning is not supported by mm, so I
140 | # commenting out body of set_spawn_using_bgw() sql function before
141 | # running that
142 | run-pathman-regress-ext:
143 | cd $(CURDIR)/$(top_builddir)/src/test/regress && \
144 | $(with_temp_install) \
145 | PGPORT='65432' \
146 | PGHOST='127.0.0.1' \
147 | PGUSER='$(USER)' \
148 | ./pg_regress \
149 | --bindir='' \
150 | --use-existing \
151 | --temp-config=$(abs_top_srcdir)/contrib/pg_pathman/conf.add \
152 | --inputdir=$(abs_top_srcdir)/contrib/pg_pathman/ \
153 | pathman_array_qual pathman_basic pathman_bgw pathman_calamity pathman_callbacks \
154 | pathman_column_type pathman_cte pathman_domains pathman_dropped_cols pathman_expressions \
155 | pathman_foreign_keys pathman_gaps pathman_inserts pathman_interval pathman_join_clause \
156 | pathman_lateral pathman_hashjoin pathman_mergejoin pathman_only pathman_param_upd_del \
157 | pathman_permissions pathman_rebuild_deletes pathman_rebuild_updates pathman_rowmarks \
158 | pathman_runtime_nodes pathman_subpartitions pathman_update_node pathman_update_triggers \
159 | pathman_upd_del pathman_utility_stmt pathman_views
160 |
161 | pg-regress: | start run-pg-regress
162 | pathman-regress: | start run-pathman-regress-ext stop
163 | installcheck:
164 | $(prove_installcheck)
165 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # multimaster
2 |
3 | multimaster is a Postgres extension with a set of core patches that turn the
4 | DBMS into a synchronous shared-nothing symmetric cluster providing high
5 | availability with strong consistency and read scalability.
6 |
7 | It offers the following benefits, some of which are not available in traditional streaming replication based solutions:
8 | * Fault tolerance and automatic node recovery
9 | * Fast failover
10 | * Both read and write transactions can be executed on any node.
11 | * Read scalability
12 | * Working with temporary tables on each cluster node
13 | * Online minor upgrades
14 |
15 | ## Documentation
16 |
17 | [current documentation](https://postgrespro.github.io/mmts/)
18 |
19 | Documentation for versions released with PostgresPro Enterprise can be found
20 | [here](https://postgrespro.ru/docs/enterprise/current/multimaster?lang=en).
21 |
22 | ## Building from source
23 |
24 | Since multimaster depends on core patches, both Postgres and extension must be compiled. The patched version (based on Postgres 13) is available [here](https://github.com/postgrespro/postgres_cluster/tree/rel13_mm_2). Follow the [documentation](https://www.postgresql.org/docs/current/installation.html) to build it.
25 |
26 | Then enter the build directory and install the extension with
27 | ```shell
28 | cd contrib
29 | git clone https://github.com/postgrespro/mmts/
30 | cd mmts
31 | make install
32 | ```
33 |
--------------------------------------------------------------------------------
/doc/multimaster_book.xml:
--------------------------------------------------------------------------------
1 | <!-- NOTE(review): the original markup was mangled by tag-stripping extraction
2 | (lines 1-4 lost, leaving a bare "]>" and an orphan entity reference).
3 | Reconstructed DocBook skeleton below — verify against the repository copy. -->
4 | <!DOCTYPE book SYSTEM "docbookx.dtd" [
5 | <!ENTITY multimaster SYSTEM "multimaster.xml">
6 | ]>
7 |
8 | <book>
9 |  <title>multimaster Documentation</title>
10 | &multimaster;
11 | </book>
12 |
13 |
--------------------------------------------------------------------------------
/doc/readme.md:
--------------------------------------------------------------------------------
1 | # Generating documentation
2 | ```
3 | xmllint --noout --valid multimaster_book.xml
4 | xsltproc stylesheet.xsl multimaster_book.xml >multimaster.html
5 | ```
6 |
7 | and don't forget to install the result on postgrespro.github.io:
8 | ```
9 | cp multimaster.html stylesheet.css /mmts/
10 | ```
--------------------------------------------------------------------------------
/doc/specs/.gitignore:
--------------------------------------------------------------------------------
1 | *.toolbox/
2 | .ipynb_checkpoints/
3 |
--------------------------------------------------------------------------------
/doc/specs/MtmGenerations.cfg:
--------------------------------------------------------------------------------
1 | \* MV CONSTANT declarations
2 | CONSTANTS
3 | n1 = n1
4 | n2 = n2
5 | n3 = n3
6 | \* MV CONSTANT definitions
7 | CONSTANT
8 | nodes = {n1, n2, n3}
9 | \* SYMMETRY definition
10 | SYMMETRY perms
11 | \* CONSTANT definitions
12 | CONSTANT
13 | max_xacts = 3
14 | CONSTANT
15 | max_gen = 3
16 | \* INIT definition
17 | INIT
18 | Init
19 | \* NEXT definition
20 | NEXT
21 | Next
22 | \* INVARIANT definition
23 | INVARIANT
24 | OrderOk
25 | \* Generated on Fri Dec 06 18:48:51 MSK 2019
--------------------------------------------------------------------------------
/doc/specs/MtmPrimitiveCurrent.cfg:
--------------------------------------------------------------------------------
1 | \* MV CONSTANT declarations
2 | CONSTANTS
3 | n1 = n1
4 | n2 = n2
5 | n3 = n3
6 | \* MV CONSTANT definitions
7 | CONSTANT
8 | nodes = {n1, n2, n3}
9 | \* CONSTANT definitions
10 | \* INIT definition
11 | INIT
12 | Init
13 | \* NEXT definition
14 | NEXT
15 | Next
16 | \* INVARIANT definition
17 | INVARIANT
18 | OrderOk
19 | \* Generated on Fri Dec 06 18:48:51 MSK 2019
--------------------------------------------------------------------------------
/doc/specs/MtmPrimitiveCurrent.tla:
--------------------------------------------------------------------------------
1 | ---- MODULE MtmPrimitiveCurrent ----
2 |
3 | \* Primitive (meaning immediate PREPARE everywhere and immediate recovery)
4 | \* but pretty close model of current multimaster.
5 | \* - There is an obvious sausage problem, shown by TLC. One of its appearances
6 | \* is that we push xact into node without checking its state at all; xact is
7 | \* just appended to all nodes coordinator regards as 'enabled'.
8 | \* - Also 'works' only on 3 nodes because we recover from single node.
9 | \* - I don't see any reason for RECOVERED->ONLINE transition condition,
10 | \* and associated maintenance of walsenders/walreceivers masks. We can allow
11 | \* our xacts even just after recovery or (simpler for selecting xacts needing
12 | \* resolving) when majority is enabled.
13 | \* - I also don't see the point of recovery phase in RECOVERED|ONLINE: we don't pull
14 | \* all origins and thus it doesn't save us from sausage-like problems,
15 | \* but we still don't confirm xacts and don't allow parallel apply in it.
16 |
17 | \* model depth constraint is hardcoded in do_tx
18 |
19 | EXTENDS Integers, Sequences, FiniteSets, TLC
20 | VARIABLES state, logs
21 |
22 | CONSTANT nodes
23 |
24 | n_nodes == Cardinality(nodes)
25 |
26 |
27 | \**************************************************************************************
28 | \* Helpers
29 | \**************************************************************************************
30 |
31 | \* is s1 subsequence of s2?
32 | IsSubSeq(s1, s2) ==
33 | /\ Len(s1) <= Len(s2)
34 | /\ SubSeq(s2, 1, Len(s1)) = s1
35 |
36 |
37 | quorum(mask) == Cardinality({i \in DOMAIN mask : mask[i] = 1}) >= (n_nodes \div 2 + 1)
38 |
39 | max(set) == IF set = {} THEN 0 ELSE CHOOSE e1 \in set: \A e2 \in set: e1 >= e2
40 |
41 | maxlen(seqs) == max({Len(seqs[n]) : n \in DOMAIN seqs})
42 |
43 | \* max lsn of given origin in given log
44 | maxlsn(log, origin) == max({log[j].olsn : j \in {i \in DOMAIN log : log[i].origin = origin }})
45 |
46 | \* how far each node's changes are applied in given log?
47 | rep_state(log) == [n \in nodes |-> maxlsn(log,n)]
48 |
49 | log_newer_than(log, origin_vec) == SelectSeq(log, LAMBDA e: e.olsn > origin_vec[e.origin])
50 |
51 | \*is_increasing(s) == IF Len(s) > 1
52 | \* THEN {s[i] < s[i+1] : i \in 1..(Len(s)-1)} = {TRUE}
53 | \* ELSE TRUE
54 |
55 | \* returns not just new status but record with new state because masks might change
56 | \* old status is taken from state[n]
57 | new_state(n, view, enabled, wsndmask, wrcvmask) ==
58 | LET
59 | old_status == state[n].status
60 | new_status == CASE
61 | \* This is hardly needed; safety won't be altered if we are in recovery
62 | \* with less than majority in view mask
63 | ~ quorum(view) -> "disabled"
64 | [] quorum(view) /\ old_status = "disabled" -> "recovery"
65 | \* recovery -> recovered done explicitly in do_recovery()
66 | [] quorum(view) /\ old_status = "recovered" /\ view = enabled /\ view = wsndmask /\ view = wrcvmask -> "online"
67 | \* I don't think we need that, nothing should be prepared with minority enabled anyway
68 | [] quorum(view) /\ old_status = "online" /\ ~quorum(enabled) -> "disabled"
69 | [] OTHER -> old_status
70 | \* all zeros but me
71 | zeros == [[_n \in nodes |-> 0] EXCEPT ![n] = 1]
72 | new_enabled == IF new_status = "disabled" THEN zeros ELSE enabled
73 | new_wsndmask == IF new_status = "disabled" THEN zeros ELSE wsndmask
74 | new_wrcvmask == IF new_status = "disabled" THEN zeros ELSE wrcvmask
75 | IN
76 | \* next_lsn goes unchanged
77 | [state[n] EXCEPT !.status = new_status,
78 | !.view = view,
79 | !.enabled = new_enabled,
80 | !.walsenders = new_wsndmask,
81 | !.walreceivers = new_wrcvmask]
82 |
83 |
84 | \**************************************************************************************
85 | \* Initial
86 | \**************************************************************************************
87 |
88 |
89 | Init == /\ state = [n \in nodes |-> [
90 | next_lsn |-> 1,
91 | status |-> "disabled",
92 | view |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1],
93 | enabled |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1],
94 | walsenders |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1],
95 | walreceivers |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1]
96 | ]]
97 | /\ logs = [n \in nodes |-> << >>]
98 |
99 | \**************************************************************************************
100 | \* Actions
101 | \**************************************************************************************
102 |
103 |
104 | \* n1 disconnects n2: drop n2 from n1's view and enabled masks, recompute n1's state.
105 | disconnect(n1, n2) ==
106 | /\ n1 /= n2
107 | /\ state[n1].view[n2] = 1

109 | /\ logs' = logs
110 | /\ LET
111 | view == [state[n1].view EXCEPT ![n2] = 0]
112 | enabled == [state[n1].enabled EXCEPT ![n2] = 0]
113 | \* Fix(review): new_state recomputes n1's record, so it must receive n1's own
114 | \* walreceivers mask; the original passed state[n2].walreceivers (copy-paste slip,
115 | \* cf. connect() and do_recovered(), which pass the masks of the node being updated).
116 | n1_state == new_state(n1, view, enabled, state[n1].walsenders, state[n1].walreceivers)
117 | IN
118 | state' = [state EXCEPT ![n1] = n1_state]
116 |
117 |
118 | connect(n1, n2) ==
119 | /\ n1 /= n2
120 | /\ state[n1].view[n2] = 0
121 |
122 | /\ logs' = logs
123 | /\ LET
124 | view == [state[n1].view EXCEPT ![n2] = 1]
125 | n1_state == new_state(n1, view, state[n1].enabled, state[n1].walsenders, state[n1].walreceivers)
126 | IN
127 | state' = [state EXCEPT ![n1] = n1_state]
128 |
129 | \* n1 recovers from n2
130 | do_recovery(n1, n2) ==
131 | /\ n1 /= n2
132 | /\ state[n1].status = "recovery"
133 | /\ state[n1].view[n2] = 1
134 | \* Apparently this ensures we won't keep dead node as enabled
135 | /\ state[n2].view[n1] = 1
136 |
137 | /\ LET
138 | origin_vec == rep_state(logs[n1])
139 | new_entries == log_newer_than(logs[n2], origin_vec)
140 | \* enable n1
141 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1]
142 | n2_state == new_state(n2, state[n2].view, n2_enabled, state[n2].walsenders, state[n2].walreceivers)
143 | IN
144 | /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries]
145 | /\ state' = [state EXCEPT ![n1].status = "recovered",
146 | ![n2] = n2_state]
147 |
148 |
149 | do_recovered(n1, n2) ==
150 | /\ n1 /= n2
151 | /\ (state[n1].status = "recovered" \/ state[n1].status = "online")
152 | /\ state[n1].view[n2] = 1
153 | /\ state[n2].view[n1] = 1
154 |
155 | /\ LET
156 | our_last_lsn == maxlsn(logs[n1], n2)
157 | new_entries == SelectSeq(logs[n2], LAMBDA e: e.origin = n2 /\ e.olsn > our_last_lsn )
158 | IN
159 | logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries]
160 | /\ LET
161 | n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1]
162 | n2_walsenders == [state[n2].walsenders EXCEPT ![n1] = 1]
163 | n2_state == new_state(n2, state[n2].view, n2_enabled, n2_walsenders, state[n2].walreceivers)
164 | n1_walreceivers == [state[n1].walreceivers EXCEPT ![n2] = 1]
165 | n1_state == new_state(n1, state[n1].view, state[n1].enabled, state[n1].walsenders, n1_walreceivers)
166 | IN
167 | state' = [state EXCEPT ![n1] = n1_state,
168 | ![n2] = n2_state]
169 |
170 |
171 | do_tx(node) ==
172 | \* model depth constraint
173 | /\ Len(logs[node]) <= 4
174 | /\ state[node].status = "online"
175 | /\ quorum(state[node].enabled)
176 | /\ logs' = [n \in nodes |->
177 | IF state[node].enabled[n] = 1
178 | THEN Append(logs[n], [origin |-> node, olsn |-> state[node].next_lsn])
179 | ELSE logs[n]]
180 | /\ state' = [state EXCEPT ![node].next_lsn = state[node].next_lsn + 1]
181 |
182 |
183 | \**************************************************************************************
184 | \* Final spec
185 | \**************************************************************************************
186 |
187 |
188 | Next == \/ \E n1,n2 \in nodes : connect(n1,n2)
189 | \/ \E n1,n2 \in nodes : disconnect(n1,n2)
190 | \/ \E n1,n2 \in nodes : do_recovery(n1,n2)
191 | \/ \E n1,n2 \in nodes : do_recovered(n1,n2)
192 | \/ \E n \in nodes : do_tx(n)
193 |
194 | spec == Init /\ [][Next]_<>
195 |
196 |
197 | \**************************************************************************************
198 | \* Stuff to check
199 | \**************************************************************************************
200 |
201 | \* Make sure every log is sublog of the longest one
202 | OrderOk ==
203 | LET
204 | most_advanced_node == CHOOSE n1 \in nodes: \A n2 \in nodes: Len(logs[n1]) >= Len(logs[n2])
205 | IN
206 | \A n \in nodes: IsSubSeq(logs[n], logs[most_advanced_node])
207 |
208 | ====
--------------------------------------------------------------------------------
/doc/specs/MtmPrimitiveCurrentMasks.cfg:
--------------------------------------------------------------------------------
1 | \* MV CONSTANT declarations
2 | CONSTANTS
3 | n1 = n1
4 | n2 = n2
5 | n3 = n3
6 | \* MV CONSTANT definitions
7 | CONSTANT
8 | nodes = {n1, n2, n3}
9 | \* CONSTANT definitions
10 | \* INIT definition
11 | INIT
12 | Init
13 | \* NEXT definition
14 | NEXT
15 | Next
16 | \* INVARIANT definition
17 | INVARIANT
18 | OrderOk
19 | \* Generated on Fri Dec 06 18:48:51 MSK 2019
--------------------------------------------------------------------------------
/doc/specs/MtmPrimitiveCurrentMasks.tla:
--------------------------------------------------------------------------------
1 | ---- MODULE MtmPrimitiveCurrentMasks ----
2 |
3 | \* This just adds to MtmPrimitiveCurrent.tla tracking of enabled masks: while
4 | \* doing xact coordinator stamps it with current enabled mask. Others apply it
5 | \* in normal mode iff their enabled mask is exactly the same. TLC demonstrates
6 | \* here that we still have a problem because in do_recovered we ask to enable us
7 | \* without pulling all origins.
8 |
9 | \* model depth constraint is hardcoded in do_tx
10 |
11 | EXTENDS Integers, Sequences, FiniteSets, TLC
12 | VARIABLES state, logs
13 |
14 | CONSTANT nodes
15 |
16 | n_nodes == Cardinality(nodes)
17 |
18 |
19 | \**************************************************************************************
20 | \* Helpers
21 | \**************************************************************************************
22 |
23 | \* is s1 subsequence of s2?
24 | IsSubSeq(s1, s2) ==
25 | /\ Len(s1) <= Len(s2)
26 | /\ SubSeq(s2, 1, Len(s1)) = s1
27 |
28 |
29 | quorum(mask) == Cardinality({i \in DOMAIN mask : mask[i] = 1}) >= (n_nodes \div 2 + 1)
30 |
31 | max(set) == IF set = {} THEN 0 ELSE CHOOSE e1 \in set: \A e2 \in set: e1 >= e2
32 |
33 | maxlen(seqs) == max({Len(seqs[n]) : n \in DOMAIN seqs})
34 |
35 | \* max lsn of given origin in given log
36 | maxlsn(log, origin) == max({log[j].olsn : j \in {i \in DOMAIN log : log[i].origin = origin }})
37 |
38 | \* how far each node's changes are applied in given log?
39 | rep_state(log) == [n \in nodes |-> maxlsn(log,n)]
40 |
41 | log_newer_than(log, origin_vec) == SelectSeq(log, LAMBDA e: e.olsn > origin_vec[e.origin])
42 |
43 | \*is_increasing(s) == IF Len(s) > 1
44 | \* THEN {s[i] < s[i+1] : i \in 1..(Len(s)-1)} = {TRUE}
45 | \* ELSE TRUE
46 |
47 | \* returns not just new status but record with new state because masks might change
48 | \* old status is taken from state[n]
49 | new_state(n, view, enabled, wsndmask, wrcvmask) ==
50 | LET
51 | old_status == state[n].status
52 | new_status == CASE
53 | \* This is hardly needed; safety won't be altered if we are in recovery
54 | \* with less than majority in view mask
55 | ~ quorum(view) -> "disabled"
56 | [] quorum(view) /\ old_status = "disabled" -> "recovery"
57 | \* recovery -> recovered done explicitly in do_recovery()
58 | [] quorum(view) /\ old_status = "recovered" /\ view = enabled /\ view = wsndmask /\ view = wrcvmask -> "online"
59 | \* I don't think we need that, nothing should be prepared with minority enabled anyway
60 | [] quorum(view) /\ old_status = "online" /\ ~quorum(enabled) -> "disabled"
61 | [] OTHER -> old_status
62 | \* all zeros but me
63 | zeros == [[_n \in nodes |-> 0] EXCEPT ![n] = 1]
64 | new_enabled == IF new_status = "disabled" THEN zeros ELSE enabled
65 | new_wsndmask == IF new_status = "disabled" THEN zeros ELSE wsndmask
66 | new_wrcvmask == IF new_status = "disabled" THEN zeros ELSE wrcvmask
67 | IN
68 | \* next_lsn goes unchanged
69 | [state[n] EXCEPT !.status = new_status,
70 | !.view = view,
71 | !.enabled = new_enabled,
72 | !.walsenders = new_wsndmask,
73 | !.walreceivers = new_wrcvmask]
74 |
75 |
76 | \**************************************************************************************
77 | \* Initial
78 | \**************************************************************************************
79 |
80 |
81 | Init == /\ state = [n \in nodes |-> [
82 | next_lsn |-> 1,
83 | status |-> "disabled",
84 | view |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1],
85 | enabled |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1],
86 | walsenders |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1],
87 | walreceivers |-> [[_n \in nodes |-> 0] EXCEPT ![n] = 1]
88 | ]]
89 | /\ logs = [n \in nodes |-> << >>]
90 |
91 | \**************************************************************************************
92 | \* Actions
93 | \**************************************************************************************
94 |
95 |
\* n1 disconnects n2: n2 is dropped from n1's view and enabled masks and
\* n1's state is recomputed via new_state. Logs are untouched.
disconnect(n1, n2) ==
    /\ n1 /= n2
    /\ state[n1].view[n2] = 1

    /\ logs' = logs
    /\ LET
           view == [state[n1].view EXCEPT ![n2] = 0]
           enabled == [state[n1].enabled EXCEPT ![n2] = 0]
           \* NOTE(review): pass n1's own walreceivers mask; the previous code
           \* passed state[n2].walreceivers, i.e. another node's local mask,
           \* which is inconsistent with connect/do_recovery/do_recovered and
           \* looks like a typo.
           n1_state == new_state(n1, view, enabled, state[n1].walsenders, state[n1].walreceivers)
       IN
           state' = [state EXCEPT ![n1] = n1_state]
108 |
109 |
\* n1 (re)establishes connectivity to n2: add n2 to n1's view only; enabled
\* and wal masks are left as-is and the status transition (if any) is decided
\* by new_state. Logs are untouched.
connect(n1, n2) ==
    /\ n1 /= n2
    /\ state[n1].view[n2] = 0

    /\ logs' = logs
    /\ LET
           view == [state[n1].view EXCEPT ![n2] = 1]
           n1_state == new_state(n1, view, state[n1].enabled, state[n1].walsenders, state[n1].walreceivers)
       IN
           state' = [state EXCEPT ![n1] = n1_state]
120 |
\* n1 recovers from n2: n1 appends all of n2's log entries it doesn't have yet
\* and moves to "recovered"; n2 re-enables n1.
do_recovery(n1, n2) ==
    /\ n1 /= n2
    /\ state[n1].status = "recovery"
    /\ state[n1].view[n2] = 1
    \* Apparently this ensures we won't keep dead node as enabled
    /\ state[n2].view[n1] = 1

    /\ LET
           \* per-origin progress vector of what n1 already holds
           origin_vec == rep_state(logs[n1])
           new_entries == log_newer_than(logs[n2], origin_vec)
           \* enable n1
           n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1]
           n2_state == new_state(n2, state[n2].view, n2_enabled, state[n2].walsenders, state[n2].walreceivers)
       IN
           /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries]
           /\ state' = [state EXCEPT ![n1].status = "recovered",
                                    ![n2] = n2_state]
139 |
140 |
\* Establish the normal-operation replication channel n2 -> n1: n1 catches up
\* on n2-originated entries past its last received lsn, n2 marks n1 as enabled
\* and opens a walsender to it, n1 opens the matching walreceiver.
do_recovered(n1, n2) ==
    /\ n1 /= n2
    /\ (state[n1].status = "recovered" \/ state[n1].status = "online")
    /\ state[n1].view[n2] = 1
    /\ state[n2].view[n1] = 1

    /\ LET
           our_last_lsn == maxlsn(logs[n1], n2)
           new_entries == SelectSeq(logs[n2], LAMBDA e: e.origin = n2 /\ e.olsn > our_last_lsn )
       IN
           \* only accept entries whose participant set matches what n1 enables
           /\ \A k \in DOMAIN new_entries: new_entries[k].participants = state[n1].enabled
           /\ logs' = [logs EXCEPT ![n1] = logs[n1] \o new_entries]
           /\ LET
                  n2_enabled == [state[n2].enabled EXCEPT ![n1] = 1]
                  n2_walsenders == [state[n2].walsenders EXCEPT ![n1] = 1]
                  n2_state == new_state(n2, state[n2].view, n2_enabled, n2_walsenders, state[n2].walreceivers)
                  n1_walreceivers == [state[n1].walreceivers EXCEPT ![n2] = 1]
                  n1_state == new_state(n1, state[n1].view, state[n1].enabled, state[n1].walsenders, n1_walreceivers)
              IN
                  state' = [state EXCEPT ![n1] = n1_state,
                                         ![n2] = n2_state]
162 |
163 |
\* An online node commits a transaction: the entry (tagged with origin, its
\* lsn, and the current participant mask) is appended to the log of every
\* node the originator considers enabled.
do_tx(node) ==
    \* model depth constraint
    /\ Len(logs[node]) <= 4
    /\ state[node].status = "online"
    /\ quorum(state[node].enabled)
    \* make sure set of enabled nodes is the same on all participants
    /\ \A n \in nodes: state[node].enabled[n] = 0 \/ state[n].enabled = state[node].enabled
    /\ logs' = [n \in nodes |->
                   IF state[node].enabled[n] = 1
                   THEN Append(logs[n], [origin |-> node, olsn |-> state[node].next_lsn, participants |-> state[node].enabled])
                   ELSE logs[n]]
    /\ state' = [state EXCEPT ![node].next_lsn = state[node].next_lsn + 1]
176 |
177 |
178 | \**************************************************************************************
179 | \* Final spec
180 | \**************************************************************************************
181 |
182 |
Next == \/ \E n1,n2 \in nodes : connect(n1,n2)
        \/ \E n1,n2 \in nodes : disconnect(n1,n2)
        \/ \E n1,n2 \in nodes : do_recovery(n1,n2)
        \/ \E n1,n2 \in nodes : do_recovered(n1,n2)
        \/ \E n \in nodes : do_tx(n)

\* The subscript must be the tuple of all spec variables; the bare "_<>" was
\* not valid TLA+ (the inner "<state, logs>" was evidently eaten by markup
\* stripping, leaving only the outer angle brackets).
spec == Init /\ [][Next]_<<state, logs>>
190 |
191 |
192 | \**************************************************************************************
193 | \* Stuff to check
194 | \**************************************************************************************
195 |
\* Make sure every log is sublog of the longest one
OrderOk ==
    LET
        \* any node holding a longest log (CHOOSE is deterministic in TLC)
        most_advanced_node == CHOOSE n1 \in nodes: \A n2 \in nodes: Len(logs[n1]) >= Len(logs[n2])
    IN
        \A n \in nodes: IsSubSeq(logs[n], logs[most_advanced_node])
202 |
203 | ====
--------------------------------------------------------------------------------
/doc/specs/MtmPrimitiveCurrentMasksFixed.cfg:
--------------------------------------------------------------------------------
1 | \* MV CONSTANT declarations
2 | CONSTANTS
3 | n1 = n1
4 | n2 = n2
5 | n3 = n3
6 | \* MV CONSTANT definitions
7 | CONSTANT
8 | nodes = {n1, n2, n3}
9 | \* SYMMETRY definition
10 | SYMMETRY perms
11 | \* CONSTANT definitions
12 | CONSTANT
13 | depth = 3
14 | \* INIT definition
15 | INIT
16 | Init
17 | \* NEXT definition
18 | NEXT
19 | Next
20 | \* INVARIANT definition
21 | INVARIANT
22 | OrderOk
23 | \* Generated on Fri Dec 06 18:48:51 MSK 2019
--------------------------------------------------------------------------------
/doc/specs/commit.cfg:
--------------------------------------------------------------------------------
1 | SPECIFICATION spec
2 | INVARIANTS consistency types_correct1 types_correct2
--------------------------------------------------------------------------------
/doc/stylesheet.css:
--------------------------------------------------------------------------------
1 | @import url('https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&subset=cyrillic');
2 |
3 | body {
4 | font-family: 'Roboto',Arial,sans-serif;
5 | }
6 |
7 | body {
8 | font-size: 18px;
9 | font-weight: 300;
10 | }
11 |
12 | /* ../media/css/docs.css */
13 | .navheader th { text-align: center; } /* anti-bootstrap */
14 |
.navheader tbody tr:nth-child(1) th { /* temporarily hide the unneeded row */
16 | display: none;
17 | }
18 |
19 | /* PostgreSQL.org Documentation Style */
20 |
21 | .book div.NAVHEADER table {
22 | margin-left: 0;
23 | }
24 |
25 | .book div.NAVHEADER th {
26 | text-align: center;
27 | }
28 |
29 | .book {
30 | font-size: 15px;
31 | line-height: 1.6;
32 | }
33 |
34 | /* Heading Definitions */
35 |
36 | .book h1,
37 | .book h2,
38 | .book h3 {
39 | font-weight: bold;
40 | margin-top: 2ex;
41 | }
42 |
43 | .book h1 a,
44 | .book h2 a,
45 | .book h3 a,
46 | .book h4 a
47 | {
48 | color: #EC5800;
49 | }
50 |
51 | /* EKa --> */
52 | .book h1 {
53 | font-size: 1.4em;
54 | }
55 |
56 | .book h2 {
57 | font-size: 1.25em;
58 | }
59 |
60 | .book h3 {
61 | font-size: 1.2em;
62 | }
63 |
64 | .book h4 {
65 | font-size: 1.15em;
66 | }
67 |
68 | .book h5 {
69 | font-size: 1.1em;
70 | }
71 |
72 | .book h6 {
73 | font-size: 1.0em;
74 | }
75 | /* <-- EKa */
76 |
77 | .book h1 a:hover {
78 | color: #EC5800;
79 | text-decoration: none;
80 | }
81 |
82 | .book h2 a:hover,
83 | .book h3 a:hover,
84 | .book h4 a:hover {
85 | color: #666666;
86 | text-decoration: none;
87 | }
88 |
89 |
90 |
91 | /* Text Styles */
92 |
93 | .book div.SECT2 {
94 | margin-top: 4ex;
95 | }
96 |
97 | .book div.SECT3 {
98 | margin-top: 3ex;
99 | margin-left: 3ex;
100 | }
101 |
102 | .book .txtCurrentLocation {
103 | font-weight: bold;
104 | }
105 |
106 | .book p,
107 | .book ol,
108 | .book ul,
109 | .book li {
110 | line-height: 1.5em;
111 | }
112 |
113 | .book code {
114 | font-size: 1em;
115 | padding: 0px;
116 | color: #525f6c;
117 | background-color: #FFF;
118 | border-radius: 0px;
119 | }
120 |
121 | .book code, kbd, pre, samp {
122 | font-family: monospace,monospace;
123 | }
124 |
125 | .book .txtCommentsWrap {
126 | border: 2px solid #F5F5F5;
127 | width: 100%;
128 | }
129 |
130 | .book .txtCommentsContent {
131 | background: #F5F5F5;
132 | padding: 3px;
133 | }
134 |
135 | .book .txtCommentsPoster {
136 | float: left;
137 | }
138 |
139 | .book .txtCommentsDate {
140 | float: right;
141 | }
142 |
143 | .book .txtCommentsComment {
144 | padding: 3px;
145 | }
146 |
147 | .book #docContainer pre code,
148 | .book #docContainer pre tt,
149 | .book #docContainer pre pre,
150 | .book #docContainer tt tt,
151 | .book #docContainer tt code,
152 | .book #docContainer tt pre {
153 | font-size: 1em;
154 | }
155 |
156 | .book pre.LITERALLAYOUT,
157 | .book .SCREEN,
158 | .book .SYNOPSIS,
159 | .book .PROGRAMLISTING,
160 | .book .REFSYNOPSISDIV p,
161 | .book table.CAUTION,
162 | .book table.WARNING,
163 | .book blockquote.NOTE,
164 | .book blockquote.TIP,
165 | .book div.note,
166 | .book div.tip,
167 | .book table.CALSTABLE {
168 | -moz-box-shadow: 3px 3px 5px #DFDFDF;
169 | -webkit-box-shadow: 3px 3px 5px #DFDFDF;
170 | -khtml-box-shadow: 3px 3px 5px #DFDFDF;
171 | -o-box-shadow: 3px 3px 5px #DFDFDF;
172 | box-shadow: 3px 3px 5px #DFDFDF;
173 | }
174 |
/* Border/padding for all code-ish and callout boxes.
 * Fix: the selector list was missing a comma after ".book blockquote.TIP",
 * which turned the next line into the bogus descendant selector
 * ".book blockquote.TIP .book div.note" — so div.note/div.tip never
 * received these declarations. */
.book pre.LITERALLAYOUT,
.book .SCREEN,
.book .SYNOPSIS,
.book .PROGRAMLISTING,
.book .REFSYNOPSISDIV p,
.book table.CAUTION,
.book table.WARNING,
.book blockquote.NOTE,
.book blockquote.TIP,
.book div.note,
.book div.tip {
  color: black;
  border-width: 1px;
  border-style: solid;
  padding: 2ex;
  margin: 2ex 0 2ex 2ex;
  overflow: auto;
  -moz-border-radius: 8px;
  -webkit-border-radius: 8px;
  -khtml-border-radius: 8px;
  border-radius: 8px;
}
197 |
198 | .book div.note,
199 | .book div.tip {
200 | -moz-border-radius: 8px !important;
201 | -webkit-border-radius: 8px !important;
202 | -khtml-border-radius: 8px !important;
203 | border-radius: 8px !important;
204 | }
205 |
206 |
207 | .book pre.LITERALLAYOUT,
208 | .book pre.SYNOPSIS,
209 | .book pre.PROGRAMLISTING,
210 | .book .REFSYNOPSISDIV p,
211 | .book .SCREEN {
212 | border-color: #CFCFCF;
213 | background-color: #F7F7F7;
214 | }
215 |
216 | .book blockquote.NOTE,
217 | .book blockquote.TIP,
218 | .book div.note,
219 | .book div.tip {
220 | border-color: #DBDBCC;
221 | background-color: #EEEEDD;
222 | padding: 14px;
223 | width: 572px;
224 | /* font-size: 12px; */
225 | }
226 |
227 | .book blockquote.NOTE,
228 | .book blockquote.TIP,
229 | .book table.CAUTION,
230 | .book table.WARNING {
231 | margin: 4ex auto;
232 | }
233 |
234 | .book div.note,
235 | .book div.tip {
236 | margin: 4ex auto !important;
237 | }
238 |
239 |
240 | .book blockquote.NOTE p,
241 | .book blockquote.TIP p,
242 | .book div.note p,
243 | .book div.tip p {
244 | margin: 0;
245 | }
246 |
/* Strip the box-shadow/margins from code inside notes and tips.
 * Fix: the last selector read ".book div.tio code" — a typo for "div.tip",
 * so code elements inside tip divs kept their shadow. */
.book blockquote.NOTE pre,
.book blockquote.NOTE code,
.book div.note pre,
.book div.note code,
.book blockquote.TIP pre,
.book blockquote.TIP code,
.book div.tip pre,
.book div.tip code {
  margin-left: 0;
  margin-right: 0;
  -moz-box-shadow: none;
  -webkit-box-shadow: none;
  -khtml-box-shadow: none;
  -o-box-shadow: none;
  box-shadow: none;
}
263 |
264 | .book .emphasis,
265 | .book .c2 {
266 | font-weight: bold;
267 | }
268 |
269 | .book .REPLACEABLE {
270 | font-style: italic;
271 | }
272 |
273 | /* Table Styles */
274 |
275 | .book table {
276 | margin-left: 2ex;
277 | }
278 |
279 | .book table.CALSTABLE td,
280 | .book table.CALSTABLE th,
281 | .book table.CAUTION td,
282 | .book table.CAUTION th,
283 | .book table.WARNING td,
284 | .book table.WARNING th {
285 | border-style: solid;
286 | }
287 |
288 | .book table.CALSTABLE,
289 | .book table.CAUTION,
290 | .book table.WARNING {
291 | border-spacing: 0;
292 | border-collapse: collapse;
293 | }
294 |
295 | .book table.CALSTABLE
296 | {
297 | margin: 2ex 0 2ex 2ex;
298 | background-color: #E0ECEF;
299 | border: 2px solid #A7C6DF;
300 | }
301 |
302 | .book table.CALSTABLE tr:hover td
303 | {
304 | background-color: #EFEFEF;
305 | }
306 |
307 | .book table.CALSTABLE td {
308 | background-color: #FFF;
309 | }
310 |
311 | .book table.CALSTABLE td,
312 | .book table.CALSTABLE th {
313 | border: 1px solid #A7C6DF;
314 | padding: 0.5ex 0.5ex;
315 | }
316 |
317 | table.CAUTION,
318 | .book table.WARNING {
319 | border-collapse: separate;
320 | display: block;
321 | padding: 0;
322 | max-width: 600px;
323 | }
324 |
325 | .book table.CAUTION {
326 | background-color: #F5F5DC;
327 | border-color: #DEDFA7;
328 | }
329 |
330 | .book table.WARNING {
331 | background-color: #FFD7D7;
332 | border-color: #DF421E;
333 | }
334 |
335 | .book table.CAUTION td,
336 | .book table.CAUTION th,
337 | .book table.WARNING td,
338 | .book table.WARNING th {
339 | border-width: 0;
340 | padding-left: 2ex;
341 | padding-right: 2ex;
342 | }
343 |
/* Fix: terminate the declaration with a semicolon (harmless as the last
 * declaration, but fragile if more are appended). */
.book table.CAUTION td,
.book table.CAUTION th {
  border-color: #F3E4D5;
}
348 |
349 | .book table.WARNING td,
350 | .book table.WARNING th {
351 | border-color: #FFD7D7;
352 | }
353 |
354 | .book td.c1,
355 | .book td.c2,
356 | .book td.c3,
357 | .book td.c4,
358 | .book td.c5,
359 | .book td.c6 {
360 | font-size: 1.1em;
361 | font-weight: bold;
362 | border-bottom: 0px solid #FFEFEF;
363 | padding: 1ex 2ex 0;
364 | }
365 |
366 | .book .table thead {
367 | background: #E0ECEF;
368 | border-bottom: 1px solid #000;
369 | }
370 | .book .table > thead > tr > th {
371 | border-bottom: 1px solid #000;
372 | }
373 |
374 | .book td, th {
375 | padding: 0.1ex 0.5ex;
376 | }
377 |
378 | .book .book table tr:hover td {
379 | background-color: #EFEFEF;
380 | }
381 |
382 | /* Link Styles */
383 |
384 | .book #docNav a {
385 | font-weight: bold;
386 | }
387 |
388 | .book code.FUNCTION tt {
389 | font-size: 1em;
390 | }
391 |
/* Fix: "align: center" is not a CSS property (it was silently ignored);
 * the intended horizontal centering is done with auto side margins. */
.book table.docs-compare {
  margin-left: auto;
  margin-right: auto;
  width: 90%;
  border: 2px solid #999;
  border-collapse: collapse;
}
398 |
399 | .book table.docs-compare td {
400 | padding: 12px;
401 | border: 1px solid #DDD;
402 | }
403 |
404 | .book dd {
405 | margin-left: 40px;
406 | }
407 |
408 |
409 | .book .sidebar {
410 | padding: 8px;
411 | background: #FFF;
412 | width: auto;
413 | }
414 |
415 | .book pre {
416 | background: #f5f5f5;
417 | padding: 10px;
418 | border: 1px solid #ccc;
419 | border-radius: 4px;
420 | }
421 |
--------------------------------------------------------------------------------
/doc/stylesheet.xsl:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | 1
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | sect1 toc
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/expected/atx.out:
--------------------------------------------------------------------------------
1 | create table atx_test1(a text);
2 | -- check that commit of autonomous tx will not steal locks from parent tx
3 | begin;
4 | insert into atx_test1 values (1);
5 | select count(*) from pg_locks where transactionid=txid_current();
6 | count
7 | -------
8 | 1
9 | (1 row)
10 |
11 | begin autonomous;
12 | insert into atx_test1 values (1);
13 | select count(*) from pg_locks where transactionid=txid_current();
14 | count
15 | -------
16 | 1
17 | (1 row)
18 |
19 | commit;
20 | -- here we still should see our lock
21 | select count(*) from pg_locks where transactionid=txid_current();
22 | count
23 | -------
24 | 1
25 | (1 row)
26 |
27 | commit;
28 | drop table atx_test1;
29 |
--------------------------------------------------------------------------------
/multimaster.control:
--------------------------------------------------------------------------------
1 | comment = 'Multimaster'
2 | default_version = '1.0'
3 | module_pathname = '$libdir/multimaster'
4 | schema = mtm
5 | relocatable = false
6 |
--------------------------------------------------------------------------------
/referee/Makefile:
--------------------------------------------------------------------------------
1 | EXTENSION = referee
2 | DATA = referee--1.0.sql
3 | REGRESS = referee
4 |
5 | ifdef USE_PGXS
6 | PG_CONFIG = pg_config
7 | PGXS := $(shell $(PG_CONFIG) --pgxs)
8 | include $(PGXS)
9 | else
10 | subdir = contrib/mmts/referee
11 | top_builddir = ../../../
12 | include $(top_builddir)/src/Makefile.global
13 | include $(top_srcdir)/contrib/contrib-global.mk
14 | endif
15 |
--------------------------------------------------------------------------------
/referee/expected/referee.out:
--------------------------------------------------------------------------------
1 | CREATE EXTENSION referee;
2 | SELECT * FROM referee.decision;
3 | key | node_id | gen_num
4 | -----+---------+---------
5 | (0 rows)
6 |
7 | SELECT referee.request_grant(1, 7);
8 | request_grant
9 | ---------------
10 |
11 | (1 row)
12 |
13 | -- node can get its grant reissued
14 | SELECT referee.request_grant(1, 9);
15 | request_grant
16 | ---------------
17 |
18 | (1 row)
19 |
20 | -- but another can't get it while the previous is not cleared
21 | SELECT referee.request_grant(2, 4);
22 | ERROR: grant was already issued to node 1 in generation 9
23 | CONTEXT: PL/pgSQL function request_grant(integer,bigint) line 19 at RAISE
24 | SELECT referee.request_grant(2, 10);
25 | ERROR: grant was already issued to node 1 in generation 9
26 | CONTEXT: PL/pgSQL function request_grant(integer,bigint) line 19 at RAISE
27 | SELECT * FROM referee.decision;
28 | key | node_id | gen_num
29 | --------+---------+---------
30 | winner | 1 | 9
31 | (1 row)
32 |
33 | DELETE FROM referee.decision WHERE gen_num < 8 OR (node_id = 1 AND gen_num <= 9);
34 | -- surely 2 node can acquire the grant after removal of the old one
35 | SELECT referee.request_grant(2, 11);
36 | request_grant
37 | ---------------
38 |
39 | (1 row)
40 |
41 | SELECT * FROM referee.decision;
42 | key | node_id | gen_num
43 | --------+---------+---------
44 | winner | 2 | 11
45 | (1 row)
46 |
47 |
--------------------------------------------------------------------------------
/referee/referee--1.0.sql:
--------------------------------------------------------------------------------
1 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION
2 | \echo Use "CREATE EXTENSION referee" to load this file. \quit
3 |
4 | CREATE TABLE IF NOT EXISTS referee.decision(
5 | key text PRIMARY KEY NOT NULL,
6 | node_id int,
7 | -- storing gen_num here guarantees we clear (delete) the grant which is
8 | -- indeed can already be cleared instead of accidently removing newer one
9 | gen_num bigint
10 | );
11 |
-- returns nothing on success, bails out with ERROR on conflict
CREATE OR REPLACE FUNCTION referee.request_grant(applicant_id int, gen_num bigint) RETURNS void AS
$$
DECLARE
    winner_id int;
    winner_gen_num bigint;
BEGIN
    INSERT INTO referee.decision AS d VALUES ('winner', applicant_id, gen_num)
    ON CONFLICT (key) DO UPDATE SET
      node_id=EXCLUDED.node_id, gen_num=EXCLUDED.gen_num
    -- reissue grant iff it was previously given to this node, not another
    WHERE d.node_id = EXCLUDED.node_id AND
      -- this could be assert as well, node never repeats request with the same
      -- gen num
      d.gen_num < EXCLUDED.gen_num
    RETURNING applicant_id INTO winner_id;
    -- if insertion hasn't happened, there must have been conflict with existing
    -- grant
    IF winner_id IS NULL THEN
        -- no WHERE: the table holds at most the single 'winner' row
        -- (key is the primary key and only 'winner' is ever inserted)
        SELECT d.node_id, d.gen_num INTO winner_id, winner_gen_num FROM referee.decision d;
        RAISE EXCEPTION 'grant was already issued to node % in generation %', winner_id, winner_gen_num;
    END IF;
END
$$ LANGUAGE plpgsql;
36 |
-- Clear the current grant, if any. Always returns true.
-- (RETURN now uses the boolean literal instead of the string 'true',
-- which relied on an implicit text-to-bool cast.)
CREATE OR REPLACE FUNCTION referee.clean() RETURNS bool AS
$$
BEGIN
    DELETE FROM referee.decision WHERE key = 'winner';
    RETURN true;
END
$$ LANGUAGE plpgsql;
44 |
--------------------------------------------------------------------------------
/referee/referee.control:
--------------------------------------------------------------------------------
1 | comment = 'Multimaster referee'
2 | default_version = '1.0'
3 | module_pathname = '$libdir/referee'
4 | schema = referee
5 | relocatable = false
6 |
--------------------------------------------------------------------------------
/referee/sql/referee.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTENSION referee;
2 |
3 | SELECT * FROM referee.decision;
4 |
5 | SELECT referee.request_grant(1, 7);
6 | -- node can get its grant reissued
7 | SELECT referee.request_grant(1, 9);
8 | -- but another can't get it while the previous is not cleared
9 | SELECT referee.request_grant(2, 4);
10 | SELECT referee.request_grant(2, 10);
11 | SELECT * FROM referee.decision;
12 |
13 | DELETE FROM referee.decision WHERE gen_num < 8 OR (node_id = 1 AND gen_num <= 9);
14 | -- surely 2 node can acquire the grant after removal of the old one
15 | SELECT referee.request_grant(2, 11);
16 | SELECT * FROM referee.decision;
17 |
--------------------------------------------------------------------------------
/run.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl
#
# Start or stop a local multimaster cluster for manual experiments.
#
# Fixes: enable strict/warnings; replace indirect-object "new Cluster"
# with the unambiguous Cluster->new; use an explicit glob() instead of
# the obscure <...> form; restore the usage string's "<start|stop>"
# (evidently eaten by markup stripping).

use strict;
use warnings;

use File::Basename;
use Getopt::Long;
BEGIN { unshift @INC, '.'; unshift @INC, '../../src/test/perl' }
use Cluster;

my $n_nodes = 3;
my $referee = 0;
my $action = 'start';
GetOptions ("nnodes=i" => \$n_nodes, # numeric
			"referee" => \$referee,  # flag
			"action=s" => \$action); # strings
# referee works only with 2 nodes
$n_nodes = 2 if $referee;

if ($action eq "start")
{
	$Cluster::last_port_assigned = 65431;

	my $cluster = Cluster->new($n_nodes, $referee);
	$cluster->init();
	$cluster->start();
	$cluster->create_mm('regression');

	# prevent PostgresNode.pm from shutting down nodes on exit in END {}
	@PostgresNode::all_nodes = ();
}
elsif ($action eq "stop")
{
	# stop every cluster instance created under the tmp_check directory
	my @datas = glob("$TestLib::tmp_check/*data");
	foreach my $data (@datas) {
		TestLib::system_log('pg_ctl',
							'-D', "$data/pgdata",
							'-m', 'fast',
							'stop');
	}
}
else
{
	die("Usage: run.pl action=<start|stop> [opts]\n");
}
46 |
--------------------------------------------------------------------------------
/sql/atx.sql:
--------------------------------------------------------------------------------
1 | create table atx_test1(a text);
2 |
3 |
4 | -- check that commit of autonomous tx will not steal locks from parent tx
5 | begin;
6 | insert into atx_test1 values (1);
7 | select count(*) from pg_locks where transactionid=txid_current();
8 | begin autonomous;
9 | insert into atx_test1 values (1);
10 | select count(*) from pg_locks where transactionid=txid_current();
11 | commit;
12 | -- here we still should see our lock
13 | select count(*) from pg_locks where transactionid=txid_current();
14 | commit;
15 |
16 | drop table atx_test1;
17 |
--------------------------------------------------------------------------------
/src/bkb.c:
--------------------------------------------------------------------------------
1 | /*
2 | * bkb.c
3 | *
4 | * Bron–Kerbosch algorithm to find maximum clique in a graph.
5 | *
6 | * Copyright (c) 2017-2021, Postgres Professional
7 | *
8 | */
9 | #ifndef TEST
10 | #include "bkb.h"
11 |
12 | #else
#include <stdint.h>
#include <assert.h>
15 | #define Assert(expr) assert(expr)
16 | typedef uint64_t nodemask_t;
17 | #define MAX_NODES 64
18 | #define BIT_CHECK(mask, bit) (((mask) & ((nodemask_t)1 << (bit))) != 0)
19 | #define BIT_SET(mask, bit) (mask |= ((nodemask_t)1 << (bit)))
20 | #endif
21 |
22 | typedef struct {
23 | int size;
24 | int nodes[MAX_NODES];
25 | } NodeList;
26 |
27 | static void
28 | _list_append(NodeList* list, int n)
29 | {
30 | list->nodes[list->size++] = n;
31 | }
32 |
33 | static void
34 | _list_copy(NodeList* dst, NodeList const* src)
35 | {
36 | int i;
37 | int n = src->size;
38 | dst->size = n;
39 | for (i = 0; i < n; i++) {
40 | dst->nodes[i] = src->nodes[i];
41 | }
42 | }
43 |
44 | static nodemask_t
45 | _list_to_nodemask(NodeList *list)
46 | {
47 | nodemask_t res = 0;
48 | int i;
49 |
50 | for (i = 0; i < list->size; i++)
51 | BIT_SET(res, list->nodes[i]);
52 | return res;
53 | }
54 |
55 | /*
56 | * See original paper
57 | * Bron, Coen; Kerbosch, Joep (1973), "Algorithm 457: finding all cliques of
58 | * an undirected graph", Commun. ACM, ACM, 16 (9): 575–577
59 | * or wiki article (I recommend the latter). Var names (and generally the code)
60 | * here closely resemble ones in the original paper and deserve some deciphering:
61 | * - cur is R in wiki
62 | * - oldSet[0; ne) is X in wiki
63 | * - oldSet[ne; ce) is P in wiki
64 | *
65 | * Pristine Bron-Kerbosch algorithm calculates *all* max cliques. In mtm we
66 | * don't need that, so we return in result only one biggest max clique
67 | * (actually, this means we could avoid maintaining X altogether).
68 | * What we do need though is deterministic calculation, so that whenever we
69 | * have a majority of nodes seeing each other, *all* members of some such
70 | * majority calculate *the same* clique. e.g. with topology
71 | *
72 | * 2
73 | * /|\
74 | * 1 | 3
75 | * \|/
76 | * 4
77 | *
78 | * 2 and 4 must calculate the same clique, or we won't converge.
79 | * To this end, we compare max cliques by nodemask and pick the
80 | * smallest one.
81 | */
/*
 * One recursive step of Bron-Kerbosch with pivoting; see the big comment
 * above for the R/P/X <-> cur/oldSet mapping.
 */
static void
extend(NodeList* cur, NodeList* result, nodemask_t* graph, int* oldSet, int ne, int ce)
{
	int nod = 0;
	int minnod = ce;
	int fixp = -1; /* pivot (u in wiki) */
	/* index in oldSet of next vertice we'll include in R -- vertex v in wiki*/
	int s = -1;
	int i, j, k;
	int newce, newne;
	int sel; /* the vertex moved P->R itself, pointed to by s -- v in wiki */
	int newSet[MAX_NODES];

	/* Choose the pivot vertex fixp */
	for (i = 0; i < ce && minnod != 0; i++)
	{
		int p = oldSet[i];
		int cnt = 0;
		int pos = -1;

		/*
		 * Count how many non-neighbours of potential pivot we have in P.
		 * Counterintuitively, we require input to have self-loops, so node is
		 * sorta neighbour of itself, though we must also recurse into it and
		 * thus we miss it here (in cnt) and count it in nod instead.
		 * This mumbo-jumbo is important as it forces (cnt < minnod) be true
		 * when P contains only one vertex (minnod=1 initially).
		 * I'd actually make initial minnod bigger and remove self loops...
		 */
		for (j = ne; j < ce && cnt < minnod; j++)
		{
			if (!BIT_CHECK(graph[p], oldSet[j]))
			{
				cnt++;
				pos = j;
			}
		}

		if (cnt < minnod)
		{
			minnod = cnt;
			fixp = p;
			if (i < ne)
			{
				/* if pivot is from X, not P, take random non-neighbour */
				s = pos;
			}
			else
			{
				/*
				 * else, process pivot itself first, otherwise we won't find
				 * it in the loop below as pivot is a neighbour of itself
				 */
				s = i;
				/* don't forget to increment num of nodes to recurse to */
				nod = 1;
			}
		}
	}

	/* recurse into the pivot's non-neighbours (plus the pivot itself) */
	for (k = minnod + nod; k >= 1; k--)
	{
		Assert(s >= 0);
		Assert(s < MAX_NODES);
		Assert(ne >= 0);
		Assert(ne < MAX_NODES);
		Assert(ce >= 0);
		Assert(ce < MAX_NODES);

		/*
		 * put (wiki) v on the border of X and P, we'll move the border to
		 * relocate the vertex
		 */
		sel = oldSet[s];
		oldSet[s] = oldSet[ne];
		oldSet[ne] = sel;

		newne = 0;
		/* form X for recursive call -- leave only v's neighbours */
		for (i = 0; i < ne; i++) {
			if (BIT_CHECK(graph[sel], oldSet[i])) {
				newSet[newne++] = oldSet[i];
			}
		}

		newce = newne;
		/*
		 * similarly, form P for recursive call -- leave only v's neighbours
		 *
		 * + 1 skips v itself, which is moved to R (again the crutch
		 * introduced by self loops)
		 */
		for (i = ne + 1; i < ce; i++) {
			if (BIT_CHECK(graph[sel], oldSet[i])) {
				newSet[newce++] = oldSet[i];
			}
		}
		/* push v to R */
		_list_append(cur, sel);
		if (newce == 0) { /* both P and X are empty => max clique */
			/* keep the bigger clique; on ties, the smaller nodemask wins
			 * (this is what makes the result deterministic, see above) */
			if (result->size < cur->size ||
				(result->size == cur->size &&
				 _list_to_nodemask(result) > _list_to_nodemask(cur))) {
				_list_copy(result, cur);
			}
		} else if (newne < newce) { /* P is not empty, so recurse */
			/* prune: skip if even taking all of P can't beat current best */
			if (cur->size + newce - newne > result->size) {
				extend(cur, result, graph, newSet, newne, newce);
			}
		}
		/* remove v back from R for the next iteration */
		cur->size -= 1;
		/* move v from P to X */
		ne += 1;
		/* and find in P next non-neighbour of pivot */
		if (k > 1)
		{

			for (s = ne; BIT_CHECK(graph[fixp], oldSet[s]); s++)
			{
				Assert(s < MAX_NODES);
			}
		}
	}
}
207 |
208 | /*
209 | * Deterministically (c.f. extend) calculates biggest max clique of the graph.
210 | * The matrix must be symmetric (undirected graph) and must have 1 on the
211 | * diagonal (self loops).
212 | *
213 | * Note that this API renders impossible to distinguish absent node from node
214 | * without any edges -- absent nodes with ids <= n_nodes must still have 1
215 | * on the diagonal. This is fine as we are not interested much in cliques
216 | * of size 1, they never form majority; well, not as far as we don't support
217 | * cluster of size 1.
218 | */
/*
 * Entry point (contract described in the comment above): graph is an array of
 * n_nodes adjacency bitmasks; returns the clique as a nodemask and stores its
 * cardinality into *clique_size.
 */
nodemask_t
MtmFindMaxClique(nodemask_t* graph, int n_nodes, int* clique_size)
{
	NodeList tmp;		/* R: clique currently being grown */
	NodeList result;	/* best clique found so far */
	int all[MAX_NODES];
	int i;
	int j;

	tmp.size = 0;
	result.size = 0;
	/* initially P is the whole vertex set and X is empty */
	for (i = 0; i < MAX_NODES; i++)
		all[i] = i;

	/* check that matrix is symmetric */
	for (i = 0; i < n_nodes; i++)
		for (j = 0; j < n_nodes; j++)
			Assert(BIT_CHECK(graph[i], j) == BIT_CHECK(graph[j], i));

	/* algorithm requires diagonal elements to be set */
	for (i = 0; i < n_nodes; i++)
		Assert(BIT_CHECK(graph[i], i));

	extend(&tmp, &result, graph, all, 0, n_nodes);

	*clique_size = result.size;
	return _list_to_nodemask(&result);
}
247 |
248 | #ifdef TEST
#include <stdio.h>
250 |
251 | /*
252 | * To run some randomized tests, compile with -DTEST to ./a.out, e.g.
253 | * gcc -ggdb3 -O0 -DTEST bkb.c
254 | * , install sage and run ./test_bkb.sage.py
255 | */
256 |
257 | int main()
258 | {
259 | nodemask_t matrix[64] = {0};
260 | nodemask_t clique;
261 | int clique_size;
262 | int n_nodes;
263 |
264 | n_nodes = 4;
265 | matrix[0] = 15; /* 1111 */
266 | matrix[1] = 15; /* 1111 */
267 | matrix[2] = 7; /* 0111 */
268 | matrix[3] = 11; /* 1011 */
269 |
270 | scanf("%d", &n_nodes);
271 | for (int i = 0; i < n_nodes; i++)
272 | {
273 | nodemask_t row;
274 | scanf("%ld", &row);
275 | matrix[i] = row;
276 | }
277 |
278 | clique = MtmFindMaxClique(matrix, n_nodes, &clique_size);
279 | printf("%ld %d\n", clique, clique_size);
280 | return 0;
281 | }
282 | #endif
283 |
--------------------------------------------------------------------------------
/src/bytebuf.c:
--------------------------------------------------------------------------------
1 | /*
2 | * bytebuf.c
3 | *
4 | * Copyright (c) 2016-2021, Postgres Professional
5 | *
6 | */
7 | #include "postgres.h"
8 |
9 | #include "bytebuf.h"
10 |
11 | #define INIT_BUF_SIZE 1024
12 |
13 | void
14 | ByteBufferAlloc(ByteBuffer *buf)
15 | {
16 | buf->size = INIT_BUF_SIZE;
17 | buf->data = palloc(buf->size);
18 | buf->used = 0;
19 | }
20 |
21 | void
22 | ByteBufferAppend(ByteBuffer *buf, void *data, int len)
23 | {
24 | if (buf->used + len > buf->size)
25 | {
26 | buf->size = buf->used + len > buf->size * 2 ? buf->used + len : buf->size * 2;
27 | buf->data = (char *) repalloc(buf->data, buf->size);
28 | }
29 | memcpy(&buf->data[buf->used], data, len);
30 | buf->used += len;
31 | }
32 |
/* Append a single int (host byte order) to the buffer. */
void
ByteBufferAppendInt32(ByteBuffer *buf, int data)
{
	ByteBufferAppend(buf, &data, sizeof data);
}
38 |
/* Release the underlying buffer; buf itself is not freed. */
void
ByteBufferFree(ByteBuffer *buf)
{
	pfree(buf->data);
}
44 |
/* Logically empty the buffer; the allocation is kept for reuse. */
void
ByteBufferReset(ByteBuffer *buf)
{
	buf->used = 0;
}
50 |
--------------------------------------------------------------------------------
/src/ddd.c:
--------------------------------------------------------------------------------
1 | /*----------------------------------------------------------------------------
2 | *
3 | * ddd.c
4 | *
5 | * Distributed deadlock detector.
6 | *
7 | * Copyright (c) 2017-2021, Postgres Professional
8 | *
9 | *----------------------------------------------------------------------------
10 | */
11 |
12 | #include "postgres.h"
13 | #include "access/clog.h"
14 | #include "access/twophase.h"
15 | #include "access/transam.h"
16 | #include "storage/lwlock.h"
17 | #include "storage/ipc.h"
18 | #include "storage/proc.h"
19 | #include "utils/hsearch.h"
20 | #include "utils/timeout.h"
21 | #include "miscadmin.h"
22 | #include "replication/origin.h"
23 | #include "replication/message.h"
24 | #include "utils/builtins.h"
25 | #include "storage/lmgr.h"
26 | #include "storage/procarray.h"
27 |
28 | #include "multimaster.h"
29 |
30 | #include "ddd.h"
31 | #include "bytebuf.h"
32 | #include "state.h"
33 | #include "logger.h"
34 | #include "commit.h"
35 |
36 |
37 | /*
38 | * This DDD is based on following observations:
39 | *
40 | * Situation when a transaction (say T1) in apply_worker (or receiver
41 | * itself) stucks on some lock created by a transaction in a local backend (say
42 | * T2) will definitely lead to a deadlock since T2 after being prepared and
43 | * replicated will fail to obtain lock that is already held by T1.
44 | * Same reasoning may be applied to the situation when apply_worker (or
45 | * receiver) is waiting for an apply_worker (or receiver) belonging to other
46 | * origin -- no need to wait for a distributed deadlock detection and we may
47 | * just instantly abort.
48 | * Only case for distributed deadlock that is left is when apply_worker
49 | * (or receiver) is waiting for another apply_worker from same origin. However,
50 | * such situation isn't possible since one origin node can not have two
51 | * conflicting prepared transaction simultaneously.
52 | *
53 | * So we may construct distributed deadlock avoiding mechanism by disallowing
54 | * such edges. Now we may ask inverse question: what amount of wait graphs
55 | * with such edges are actually do not represent distributed deadlock? That may
56 | * happen in cases when holding transaction is purely local since it holding
57 | * locks only in SHARED mode. Only lock levels that are conflicting with this
58 | * modes are EXCLUSIVE and ACCESS EXCLUSIVE. In all other cases proposed
59 | * avoiding scheme should not yield false positives.
60 | *
61 | * To cope with false positives in EXCLUSIVE and ACCESS EXCLUSIVE modes we
62 | * may throw exception not in WaitOnLock() when we first saw forbidden edge
63 | * but later during first call to local deadlock detector. This way we still
64 | * have `deadlock_timeout` second to grab that lock and database user also can
65 | * increase it on per-transaction basis if there are long-living read-only
66 | * transactions.
67 | *
68 | * As a further optimization it is possible to check whether our lock is
69 | * EXCLUSIVE or higher so not to delay rollback till `deadlock_timeout` event.
70 | */
/*
 * Decide whether the current lock wait should be reported as a (possible)
 * distributed deadlock; see the rationale in the comment above.  Returns
 * true iff we are replicating in normal (non-recovery) mode, in which case
 * the wait details are logged before the caller errors out.
 */
bool
MtmDetectGlobalDeadLock(PGPROC *proc)
{
	StringInfoData locktagbuf;
	LOCK *lock = proc->waitLock;
	bool is_detected = false;

	/* Only ever invoked for the current backend's own lock wait. */
	Assert(proc == MyProc);

	/*
	 * These locks never participate in deadlocks, ignore them. Without it,
	 * spurious deadlocks might be reported due to concurrency on rel
	 * extension.
	 */
	if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
		(LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
		return false;

	/*
	 * There is no need to check for deadlocks in recovery: all
	 * conflicting transactions must be eventually committed/aborted
	 * by the resolver. It would not be fatal, but restarting due to
	 * deadlock ERRORs might significantly slow down the recovery
	 */
	is_detected = (curr_replication_mode == REPLMODE_NORMAL);

	if (is_detected)
	{
		/* Describe what we are waiting for before the caller aborts us. */
		initStringInfo(&locktagbuf);
		DescribeLockTag(&locktagbuf, &lock->tag);
		mtm_log(LOG, "apply worker %d waits for %s on %s",
				MyProcPid,
				GetLockmodeName(lock->tag.locktag_lockmethodid, proc->waitLockMode),
				locktagbuf.data);
	}

	return is_detected;

}
109 |
--------------------------------------------------------------------------------
/src/include/bgwpool.h:
--------------------------------------------------------------------------------
1 | #ifndef __BGWPOOL_H__
2 | #define __BGWPOOL_H__
3 |
4 | #include "storage/lwlock.h"
5 | #include "storage/pg_sema.h"
6 | #include "postmaster/bgworker.h"
7 | #include "storage/condition_variable.h"
8 | #include "storage/dsm.h"
9 |
10 | #include "receiver.h"
11 |
12 | #define MAX_DBNAME_LEN 30
13 | #define MAX_DBUSER_LEN 30
14 | #define MAX_NAME_LEN 30
15 | #define MULTIMASTER_BGW_RESTART_TIMEOUT BGW_NEVER_RESTART /* seconds */
16 |
/* Element of txlist_t; prev/next are indexes into the txlist_t store array. */
typedef struct
{
	int value; /* 0 - not used; 1 - transaction; 2 - sync
				* point */
	int prev; /* index of the previous list element */
	int next; /* index of the next list element */
} txlelem_t;
24 |
/*
 * List of in-progress transactions / syncpoints, kept as a doubly-linked
 * list threaded through a preallocated array (links are array indexes).
 * Manipulated via the txl_* functions declared below.
 */
typedef struct
{
	txlelem_t *store; /* element array; list links index into it */
	int tail;
	int head;
	int size; /* capacity of store — presumed; confirm in txl_store */
	int nelems; /* number of occupied elements */
	LWLock lock; /* NOTE(review): presumably guards this struct — confirm */
	ConditionVariable syncpoint_cv;
	ConditionVariable transaction_cv;
} txlist_t;
36 |
37 | /*
38 | * Shared data of BgwPool
39 | */
40 | typedef struct BgwPool
41 | {
42 | int sender_node_id;
43 | LWLock lock;
44 | ConditionVariable syncpoint_cv;
45 | int n_holders;
46 |
47 | /* Tell workers that queue contains a number of work. */
48 | ConditionVariable available_cv;
49 |
50 | /*
51 | * Queue is full. We can't insert a work data into the queue and wait
52 | * while any worker will take over a piece of data from queue and we will
53 | * do an attempt to try to add the work data into the queue.
54 | */
55 | ConditionVariable overflow_cv;
56 |
57 | /* Queue state */
58 | size_t head;
59 | size_t tail;
60 | size_t size; /* Size of queue aligned to INT word */
61 |
62 | bool producerBlocked;
63 |
64 | char poolName[MAX_NAME_LEN];
65 | Oid db_id;
66 | Oid user_id;
67 | dsm_handle dsmhandler; /* DSM descriptor. Workers use it for
68 | * attaching */
69 |
70 | size_t nWorkers; /* a number of pool workers launched */
71 | TimestampTz lastDynamicWorkerStartTime;
72 | /* Handlers of workers at the pool */
73 | BackgroundWorkerHandle **bgwhandles;
74 | pid_t receiver_pid;
75 |
76 | txlist_t txlist;
77 | } BgwPool;
78 |
79 |
80 | extern void BgwPoolStart(int sender_node_id, char *poolName, Oid db_id, Oid user_id);
81 | extern void BgwPoolExecute(BgwPool *pool, void *work, int size, MtmReceiverWorkerContext *rwctx);
82 | extern void BgwPoolShutdown(BgwPool *poolDesc);
83 | extern void BgwPoolCancel(BgwPool *pool);
84 |
85 | extern int txl_store(txlist_t *txlist, int value);
86 | extern void txl_remove(txlist_t *txlist, int txlist_pos);
87 | extern void txl_wait_syncpoint(txlist_t *txlist, int txlist_pos);
88 | extern void txl_wait_sphead(txlist_t *txlist, int txlist_pos);
89 | extern void txl_wait_txhead(txlist_t *txlist, int txlist_pos);
90 | extern void txl_wakeup_workers(txlist_t *txlist);
91 |
92 | #endif
93 |
--------------------------------------------------------------------------------
/src/include/bkb.h:
--------------------------------------------------------------------------------
1 | /*
 * Bron–Kerbosch algorithm to find a maximum clique in a graph
3 | */
4 | #ifndef __BKB_H__
5 | #define __BKB_H__
6 |
7 | #include "postgres.h"
8 |
9 | #include "multimaster.h" /* xxx move nodemask to separate file */
10 |
11 | extern uint64 MtmFindMaxClique(uint64 *matrix, int n_modes, int *clique_size);
12 |
13 | #endif
14 |
--------------------------------------------------------------------------------
/src/include/bytebuf.h:
--------------------------------------------------------------------------------
1 | #ifndef __BYTEBUF_H__
2 | #define __BYTEBUF_H__
3 |
/* Simple growable byte buffer; managed by the functions in bytebuf.c. */
typedef struct
{
	char *data; /* palloc'd storage */
	int size; /* allocated capacity in bytes */
	int used; /* bytes filled so far */
} ByteBuffer;
10 |
11 | extern void ByteBufferAlloc(ByteBuffer *buf);
12 | extern void ByteBufferAppend(ByteBuffer *buf, void *data, int len);
13 | extern void ByteBufferAppendInt32(ByteBuffer *buf, int data);
14 | extern void ByteBufferFree(ByteBuffer *buf);
15 | extern void ByteBufferReset(ByteBuffer *buf);
16 |
17 | #endif
18 |
--------------------------------------------------------------------------------
/src/include/commit.h:
--------------------------------------------------------------------------------
1 | /*----------------------------------------------------------------------------
2 | *
 * commit.h
 *	  Distributed transaction commit: 2PC gid handling and xact hooks.
5 | *
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 | * Portions Copyright (c) 1994, Regents of the University of California
8 | * Portions Copyright (c) 2021, Postgres Professional
9 | *
10 | *----------------------------------------------------------------------------
11 | */
12 |
13 | #ifndef COMMIT_H
14 | #define COMMIT_H
15 |
16 | #include "postgres.h"
17 | #include "access/xact.h"
18 |
19 | #include "messaging.h"
20 |
21 | /*
22 | * gid starting with MTM is used by internal multimaster 2PC xacts; clients
23 | * shouldn't use them for their own prepares.
24 | */
25 | #define IS_EXPLICIT_2PC_GID(gid) (strncmp((gid), "MTM-", 4) != 0)
26 |
27 | extern void MtmGenerateGid(char *gid, int node_id, TransactionId xid,
28 | uint64 gen_num);
29 | extern uint64 MtmGidParseGenNum(const char *gid);
30 | extern int MtmGidParseNodeId(const char *gid);
31 | extern TransactionId MtmGidParseXid(const char *gid);
32 |
33 | extern bool MtmTwoPhaseCommit(void);
34 | extern void MtmBeginTransaction(void);
35 | extern void MtmXactCallback(XactEvent event, void *arg);
36 |
37 | extern bool MtmExplicitPrepare(char *gid);
38 | extern void MtmExplicitFinishPrepared(bool isTopLevel, char *gid, bool isCommit);
39 |
40 | #endif
41 |
--------------------------------------------------------------------------------
/src/include/compat.h:
--------------------------------------------------------------------------------
1 | #ifndef MTMCOMPAT_H
2 | #define MTMCOMPAT_H
3 |
4 | /* EE pooler gets rid of static variable */
5 | #ifdef PGPRO_EE
6 | #define FeBeWaitSetCompat() (MyProcPort->pqcomm_waitset)
7 | #else
8 | #define FeBeWaitSetCompat() (FeBeWaitSet)
9 | #endif
10 |
11 | #ifdef PGPRO_EE /* atx */
12 | #define BeginTransactionBlockCompat() (BeginTransactionBlock(false, NIL))
13 | #define UserAbortTransactionBlockCompat(chain) (UserAbortTransactionBlock(false, (chain)))
14 | #else
15 | #define BeginTransactionBlockCompat() (BeginTransactionBlock())
16 | #define UserAbortTransactionBlockCompat(chain) (UserAbortTransactionBlock(chain))
17 | #endif
18 |
19 | /* atx renames this for some reason */
20 | #ifdef PGPRO_EE
21 | #define on_commits_compat() (pg_on_commit_actions)
22 | #else
23 | #define on_commits_compat() (on_commits)
24 | #endif
25 |
26 | #endif /* MTMCOMPAT_H */
27 |
--------------------------------------------------------------------------------
/src/include/ddd.h:
--------------------------------------------------------------------------------
1 | #ifndef __DDD_H__
2 | #define __DDD_H__
3 |
4 | extern bool MtmDetectGlobalDeadLock(PGPROC *proc);
5 |
6 | #endif
7 |
--------------------------------------------------------------------------------
/src/include/ddl.h:
--------------------------------------------------------------------------------
1 | /*----------------------------------------------------------------------------
2 | *
3 | * ddl.h
4 | * Statement based replication of DDL commands.
5 | *
6 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 | * Portions Copyright (c) 1994, Regents of the University of California
8 | * Portions Copyright (c) 2021, Postgres Professional
9 | *
10 | *----------------------------------------------------------------------------
11 | */
12 |
13 | #ifndef DML_H
14 | #define DML_H
15 |
16 | #include "utils/relcache.h"
17 |
18 | /* GUCs */
19 | extern bool MtmMonotonicSequences;
20 | extern char *MtmRemoteFunctionsList;
21 | extern bool MtmRemoteFunctionsUpdating;
22 | extern bool MtmVolksWagenMode;
23 | extern bool MtmIgnoreTablesWithoutPk;
24 |
25 | typedef enum
26 | {
27 | MTM_DDL_IN_PROGRESS_NOTHING,
28 | MTM_DDL_IN_PROGRESS_TX,
29 | MTM_DDL_IN_PROGRESS_NONTX,
30 | } MtmDDLInProgress;
31 |
32 | extern MtmDDLInProgress DDLApplyInProgress;
33 |
34 | extern void MtmDDLReplicationInit(void);
35 | extern void MtmDDLReplicationShmemStartup(void);
36 | extern void temp_schema_reset_all(int my_node_id);
37 | extern bool MtmIsRelationLocal(Relation rel);
38 | extern void MtmDDLResetStatement(void);
39 | extern void MtmApplyDDLMessage(const char *messageBody, bool transactional);
40 | extern void MtmDDLResetApplyState(void);
41 | extern void MtmSetRemoteFunction(char const *list, void *extra);
42 | extern void MtmToggleDML(void);
43 | extern void MtmMakeTableLocal(char const *schema, char const *name, bool locked);
44 | extern void multimaster_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private);
45 |
46 | #endif
47 |
--------------------------------------------------------------------------------
/src/include/dmq.h:
--------------------------------------------------------------------------------
1 | #ifndef DMQ_H
2 | #define DMQ_H
3 |
4 | #include "libpq-fe.h"
5 | #include "lib/stringinfo.h"
6 |
7 | typedef int8 DmqDestinationId;
8 |
9 | #define DMQ_NAME_MAXLEN 32
10 | /* mm currently uses xact gid as stream name, so this should be >= GIDSIZE */
11 | #define DMQ_STREAM_NAME_MAXLEN 200
12 |
13 | extern void dmq_init(int send_timeout, int connect_timeout);
14 |
15 | #define DMQ_N_MASK_POS 16 /* ought to be >= MTM_MAX_NODES */
16 | extern DmqDestinationId dmq_destination_add(char *connstr, char *sender_name,
17 | char *receiver_name, int8 recv_mask_pos,
18 | int ping_period);
19 | extern void dmq_destination_drop(char *receiver_name);
20 | extern void dmq_destination_reconnect(char *receiver_name);
21 |
22 | extern void dmq_attach_receiver(char *sender_name, int8 mask_pos);
23 | extern void dmq_detach_receiver(char *sender_name);
24 |
25 | extern void dmq_terminate_receiver(char *name);
26 |
27 | extern void dmq_reattach_receivers(void);
28 | extern void dmq_stream_subscribe(char *stream_name);
29 | extern void dmq_stream_unsubscribe(void);
30 |
31 | extern void dmq_get_sendconn_cnt(uint64 participants, int *sconn_cnt);
32 | extern bool dmq_pop(int8 *sender_mask_pos, StringInfo msg, uint64 mask);
33 | extern bool dmq_pop_nb(int8 *sender_mask_pos, StringInfo msg, uint64 mask, bool *wait);
34 | extern uint64 dmq_purge_failed_participants(uint64 participants, int *sconn_cnt);
35 |
36 | extern void dmq_push(DmqDestinationId dest_id, char *stream_name, char *msg);
37 | extern void dmq_push_buffer(DmqDestinationId dest_id, char *stream_name, const void *buffer, size_t len);
38 |
39 | typedef void (*dmq_hook_type) (char *);
40 | extern void *(*dmq_receiver_start_hook)(char *sender_name);
41 | extern dmq_hook_type dmq_receiver_stop_hook;
42 | extern void (*dmq_receiver_heartbeat_hook)(char *sender_name, StringInfo msg, void *extra);
43 | extern dmq_hook_type dmq_sender_connect_hook;
44 | extern void (*dmq_sender_heartbeat_hook)(char *receiver_name, StringInfo buf);
45 | extern dmq_hook_type dmq_sender_disconnect_hook;
46 |
47 | #endif
48 |
--------------------------------------------------------------------------------
/src/include/global_tx.h:
--------------------------------------------------------------------------------
1 | /*----------------------------------------------------------------------------
2 | *
3 | * global_tx.h
4 | * Persistent and in-memory state necessary for our E3PC-like atomic commit
 * protocol.
6 | *
7 | * Copyright (c) 2016-2021, Postgres Professional
8 | *
9 | *----------------------------------------------------------------------------
10 | */
11 | #ifndef GLOBAL_TX_H
12 | #define GLOBAL_TX_H
13 |
14 | #include "multimaster.h"
15 |
/*
 * Paxos-style ballot (term): a (number, node id) pair; ordering is
 * defined by term_cmp(), with node_id distinguishing terms produced by
 * different nodes.
 */
typedef struct
{
	int ballot; /* ballot (proposal) number */
	int node_id; /* id of the proposing node */
} GlobalTxTerm;
21 |
22 | #define InvalidGTxTerm ((GlobalTxTerm) {0, 0})
23 | /*
24 | * This term with ballot 1 and fake 0 node id is less than any term generated
25 | * by resolver; it is used by the coordinator itself.
26 | */
27 | #define InitialGTxTerm ((GlobalTxTerm) {1, 0})
28 |
29 | typedef enum
30 | {
31 | GTXInvalid = 0, /* we never gave a vote */
32 | GTXPreCommitted, /* voted for commit */
33 | GTXPreAborted, /* voted for abort */
34 | GTXCommitted, /* definitely know xact is committed */
35 | GTXAborted /* definitely know xact is aborted */
36 | } GlobalTxStatus;
37 |
38 | extern char const *const GlobalTxStatusMnem[];
39 |
40 | typedef enum
41 | {
42 | GTRS_AwaitStatus, /* 1a sent, wait for 1b */
43 | GTRS_AwaitAcks /* 2a sent, wait for 2b */
44 | } GlobalTxResolvingStage;
45 |
46 | typedef struct
47 | {
48 | GlobalTxTerm proposal; /* nextBal in terms of The Part-Time Parliament */
49 | GlobalTxTerm accepted; /* prevBal in terms of The Part-Time Parliament */
50 | GlobalTxStatus status; /*
51 | * prevDec in terms of The Part-Time Parliament
52 | * (or special never voted | commit | abort)
53 | */
54 | } GTxState;
55 |
56 | /*
57 | * Constant xact metadata which we encode into state_3pc. We could (and
58 | * previously did) carry that directly in gid, but this intervenes with
59 | * explicit 2PC usage: applier must know generation of the xact, and
60 | * scribbling over user-provided gid is ugly and/or inefficient.
61 | */
62 | typedef struct
63 | {
64 | int coordinator; /* node id who initiated the transaction */
65 | TransactionId xid; /* xid at coordinator */
66 | uint64 gen_num; /* the number of generation xact belongs to */
67 | nodemask_t configured; /* mask of configured nodes of this generation;
68 | * the idea was to use this by resolver, but it
69 | * wasn't finished. We shouldn't have any problems
70 | * with this anyway if all xacts created before
71 | * first node add-rm are resolved before the
72 | * second one is started
73 | */
74 | } XactInfo;
75 |
76 | typedef struct GlobalTx
77 | {
78 | char gid[GIDSIZE];
79 | XactInfo xinfo;
80 | XLogRecPtr coordinator_end_lsn;
81 | BackendId acquired_by;
82 | /* paxos voting state for this xact */
83 | GTxState state;
84 | /* transient thing used to rm shmem entry on error */
85 | bool prepared;
86 |
87 | /* resolver corner */
88 | bool orphaned; /* Indication for resolver that current tx needs
89 | * to be picked up. Comes from a failed backend or
90 | * a disabled node. */
91 | GTxState phase1_acks[MTM_MAX_NODES];
92 | /*
93 | * Technically phase2 ack contains just one term, which is acked. However,
94 | * we 1) collect decrees (in 'status') to perform sanity checks
95 | * 2) make it GTxState to reuse quorum() function.
96 | */
97 | GTxState phase2_acks[MTM_MAX_NODES];
98 | GlobalTxResolvingStage resolver_stage;
99 | } GlobalTx;
100 |
101 | typedef struct
102 | {
103 | LWLock *lock;
104 | HTAB *gid2gtx;
105 | } gtx_shared_data;
106 |
107 | extern gtx_shared_data *gtx_shared;
108 |
109 | void MtmGlobalTxInit(void);
110 | void MtmGlobalTxShmemStartup(void);
111 | void GlobalTxEnsureBeforeShmemExitHook(void);
112 | GlobalTx *GlobalTxAcquire(const char *gid, bool create, bool nowait_own_live,
113 | bool *busy, int coordinator);
114 | void GlobalTxRelease(GlobalTx *gtx);
115 | void GlobalTxAtExit(int code, Datum arg);
116 | void GlobalTxLoadAll(void);
117 | char *serialize_xstate(XactInfo *xinfo, GTxState *gtx_state);
118 | int term_cmp(GlobalTxTerm t1, GlobalTxTerm t2);
119 | int deserialize_xstate(const char *state, XactInfo *xinfo, GTxState *gtx_state,
120 | int elevel);
121 | GlobalTxTerm GlobalTxGetMaxProposal(void);
122 | void GlobalTxSaveInTable(const char *gid, XLogRecPtr coordinator_end_lsn,
123 | GlobalTxStatus status,
124 | GlobalTxTerm term_prop, GlobalTxTerm term_acc);
125 | void GlobalTxMarkOrphaned(int node_id);
126 |
127 | char *GlobalTxToString(GlobalTx *gtx);
128 |
129 | #endif /* GLOBAL_TX_H */
130 |
--------------------------------------------------------------------------------
/src/include/logger.h:
--------------------------------------------------------------------------------
1 | /*----------------------------------------------------------------------------
2 | *
3 | * logger.h
4 | * GUC-controlled map from application meaningful log tags to actual log
5 | * levels.
6 | *
7 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8 | * Portions Copyright (c) 1994, Regents of the University of California
9 | * Portions Copyright (c) 2021, Postgres Professional
10 | *
11 | *----------------------------------------------------------------------------
12 | */
13 |
14 | #include "postgres.h"
15 |
16 | #include "postmaster/bgworker.h"
17 | #include "utils/elog.h"
18 | #include "utils/memutils.h"
19 |
20 | /*
 * this hack allows mtm_log to be used with a direct log level (e.g. ERROR)
 * as well as a tag, see mtm_log
23 | */
24 | #define FIRST_UNUSED_ERRCODE (PANIC + 1)
25 |
26 | /* keep it in sync with mtm_log_gucs */
27 | typedef enum MtmLogTag
28 | {
29 | /* general */
30 | MtmTxTrace = FIRST_UNUSED_ERRCODE,
31 | MtmTxFinish,
32 |
33 | /* coordinator */
34 | MtmCoordinatorTrace,
35 |
36 | /* dmq */
37 | DmqStateIntermediate,
38 | DmqStateFinal,
39 | DmqTraceOutgoing,
40 | DmqTraceIncoming,
41 | DmqTraceShmMq,
42 | DmqPqTiming,
43 |
44 | /* resolver */
45 | ResolverState,
46 | ResolverTx,
47 | ResolverTasks,
48 |
49 | /* status worker */
50 | StatusRequest,
51 |
52 | /* pool */
53 | BgwPoolEvent,
54 | BgwPoolEventDebug,
55 |
56 | /* ddd */
57 | DeadlockCheck,
58 | DeadlockUpdate,
59 | DeadlockSerialize,
60 |
61 | /* ddl */
62 | DDLStmtOutgoing,
63 | DDLStmtIncoming,
64 | DDLProcessingTrace,
65 |
66 | /* walsender's proto */
67 | ProtoTraceFilter,
68 | ProtoTraceSender,
69 | ProtoTraceMessage,
70 | ProtoTraceState,
71 |
72 | /* receiver */
73 | MtmReceiverState,
74 | MtmReceiverStateDebug,
75 | MtmReceiverFilter,
76 | MtmApplyMessage,
77 | MtmApplyTrace,
78 | MtmApplyError,
79 | MtmApplyBgwFinish,
80 | MtmReceiverFeedback,
81 |
82 | /* state */
83 | MtmStateMessage,
84 | MtmStateSwitch,
85 | MtmStateDebug,
86 |
87 | /* syncpoints */
88 | SyncpointCreated,
89 | SyncpointApply,
90 |
91 | /* Node add/drop */
92 | NodeMgmt
93 | } MtmLogTag;
94 |
95 | typedef struct MtmLogGuc
96 | {
97 | const char *name;
98 | int default_val;
99 | int val;
100 | } MtmLogGuc;
101 |
102 | extern MtmLogGuc mtm_log_gucs[];
103 |
104 | #define MTM_TAG "[MTM]%s"
105 |
106 | /*
107 | * I tried to use get_ps_display instead of MyBgworkerEntry, but it returns
108 | * only dynamic 'activity' part which doesn't include bgw name. Apparently
109 | * there is no way to retrieve main part. Weird.
110 | */
111 | extern bool MtmBackgroundWorker; /* avoid including multimaster.h for this */
112 | extern char *walsender_name; /* same for pglogical_proto.h */
113 | static inline char *
114 | am(void)
115 | {
116 | char *res = " ";
117 | char *name = NULL;
118 |
119 | if (MtmBackgroundWorker)
120 | name = MyBgworkerEntry->bgw_name;
121 | else if (walsender_name)
122 | name = walsender_name;
123 | if (name)
124 | {
125 | /* this is for elog, so alloc in ErrorContext where fmt is evaluated */
126 | MemoryContext old_ctx = MemoryContextSwitchTo(ErrorContext);
127 | res = psprintf(" [%s] ", name);
128 | MemoryContextSwitchTo(old_ctx);
129 | }
130 | return res;
131 | }
132 |
133 | #define MTM_ERRMSG(fmt,...) errmsg(MTM_TAG fmt, am(), ## __VA_ARGS__)
134 |
135 | /*
136 | * tag can either one of MtmLogTag values (in which case corresponding GUC
137 | * defines the actual log level) or direct level like ERROR
138 | */
139 | #define mtm_log(tag, fmt, ...) ereport( \
140 | ((tag) >= FIRST_UNUSED_ERRCODE ? \
141 | mtm_log_gucs[tag - FIRST_UNUSED_ERRCODE].val : (tag)), \
142 | (errmsg(MTM_TAG fmt, \
143 | am(), ## __VA_ARGS__), \
144 | errhidestmt(true), errhidecontext(true)))
145 |
--------------------------------------------------------------------------------
/src/include/messaging.h:
--------------------------------------------------------------------------------
1 |
2 | /*****************************************************************************
3 | *
4 | * Messaging
5 | *
6 | *****************************************************************************/
7 | #ifndef MESSAGING_H
8 | #define MESSAGING_H
9 |
10 | #include "global_tx.h"
11 | #include "state.h"
12 |
13 | /*
 * All messages are stamped with an MtmMessageTag that must come before the
 * rest of the message; on receipt it determines the type to cast to.
16 | */
17 | typedef enum
18 | {
19 | T_MtmPrepareResponse = 0,
20 | T_Mtm2AResponse,
21 | T_MtmTxRequest,
22 | T_MtmTxStatusResponse,
23 | T_MtmHeartbeat,
24 | T_MtmGenVoteRequest,
25 | T_MtmGenVoteResponse
26 | } MtmMessageTag;
27 |
28 | typedef struct MtmMessage
29 | {
30 | MtmMessageTag tag;
31 | } MtmMessage;
32 |
33 | #define messageTag(msgptr) (((const MtmMessage *)(msgptr))->tag)
34 |
35 | /* Response to PREPARE by apply worker */
36 | typedef struct
37 | {
38 | MtmMessageTag tag;
39 | int node_id;
40 | /* for PREPARE we care only about, well, prepare success */
41 | bool prepared;
42 | int32 errcode;
43 | const char *errmsg;
44 | TransactionId xid; /* identifies the message */
45 | } MtmPrepareResponse;
46 |
47 | /*
48 | * Response to 2A msg by apply worker or by replier (during resolving).
49 | * This could be named just 2B, ha.
50 | * It is also abused for COMMIT PREPARED ack (with .status = GTXCommitted).
51 | */
52 | typedef struct
53 | {
54 | MtmMessageTag tag;
55 | int node_id;
56 | /*
57 | * Our prevVote in terms of the Part-Time Parliament paper. Actually there
58 | * is no need to carry the decree (status) itself, ballot (term) is
59 | * enough, but it is kept for convenience.
60 | */
61 | GlobalTxStatus status;
62 | GlobalTxTerm accepted_term;
63 | int32 errcode;
64 | const char *errmsg;
65 | const char *gid; /* identifies the message */
66 | } Mtm2AResponse;
67 |
68 | /*
69 | * Response on MtmLastTermRequest request, holds last proposal value.
70 | */
71 | typedef struct
72 | {
73 | MtmMessageTag tag;
74 | GlobalTxTerm term;
75 | } MtmLastTermResponse;
76 |
77 | /*
78 | * Request to change transaction state. This messages are duplicate of
79 | * corresponding WAL records, but we need them during transaction resolution
80 | * upon recovery as WAL receiver may be blocked by a transaction that we
81 | * are actually resolving.
82 | *
83 | * Sent from mtm-resolver to mtm-status worker.
84 | */
85 | typedef enum
86 | {
87 | MTReq_Abort = 0,
88 | MTReq_Commit,
89 | MTReq_Precommit, /* 2a with value commit */
90 | MTReq_Preabort, /* 2a with value abort */
91 | MTReq_Status /* 1a */
92 | } MtmTxRequestValue;
93 |
94 | typedef struct
95 | {
96 | MtmMessageTag tag;
97 | MtmTxRequestValue type;
98 | GlobalTxTerm term;
99 | const char *gid;
100 | int coordinator;
101 | uint64 gen_num;
102 | XLogRecPtr coordinator_end_lsn; /* matters for 1a */
103 | } MtmTxRequest;
104 |
105 | extern char const * const MtmTxRequestValueMnem[];
106 |
107 | /*
108 | * Status response, phase 1b of paxos on a given transaction result.
109 | * Sent from mtm-status to mtm-resolver worker.
110 | */
111 | typedef struct
112 | {
113 | MtmMessageTag tag;
114 | int node_id;
115 | GTxState state;
116 | const char *gid;
117 | } MtmTxStatusResponse;
118 |
119 | /*
120 | * Data sent in dmq heartbeats.
121 | */
122 | typedef struct
123 | {
124 | MtmMessageTag tag;
125 | MtmGeneration current_gen;
126 | uint64 donors; /* xxx nodemask_t */
127 | uint64 last_online_in;
128 | uint64 connected_mask; /* xxx nodemask_t */
129 | } MtmHeartbeat;
130 |
131 | /*
132 | * Request to vote for new generation.
133 | */
134 | typedef struct
135 | {
136 | MtmMessageTag tag;
137 | MtmGeneration gen;
138 | } MtmGenVoteRequest;
139 |
140 | /*
141 | * Reply to new generation vote request.
142 | */
143 | typedef struct
144 | {
145 | MtmMessageTag tag;
146 | uint64 gen_num; /* identifies the message */
147 | uint8 vote_ok;
148 | /* last_online_in of replier on the moment of voting, determines donors */
149 | uint64 last_online_in;
150 | /*
151 | * if vote_ok is false this might be a valid gen number showing that
152 | * replier couldn't vote because its last_vote is higher.
153 | */
154 | uint64 last_vote_num;
155 | /*
156 | * curr gen donors of the responder and its donors. Sometimes we wish to
157 | * send it along with refusal to vote, see HandleGenVoteRequest.
158 | */
159 | MtmGeneration curr_gen;
160 | uint64_t curr_gen_donors;
161 | } MtmGenVoteResponse;
162 |
163 |
164 | StringInfo MtmMessagePack(MtmMessage *anymsg);
165 | MtmMessage *MtmMessageUnpack(StringInfo s);
166 | char *MtmMesageToString(MtmMessage *anymsg);
167 |
168 | #endif /* MESSAGING_H */
169 |
--------------------------------------------------------------------------------
/src/include/mtm_utils.h:
--------------------------------------------------------------------------------
1 | /*-------------------------------------------------------------------------
2 | *
3 | * mtm_utils.h
4 | * Utility functions:
5 | * - disable global timeouts settings;
6 | * - libpq connect function wrappers.
7 | *
8 | *
9 | * Copyright (c) 2022, Postgres Professional
10 | *
11 | *-------------------------------------------------------------------------
12 | */
13 | #ifndef MTM_UTILS_H
14 | #define MTM_UTILS_H
15 |
16 | #include "libpq/pqformat.h"
17 | #include "libpq-fe.h"
18 |
19 | extern void MtmDisableTimeouts(void);
20 |
21 | extern PostgresPollingStatusType MtmPQconnectPoll(PGconn *conn);
22 | extern PGconn* MtmPQconnectdb(const char *conninfo);
23 |
24 | #endif
25 |
--------------------------------------------------------------------------------
/src/include/pglogical_config.h:
--------------------------------------------------------------------------------
1 | #ifndef PG_LOGICAL_CONFIG_H
2 | #define PG_LOGICAL_CONFIG_H
3 |
4 | #ifndef PG_VERSION_NUM
5 | #error must be included first
6 | #endif
7 |
8 | #include "nodes/pg_list.h"
9 | #include "pglogical_output.h"
10 |
/* Report whether this server build passes float4 by value. */
inline static bool
server_float4_byval(void)
{
	bool		byval = false;

#ifdef USE_FLOAT4_BYVAL
	byval = true;
#endif
	return byval;
}
20 |
/* Report whether this server build passes float8 by value. */
inline static bool
server_float8_byval(void)
{
	bool		byval = false;

#ifdef USE_FLOAT8_BYVAL
	byval = true;
#endif
	return byval;
}
30 |
/* Report whether this server build uses integer (not float) datetimes. */
inline static bool
server_integer_datetimes(void)
{
	bool		integer_dt = false;

#ifdef USE_INTEGER_DATETIMES
	integer_dt = true;
#endif
	return integer_dt;
}
40 |
/* Report whether this server was built for a big-endian platform. */
inline static bool
server_bigendian(void)
{
	bool		bigendian = false;

#ifdef WORDS_BIGENDIAN
	bigendian = true;
#endif
	return bigendian;
}
50 |
51 | extern int process_parameters(List *options, PGLogicalOutputData *data);
52 |
53 | extern List *prepare_startup_message(PGLogicalOutputData *data);
54 |
55 | #endif
56 |
--------------------------------------------------------------------------------
/src/include/pglogical_hooks.h:
--------------------------------------------------------------------------------
1 | #ifndef PGLOGICAL_HOOKS_H
2 | #define PGLOGICAL_HOOKS_H
3 |
4 | #include "replication/reorderbuffer.h"
5 |
6 | /* public interface for hooks */
7 | #include "pglogical_output/hooks.h"
8 | #include "pglogical_output.h"
9 |
10 | extern void load_hooks(PGLogicalOutputData *data);
11 |
12 | extern void call_startup_hook(PGLogicalOutputData *data, List *plugin_params);
13 |
14 | extern void call_shutdown_hook(PGLogicalOutputData *data);
15 |
16 | extern bool call_row_filter_hook(PGLogicalOutputData *data,
17 | ReorderBufferTXN *txn, Relation rel, ReorderBufferChange *change);
18 |
19 | extern bool call_txn_filter_hook(PGLogicalOutputData *data,
20 | RepOriginId txn_origin);
21 |
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/src/include/pglogical_output.h:
--------------------------------------------------------------------------------
1 | /*-------------------------------------------------------------------------
2 | *
3 | * pglogical_output.h
4 | * pglogical output plugin
5 | *
6 | * Copyright (c) 2015, PostgreSQL Global Development Group
7 | * Portions Copyright (c) 2021, Postgres Professional
8 | *
9 | * IDENTIFICATION
10 | * pglogical_output.h
11 | *
12 | *-------------------------------------------------------------------------
13 | */
14 | #ifndef PG_LOGICAL_OUTPUT_H
15 | #define PG_LOGICAL_OUTPUT_H
16 |
17 | #include "nodes/parsenodes.h"
18 |
19 | #include "replication/logical.h"
20 | #include "replication/output_plugin.h"
21 |
22 | #include "storage/lock.h"
23 |
24 | #include "pglogical_output/hooks.h"
25 |
26 | #include "pglogical_proto.h"
27 |
28 | #include "multimaster.h"
29 |
30 | #define PG_LOGICAL_PROTO_VERSION_NUM 1
31 | #define PG_LOGICAL_PROTO_MIN_VERSION_NUM 1
32 |
33 | /*
34 | * The name of a hook function. This is used instead of the usual List*
35 | * because can serve as a hash key.
36 | *
37 | * Must be zeroed on allocation if used as a hash key since padding is
38 | * *not* ignored on compare.
39 | */
40 | typedef struct HookFuncName
41 | {
42 | /* funcname is more likely to be unique, so goes first */
43 | char function[NAMEDATALEN];
44 | char schema[NAMEDATALEN];
45 | } HookFuncName;
46 |
/*
 * Private per-decoding-session state of the multimaster output plugin.
 */
typedef struct MtmDecoderPrivate
{
	int			receiver_node_id;	/* node id of the connected receiver */
	bool		is_recovery;		/* receiver runs in recovery mode
									 * (pulls changes of all origins)? */
	MtmConfig  *cfg;				/* NOTE(review): presumably the cluster
									 * config snapshot -- confirm in callers */
} MtmDecoderPrivate;
53 |
/*
 * State of one pglogical output plugin (decoding) session: the selected
 * protocol callbacks, options derived from client parameters, the raw
 * client-supplied parameters themselves and the loaded hooks.
 */
typedef struct PGLogicalOutputData
{
	MemoryContext context;		/* memory context of this session */

	PGLogicalProtoAPI *api;		/* protocol callbacks (see pglogical_proto.h) */

	/* protocol */
	bool		allow_internal_basetypes;
	bool		allow_binary_basetypes;
	bool		forward_changesets;
	bool		forward_changeset_origins;
	int			field_datum_encoding;

	/*
	 * client info
	 *
	 * Lots of this should move to a separate shorter-lived struct used only
	 * during parameter reading, since it contains what the client asked for.
	 * Once we've processed this during startup we don't refer to it again.
	 */
	uint32		client_pg_version;
	uint32		client_max_proto_version;
	uint32		client_min_proto_version;
	const char *client_expected_encoding;
	const char *client_protocol_format;
	uint32		client_binary_basetypes_major_version;
	bool		client_want_internal_basetypes_set;
	bool		client_want_internal_basetypes;
	bool		client_want_binary_basetypes_set;
	bool		client_want_binary_basetypes;
	bool		client_binary_bigendian_set;
	bool		client_binary_bigendian;
	uint32		client_binary_sizeofdatum;
	uint32		client_binary_sizeofint;
	uint32		client_binary_sizeoflong;
	bool		client_binary_float4byval_set;
	bool		client_binary_float4byval;
	bool		client_binary_float8byval_set;
	bool		client_binary_float8byval;
	bool		client_binary_intdatetimes_set;
	bool		client_binary_intdatetimes;
	bool		client_forward_changesets_set;
	bool		client_forward_changesets;
	bool		client_no_txinfo;

	/* hooks */
	List	   *hooks_setup_funcname;	/* qualified name of the hook setup
										 * function; NIL if none requested */
	struct PGLogicalHooks hooks;		/* resolved hook pointers and their
										 * private data (see load_hooks) */
	MemoryContext hooks_mctxt;			/* context hooks are invoked in */

	/* DefElem list populated by startup hook */
	List	   *extra_startup_params;
} PGLogicalOutputData;
107 |
/*
 * Per-attribute decomposition of a tuple: value, null flag and
 * changed flag for every attribute, indexed by attribute number.
 */
typedef struct PGLogicalTupleData
{
	Datum		values[MaxTupleAttributeNumber];
	bool		nulls[MaxTupleAttributeNumber];
	bool		changed[MaxTupleAttributeNumber];
} PGLogicalTupleData;
114 |
115 | extern void MtmOutputPluginWrite(LogicalDecodingContext *ctx, bool last_write, bool flush);
116 | extern void MtmOutputPluginPrepareWrite(LogicalDecodingContext *ctx, bool last_write, bool flush);
117 |
118 | #endif /* PG_LOGICAL_OUTPUT_H */
119 |
--------------------------------------------------------------------------------
/src/include/pglogical_output/compat.h:
--------------------------------------------------------------------------------
1 | #ifndef PG_LOGICAL_COMPAT_H
2 | #define PG_LOGICAL_COMPAT_H
3 |
4 | #include "pg_config.h"
5 |
6 | /* 9.4 lacks replication origins */
7 | #if PG_VERSION_NUM >= 90500
8 | #define HAVE_REPLICATION_ORIGINS
9 | #else
10 | /* To allow the same signature on hooks in 9.4 */
11 | typedef uint16 RepOriginId;
12 | #define InvalidRepOriginId 0
13 | #endif
14 |
15 | /* 9.4 lacks PG_UINT32_MAX */
16 | #ifndef PG_UINT32_MAX
17 | #define PG_UINT32_MAX UINT32_MAX
18 | #endif
19 |
20 | #ifndef PG_INT32_MAX
21 | #define PG_INT32_MAX INT32_MAX
22 | #endif
23 |
24 | #ifndef PG_INT32_MIN
25 | #define PG_INT32_MIN INT32_MIN
26 | #endif
27 |
28 | #endif
29 |
--------------------------------------------------------------------------------
/src/include/pglogical_output/hooks.h:
--------------------------------------------------------------------------------
1 | #ifndef PGLOGICAL_OUTPUT_HOOKS_H
2 | #define PGLOGICAL_OUTPUT_HOOKS_H
3 |
4 | #include "access/xlogdefs.h"
5 | #include "nodes/pg_list.h"
6 | #include "utils/rel.h"
7 | #include "utils/palloc.h"
8 | #include "replication/reorderbuffer.h"
9 |
10 | #include "pglogical_output/compat.h"
11 |
12 | /*
13 | * This header is to be included by extensions that implement pglogical output
14 | * plugin callback hooks for transaction origin and row filtering, etc. It is
15 | * installed as "pglogical_output/hooks.h"
16 | *
17 | * See the README.md and the example in examples/hooks/ for details on hooks.
18 | */
19 |
20 |
/*
 * Arguments of the startup hook.  in_params carries the client-supplied
 * plugin parameters; the hook may return extra parameters via out_params
 * and may replace private_data (see call_startup_hook).
 */
struct PGLogicalStartupHookArgs
{
	void	   *private_data;
	List	   *in_params;
	List	   *out_params;
};

typedef void (*pglogical_startup_hook_fn) (struct PGLogicalStartupHookArgs *args);


/* Arguments of the transaction filter hook. */
struct PGLogicalTxnFilterArgs
{
	void	   *private_data;
	RepOriginId origin_id;		/* replication origin of the candidate xact */
};

/* Returns whether the xact passes the filter (true = keep); see
 * call_txn_filter_hook, which defaults to true when no hook is set. */
typedef bool (*pglogical_txn_filter_hook_fn) (struct PGLogicalTxnFilterArgs *args);


/* Arguments of the row filter hook. */
struct PGLogicalRowFilterArgs
{
	void	   *private_data;
	Relation	changed_rel;	/* relation the change applies to */
	enum ReorderBufferChangeType change_type;
	/* detailed row change event from logical decoding */
	ReorderBufferChange *change;
};

/* Returns whether the row change passes the filter (true = keep); see
 * call_row_filter_hook, which defaults to true when no hook is set. */
typedef bool (*pglogical_row_filter_hook_fn) (struct PGLogicalRowFilterArgs *args);


/* Arguments of the shutdown hook; may replace private_data. */
struct PGLogicalShutdownHookArgs
{
	void	   *private_data;
};

typedef void (*pglogical_shutdown_hook_fn) (struct PGLogicalShutdownHookArgs *args);

/*
 * This struct is passed to the pglogical_get_hooks_fn as the first argument,
 * typed 'internal', and is unwrapped with `DatumGetPointer`.
 */
struct PGLogicalHooks
{
	pglogical_startup_hook_fn startup_hook;
	pglogical_shutdown_hook_fn shutdown_hook;
	pglogical_txn_filter_hook_fn txn_filter_hook;
	pglogical_row_filter_hook_fn row_filter_hook;
	void	   *hooks_private_data;		/* opaque state owned by the hooks */
};
71 |
72 |
73 | #endif /* PGLOGICAL_OUTPUT_HOOKS_H */
74 |
--------------------------------------------------------------------------------
/src/include/pglogical_proto.h:
--------------------------------------------------------------------------------
1 | /*-------------------------------------------------------------------------
2 | *
3 | * pglogical_proto.h
4 | * pglogical protocol
5 | *
6 | * Copyright (c) 2015, PostgreSQL Global Development Group
7 | * Portions Copyright (c) 2021, Postgres Professional
8 | *
9 | * IDENTIFICATION
10 | * pglogical_proto.h
11 | *
12 | *-------------------------------------------------------------------------
13 | */
14 | #ifndef PG_LOGICAL_PROTO_H
15 | #define PG_LOGICAL_PROTO_H
16 |
17 | struct PGLogicalOutputData;
18 | struct PGLRelMetaCacheEntry;
19 |
20 | extern char *walsender_name;
21 |
22 | typedef void (*pglogical_write_rel_fn) (StringInfo out, struct PGLogicalOutputData *data,
23 | Relation rel /* , struct
24 | * PGLRelMetaCacheEntry
25 | * *cache_entry */ );
26 |
27 | typedef void (*pglogical_write_begin_fn) (StringInfo out, struct PGLogicalOutputData *data,
28 | ReorderBufferTXN *txn);
29 | typedef void (*pglogical_write_message_fn) (StringInfo out, LogicalDecodingContext *ctx,
30 | XLogRecPtr end_lsn,
31 | const char *prefix, Size sz, const char *message);
32 | typedef void (*pglogical_write_commit_fn) (StringInfo out, struct PGLogicalOutputData *data,
33 | ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
34 |
35 | typedef void (*pglogical_write_origin_fn) (StringInfo out, const char *origin,
36 | XLogRecPtr origin_lsn);
37 |
38 | typedef void (*pglogical_write_insert_fn) (StringInfo out, struct PGLogicalOutputData *data,
39 | Relation rel, HeapTuple newtuple);
40 | typedef void (*pglogical_write_update_fn) (StringInfo out, struct PGLogicalOutputData *data,
41 | Relation rel, HeapTuple oldtuple,
42 | HeapTuple newtuple);
43 | typedef void (*pglogical_write_delete_fn) (StringInfo out, struct PGLogicalOutputData *data,
44 | Relation rel, HeapTuple oldtuple);
45 |
46 | typedef void (*pglogical_write_caughtup_fn) (StringInfo out, struct PGLogicalOutputData *data,
47 | XLogRecPtr wal_end_ptr);
48 |
49 | typedef void (*write_startup_message_fn) (StringInfo out, List *msg);
50 |
51 | typedef void (*pglogical_setup_hooks_fn) (struct PGLogicalHooks *hooks);
52 |
/*
 * Table of output-protocol callbacks; an instance is obtained from
 * pglogical_init_api() for the selected PGLogicalProtoType.
 */
typedef struct PGLogicalProtoAPI
{
	pglogical_write_rel_fn write_rel;	/* relation metadata message */
	pglogical_write_begin_fn write_begin;	/* BEGIN of a decoded xact */
	pglogical_write_message_fn write_message;	/* generic logical message */
	pglogical_write_commit_fn write_commit;	/* COMMIT of a decoded xact */
	pglogical_write_origin_fn write_origin;	/* replication origin info */
	pglogical_write_insert_fn write_insert;	/* INSERT row change */
	pglogical_write_update_fn write_update;	/* UPDATE row change */
	pglogical_write_delete_fn write_delete;	/* DELETE row change */
	pglogical_write_caughtup_fn write_caughtup;	/* sender reached wal_end_ptr */
	pglogical_setup_hooks_fn setup_hooks;	/* built-in hook setup; used by
											 * load_hooks when no setup
											 * function name was given */
	write_startup_message_fn write_startup_message;	/* startup parameter list */
} PGLogicalProtoAPI;
67 |
68 |
69 | typedef enum PGLogicalProtoType
70 | {
71 | PGLogicalProtoNative,
72 | PGLogicalProtoJson
73 | } PGLogicalProtoType;
74 |
75 | extern PGLogicalProtoAPI *pglogical_init_api(PGLogicalProtoType typ);
76 |
77 |
78 | extern void pglogical_write_abort(StringInfo out,
79 | struct PGLogicalOutputData *data,
80 | ReorderBufferTXN *txn, XLogRecPtr lsn);
81 | extern void pglogical_write_prepare(StringInfo out,
82 | struct PGLogicalOutputData *data,
83 | ReorderBufferTXN *txn, XLogRecPtr lsn);
84 | extern void pglogical_write_commit_prepared(StringInfo out,
85 | struct PGLogicalOutputData *data,
86 | ReorderBufferTXN *txn, XLogRecPtr lsn);
87 | extern void pglogical_write_abort_prepared(StringInfo out,
88 | struct PGLogicalOutputData *data,
89 | ReorderBufferTXN *txn, XLogRecPtr lsn);
90 |
91 | #endif /* PG_LOGICAL_PROTO_H */
92 |
--------------------------------------------------------------------------------
/src/include/pglogical_relid_map.h:
--------------------------------------------------------------------------------
#ifndef PGLOGICAL_RELID_MAP
#define PGLOGICAL_RELID_MAP

/* initial capacity of the remote->local relid hash */
#define PGL_INIT_RELID_MAP_SIZE 256

/* Hash entry mapping a relation Oid on the remote node to the local Oid. */
typedef struct PGLRelidMapEntry
{
	Oid			remote_relid;
	Oid			local_relid;
} PGLRelidMapEntry;

/* Look up the local relid cached for a remote relid. */
extern Oid	pglogical_relid_map_get(Oid relid);
/* Store a remote->local mapping; NOTE(review): return value presumably
 * reports whether a new entry was created -- confirm in
 * pglogical_relid_map.c. */
extern bool pglogical_relid_map_put(Oid remote_relid, Oid local_relid);
/* Drop all cached mappings. */
extern void pglogical_relid_map_reset(void);
#endif
16 |
--------------------------------------------------------------------------------
/src/include/receiver.h:
--------------------------------------------------------------------------------
1 | #ifndef MTM_RECEIVER_H
2 | #define MTM_RECEIVER_H
3 |
4 | #include "libpq-fe.h"
5 |
6 | typedef enum
7 | {
8 | REPLMODE_DISABLED, /* stop the receiver */
9 | REPLMODE_RECOVERY, /* pull changes of all origins */
10 | REPLMODE_NORMAL /* pull only sender changes, apply in parallel */
11 | } MtmReplicationMode;
12 |
13 | /* ugly exported for the sake of MtmDetectGlobalDeadLock */
14 | extern MtmReplicationMode curr_replication_mode;
15 |
16 | #define BGW_POOL_BY_NODE_ID(node_id) (&Mtm->pools[(node_id) - 1])
17 |
18 | extern char const *const MtmReplicationModeMnem[];
19 |
20 | /* forward decl to avoid including global_tx.h */
21 | struct GlobalTx;
22 |
23 | /* same for bgwpool.h */
24 | struct BgwPool;
25 |
/*
 * Part of MtmReceiverContext used by both main receiver and parallel workers.
 * Exposed for bgwpool/apply needs.
 */
typedef struct
{
	int			sender_node_id;		/* node whose changes we are applying */
	MtmReplicationMode mode;		/* REPLMODE_* this receiver runs in */
	/* allows to release gtx on ERROR in apply */
	struct GlobalTx *gtx;
	/*
	 * For parallel workers: position of current job in txlist.
	 */
	int			txlist_pos;
	/*
	 * Info about xact currently being executed
	 */
	TransactionId origin_xid;
	bool		reply_pending;		/* NOTE(review): apparently a reply to the
									 * sender is still owed -- confirm */
	/*
	 * true means this is xact with plain commit, so we cannot ignore
	 * apply failure
	 */
	bool		bdr_like;

	struct BgwPool *pool;			/* pool of apply workers (see
									 * BGW_POOL_BY_NODE_ID) */
} MtmReceiverWorkerContext;
53 |
54 | extern void MtmWakeupReceivers(void);
55 |
56 | extern void MtmExecutor(void *work, size_t size, MtmReceiverWorkerContext *rwctx);
57 | extern void ApplyCancelHandler(SIGNAL_ARGS);
58 | extern void MtmUpdateLsnMapping(int node_id, XLogRecPtr end_lsn);
59 |
60 | extern void MtmBeginSession(int nodeId);
61 | extern void MtmEndSession(int nodeId, bool unlock);
62 |
63 | #endif
64 |
--------------------------------------------------------------------------------
/src/include/resolver.h:
--------------------------------------------------------------------------------
1 | #ifndef RESOLVER_H
2 | #define RESOLVER_H
3 |
4 | #include "postmaster/bgworker.h"
5 |
6 | extern void ResolverMain(Datum main_arg);
7 | void ResolverWake(void);
8 |
9 | #endif /* RESOLVER_H */
10 |
--------------------------------------------------------------------------------
/src/include/spill.h:
--------------------------------------------------------------------------------
#ifndef __SPILL_H__
#define __SPILL_H__

/*
 * Spill-to-disk helpers.  Spill files are per node and identified by
 * (node_id, file_id).  NOTE(review): semantics below inferred from the
 * names and signatures only -- verify against src/spill.c.
 */

/* write size bytes from data to the spill file open on fd */
void MtmSpillToFile(int fd, char const *data, size_t size);
/* create the spill directory for the given node */
void MtmCreateSpillDirectory(int node_id);
/* create a new spill file for node_id, returning its fd; *file_id gets its id */
int MtmCreateSpillFile(int node_id, int *file_id);
/* open an existing spill file by id, returning its fd */
int MtmOpenSpillFile(int node_id, int file_id);
/* read size bytes from the spill file on fd into data */
void MtmReadSpillFile(int fd, char *data, size_t size);
/* close the spill file fd */
void MtmCloseSpillFile(int fd);

#endif
12 |
--------------------------------------------------------------------------------
/src/include/state.h:
--------------------------------------------------------------------------------
1 | #ifndef STATE_H
2 | #define STATE_H
3 |
4 | /*
5 | * Generation is a uniquely numbered subset of configured nodes allowed to
6 | * commit transactions. Each xact is stamped with generation it belongs
7 | * to. Transaction must be PREPAREd on *all* generation members before commit;
8 | * this provides recovery -> normal work transition without risk of reordering
9 | * xacts.
10 | *
11 | * The two main properties of generations are
12 | * - At each node all prepares of generation n who might ever be committed
13 | * lie strictly before all such prepares of generation n+1.
14 | * - Node which is MTM_GEN_ONLINE in generation n holds all committable
15 | * xacts of all generations < n.
16 | * See generations2.md and MtmGenerations.tla for details.
17 | *
18 | * Normal (making xacts) generation contains at least majority
19 | * members. However, we allow to elect generation with less members as a sort
20 | * of mark that its members are recovered enough to be included in the
21 | * following normal generations. It allows nodes always add *only myself* (but
22 | * remove anyone else) when campaigning for new generations; thus only node
23 | * itself decides when it is recovered enough to force others wait for it,
24 | * which simplifies reasoning who should be next gen members.
25 | *
26 | * Another reason for minority gens existence is usage of generations to
27 | * directly abort transactions when we know they can't ever be prepared; this
28 | * allows to participate in normal transaction resolution iff node has
29 | * PREPARE. For that to work, we must be sure live connectivity clique forming
30 | * majority eventually forms its generation regardless of recovery process.
31 | * c.f. handle_1a for details.
32 | */
33 | typedef struct MtmGeneration
34 | {
35 | uint64 num; /* logical clock aka term number aka ballot */
36 | uint64 members; /* xxx extract nodemask.h and use it here */
37 | /*
38 | * Generation has fixed set of configured nodes, which helps consistent
39 | * xact resolving with dynamic add/rm of nodes.
40 | */
41 | uint64 configured; /* xxx extract nodemask.h and use it here */
42 | } MtmGeneration;
43 |
44 | #define MtmInvalidGenNum 0
45 | #define EQUAL_GENS(g1, g2) \
46 | ((g1).num == (g2).num && (g1).members == (g2).members && (g1).configured == (g2).configured)
47 | /*
48 | * Referee is enabled only with 2 nodes and single member gen is ever proposed
49 | * as referee one (requiring referee vote and allowing to be online this
50 | * single node), so instead of separate flag use this check.
51 | *
52 | * First condition is important as single node cluster shouldn't access
53 | * referee; also, with > 2 nodes there is at least theoretical possibility of
54 | * electing single-node generation after two consecutive minority gen
55 | * elections.
56 | */
57 | #define IS_REFEREE_GEN(members, configured) \
58 | (popcount(configured) == 2 && popcount(members) == 1)
59 |
60 | typedef enum
61 | {
62 | MTM_GEN_DEAD, /* can't ever be online in this gen */
63 | MTM_GEN_RECOVERY, /* need to pull in recovery latest xacts before */
64 | /* starting making my own and receiving normally */
65 | MTM_GEN_ONLINE /* participating normally */
66 | } MtmStatusInGen;
67 |
68 | typedef enum
69 | {
70 | /*
71 | * We were not excluded to the best of our knowledge, but we don't see all
72 | * peers from current generation, so commits will likely fail.
73 | */
74 | MTM_ISOLATED,
75 |
76 | /*
77 | * We were excluded and definitely need recovery, but not yet sure from
78 | * whom as we don't see majority.
79 | */
80 | MTM_DISABLED,
81 |
82 | /*
83 | * We are catching up, eating changes committed without us participating.
84 | * Other nodes don't wait for us yet, so this doesn't freeze the cluster.
85 | */
86 | MTM_CATCHUP,
87 |
88 | /*
89 | * Generation with us was elected and others started waiting for us, but
90 | * we need to eat the latest changes in recovery mode to participate
91 | * normally.
92 | */
93 | MTM_RECOVERY,
94 |
95 | /*
96 | * It's Twelve O'clock and All's Well.
97 | */
98 | MTM_ONLINE,
99 | } MtmNodeStatus;
100 |
101 | extern char const *const MtmNodeStatusMnem[];
102 |
103 | extern void MtmStateInit(void);
104 | extern void MtmStateShmemStartup(void);
105 | extern void MtmStateStartup(void);
106 |
107 | /* generation management */
108 | extern uint64 MtmGetCurrentGenNum(void);
109 | extern MtmGeneration MtmGetCurrentGen(bool locked);
110 | extern void MtmConsiderGenSwitch(MtmGeneration gen, nodemask_t donors);
111 | extern bool MtmHandleParallelSafe(MtmGeneration ps_gen, nodemask_t ps_donors,
112 | bool is_recovery, XLogRecPtr end_lsn);
113 | extern MtmStatusInGen MtmGetCurrentStatusInGen(void);
114 | extern MtmStatusInGen MtmGetCurrentStatusInGenNotLocked(void);
115 | extern MtmNodeStatus MtmGetCurrentStatus(bool gen_locked, bool vote_locked);
116 |
117 | /* receiver bits */
118 | extern void MtmReportReceiverCaughtup(int node_id);
/* we should recover, but not sure from whom yet */
120 | #define RECEIVE_MODE_DISABLED (~(uint32)0)
121 | /* all receivers work normally */
122 | #define RECEIVE_MODE_NORMAL 0
123 | #define IS_RECEIVE_MODE_DONOR(rcv_mode) ((rcv_mode) != RECEIVE_MODE_NORMAL && \
124 | ((rcv_mode) != RECEIVE_MODE_DISABLED))
125 | extern MtmReplicationMode MtmGetReceiverMode(int nodeId);
126 |
127 | /* connectivity */
128 | extern nodemask_t MtmGetDmqReceiversMask(void);
129 | extern nodemask_t MtmGetConnectedMask(bool locked);
130 | extern nodemask_t MtmGetConnectedMaskWithMe(bool locked);
131 | extern void *MtmOnDmqReceiverConnect(char *node_name);
132 | extern void MtmOnDmqReceiverHeartbeat(char *node_name, StringInfo msg, void *extra);
133 | extern void MtmOnDmqReceiverDisconnect(char *node_name);
134 | extern void MtmOnDmqSenderConnect(char *node_name);
135 | extern void MtmOnDmqSenderHeartbeat(char *node_name, StringInfo buf);
136 | extern void MtmOnDmqSenderDisconnect(char *node_name);
137 |
138 | extern void AcquirePBByPreparer(bool backend);
139 | extern void AcquirePBByHolder(bool full);
140 | extern void ReleasePB(void);
141 |
142 | /* bgws */
143 | extern void CampaignerMain(Datum main_arg);
144 | extern void ReplierMain(Datum main_arg);
145 | extern void MtmMonitor(Datum arg);
146 | extern void MtmMonitorStart(Oid db_id, Oid user_id);
147 |
148 | /* not cleaned up yet */
149 | extern void MtmRefreshClusterStatus(void);
150 | extern nodemask_t MtmGetDisabledNodeMask(void);
151 | extern nodemask_t MtmGetEnabledNodeMask(bool ignore_disabled);
152 | extern void CampaignerStop(void);
153 |
154 | #endif
155 |
--------------------------------------------------------------------------------
/src/include/syncpoint.h:
--------------------------------------------------------------------------------
1 | /*-------------------------------------------------------------------------
2 | *
3 | * syncpoint.h
4 | *
5 | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
6 | * Portions Copyright (c) 1994, Regents of the University of California
7 | * Portions Copyright (c) 2021, Postgres Professional
8 | *
9 | *-------------------------------------------------------------------------
10 | */
11 | #ifndef SYNCPOINT_H
12 | #define SYNCPOINT_H
13 |
14 | #include "access/xlogdefs.h"
15 | #include "libpq-fe.h"
16 | #include "utils/hsearch.h"
17 | #include "replication/walsender.h"
18 |
/*
 * A syncpoint: pairs a position in the origin node's WAL with the
 * corresponding position in the local WAL (see SyncpointRegister).
 */
typedef struct
{
	XLogRecPtr	origin_lsn;
	XLogRecPtr	local_lsn;
} Syncpoint;

/*
 * Used as a hashkey in recovery filter.
 *
 * NB: make sure to memset this structure to zeroes before using as hashkey
 * because it contains 4-byte padding hole in the middle.
 */
typedef struct
{
	int			node_id;		/* originating node */
	XLogRecPtr	origin_lsn;		/* change position in that node's WAL */
} FilterEntry;
36 |
37 |
38 | extern int MtmSyncpointInterval;
39 |
40 |
41 | extern void MaybeLogSyncpoint(void);
42 | extern void SyncpointRegister(int origin_node_id, XLogRecPtr origin_lsn,
43 | XLogRecPtr receiver_lsn);
44 | extern Syncpoint SyncpointGetLatest(int origin_node_id);
45 | extern Syncpoint *SyncpointGetAllLatest(int sender_node_id);
46 | extern XLogRecPtr GetRecoveryHorizon(int sender_node_id);
47 | extern void UpdateRecoveryHorizons(void);
48 | extern HTAB *RecoveryFilterLoad(int filter_node_id, Syncpoint *spvector, MtmConfig *mtm_cfg);
49 |
50 | extern char* pg_lsn_out_c(XLogRecPtr lsn);
51 |
52 | #endif /* SYNCPOINT_H */
53 |
--------------------------------------------------------------------------------
/src/mtm_utils.c:
--------------------------------------------------------------------------------
1 | /*----------------------------------------------------------------------------
2 | *
3 | * mtm_utils.c
4 | * Utility functions
5 | *
6 | * Copyright (c) 2022, Postgres Professional
7 | *
8 | *----------------------------------------------------------------------------
9 | */
10 |
11 | #include "logger.h"
12 | #include "mtm_utils.h"
13 |
14 | #include "utils/timeout.h"
15 |
16 | /*
17 | * Disables timeouts on a client side:
18 | * - statement_timeout;
19 | * - lock_timeout;
20 | * - idle_in_transaction_session_timeout;
21 | * - idle_session_timeout.
22 | *
23 | * This timeouts, when set in the postgres config file, affect all process.
24 | * The multimaster needs his sessions not to be interrupted, so we disable
25 | * these timeouts.
26 | *
27 | * This function raises an error on PQExec failed.
28 | */
29 | static bool
30 | disable_client_timeouts(PGconn *conn)
31 | {
32 | PGresult *res;
33 |
34 | res = PQexec(conn, "SET statement_timeout = 0");
35 | if (PQresultStatus(res) != PGRES_COMMAND_OK)
36 | {
37 | mtm_log(WARNING, "failed to set statement_timeout: %s",
38 | pchomp(PQerrorMessage(conn)));
39 | return false;
40 | }
41 |
42 | res = PQexec(conn, "SET lock_timeout = 0");
43 | if (PQresultStatus(res) != PGRES_COMMAND_OK)
44 | {
45 | mtm_log(WARNING, "failed to set lock_timeout: %s",
46 | pchomp(PQerrorMessage(conn)));
47 | return false;
48 | }
49 |
50 | res = PQexec(conn, "SET idle_in_transaction_session_timeout = 0");
51 | if (PQresultStatus(res) != PGRES_COMMAND_OK)
52 | {
53 | mtm_log(WARNING, "failed to set idle_in_transaction_session_timeout: %s",
54 | pchomp(PQerrorMessage(conn)));
55 | return false;
56 | }
57 |
58 | res = PQexec(conn, "SET idle_session_timeout = 0");
59 | if (PQresultStatus(res) != PGRES_COMMAND_OK)
60 | {
61 | mtm_log(WARNING, "failed to set idle_session_timeout: %s",
62 | pchomp(PQerrorMessage(conn)));
63 | return false;
64 | }
65 |
66 | return true;
67 | }
68 |
69 | /*
70 | * Disable timeouts for a current process
71 | * - statement_timeout;
72 | * - lock_timeout;
73 | * - idle_in_transaction_session_timeout;
74 | * - idle_session_timeout.
75 | *
76 | * We disable these timeout for the same reason as in the disable_client_timeout()
77 | */
78 | extern void
79 | MtmDisableTimeouts(void)
80 | {
81 | if (get_timeout_active(STATEMENT_TIMEOUT))
82 | disable_timeout(STATEMENT_TIMEOUT, false);
83 | if (get_timeout_active(LOCK_TIMEOUT))
84 | disable_timeout(LOCK_TIMEOUT, false);
85 | if (get_timeout_active(IDLE_IN_TRANSACTION_SESSION_TIMEOUT))
86 | disable_timeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT, false);
87 | if (get_timeout_active(IDLE_SESSION_TIMEOUT))
88 | disable_timeout(IDLE_SESSION_TIMEOUT, false);
89 | }
90 |
91 | /*
92 | * Wrapper on PQconnectPoll
93 | *
94 | * On connect disables timeouts on a client side
95 | */
96 | PostgresPollingStatusType
97 | MtmPQconnectPoll(PGconn *conn)
98 | {
99 | PostgresPollingStatusType status;
100 |
101 | status = PQconnectPoll(conn);
102 | if (status != PGRES_POLLING_OK)
103 | return status;
104 |
105 | if (!disable_client_timeouts(conn))
106 | status = PGRES_POLLING_FAILED;
107 |
108 | return status;
109 | }
110 |
111 | /*
112 | * Wrapper on PQconnectdb
113 | *
114 | * On connect disables timeouts on a client side
115 | */
116 | PGconn *
117 | MtmPQconnectdb(const char *conninfo)
118 | {
119 | PGconn *conn;
120 |
121 | conn = PQconnectdb(conninfo);
122 | if (PQstatus(conn) != CONNECTION_OK)
123 | return conn;
124 |
125 | if (!disable_client_timeouts(conn))
126 | {
127 | PQfinish(conn);
128 | return NULL;
129 | }
130 |
131 | return conn;
132 | }
133 |
134 |
--------------------------------------------------------------------------------
/src/pglogical_hooks.c:
--------------------------------------------------------------------------------
1 | /*-------------------------------------------------------------------------
2 | *
3 | * pglogical_hooks.c
4 | *
5 | * Portions Copyright (c) 2015-2021, Postgres Professional
6 | * Portions Copyright (c) 2015-2020, PostgreSQL Global Development Group
7 | *
8 | *-------------------------------------------------------------------------
9 | */
10 | #include "postgres.h"
11 |
12 | #include "access/xact.h"
13 |
14 | #include "catalog/pg_proc.h"
15 | #include "catalog/pg_type.h"
16 |
17 | #include "replication/origin.h"
18 |
19 | #include "parser/parse_func.h"
20 |
21 | #include "utils/acl.h"
22 | #include "utils/lsyscache.h"
23 |
24 | #include "miscadmin.h"
25 |
26 | #include "pglogical_hooks.h"
27 | #include "pglogical_output.h"
28 |
29 | #include "multimaster.h"
30 | #include "logger.h"
31 |
32 | /*
33 | * Returns Oid of the hooks function specified in funcname.
34 | *
35 | * Error is thrown if function doesn't exist or doen't return correct datatype
36 | * or is volatile.
37 | */
38 | static Oid
39 | get_hooks_function_oid(List *funcname)
40 | {
41 | Oid funcid;
42 | Oid funcargtypes[1];
43 |
44 | funcargtypes[0] = INTERNALOID;
45 |
46 | /* find the the function */
47 | funcid = LookupFuncName(funcname, 1, funcargtypes, false);
48 |
49 | /* Validate that the function returns void */
50 | if (get_func_rettype(funcid) != VOIDOID)
51 | {
52 | ereport(ERROR,
53 | (errcode(ERRCODE_WRONG_OBJECT_TYPE),
54 | MTM_ERRMSG("function %s must return void",
55 | NameListToString(funcname))));
56 | }
57 |
58 | if (func_volatile(funcid) == PROVOLATILE_VOLATILE)
59 | {
60 | ereport(ERROR,
61 | (errcode(ERRCODE_WRONG_OBJECT_TYPE),
62 | MTM_ERRMSG("function %s must not be VOLATILE",
63 | NameListToString(funcname))));
64 | }
65 |
66 | if (pg_proc_aclcheck(funcid, GetUserId(), ACL_EXECUTE) != ACLCHECK_OK)
67 | {
68 | const char *username;
69 | #if PG_VERSION_NUM >= 90500
70 | username = GetUserNameFromId(GetUserId(), false);
71 | #else
72 | username = GetUserNameFromId(GetUserId());
73 | #endif
74 | ereport(ERROR,
75 | (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
76 | MTM_ERRMSG("current user %s does not have permission to call function %s",
77 | username, NameListToString(funcname))));
78 | }
79 |
80 | return funcid;
81 | }
82 |
83 | /*
84 | * If a hook setup function was specified in the startup parameters, look it up
85 | * in the catalogs, check permissions, call it, and store the resulting hook
86 | * info struct.
87 | */
88 | void
89 | load_hooks(PGLogicalOutputData *data)
90 | {
91 | Oid hooks_func;
92 | MemoryContext old_ctxt;
93 | bool txn_started = false;
94 |
95 | if (!IsTransactionState())
96 | {
97 | txn_started = true;
98 | StartTransactionCommand();
99 | }
100 |
101 | if (data->hooks_setup_funcname != NIL)
102 | {
103 | hooks_func = get_hooks_function_oid(data->hooks_setup_funcname);
104 |
105 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
106 | (void) OidFunctionCall1(hooks_func, PointerGetDatum(&data->hooks));
107 | MemoryContextSwitchTo(old_ctxt);
108 |
109 | elog(DEBUG3, "pglogical_output: Loaded hooks from function %u. Hooks are: \n"
110 | "\tstartup_hook: %p\n"
111 | "\tshutdown_hook: %p\n"
112 | "\trow_filter_hook: %p\n"
113 | "\ttxn_filter_hook: %p\n"
114 | "\thooks_private_data: %p\n",
115 | hooks_func,
116 | data->hooks.startup_hook,
117 | data->hooks.shutdown_hook,
118 | data->hooks.row_filter_hook,
119 | data->hooks.txn_filter_hook,
120 | data->hooks.hooks_private_data);
121 | }
122 | else if (data->api->setup_hooks)
123 | {
124 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
125 | (*data->api->setup_hooks) (&data->hooks);
126 | MemoryContextSwitchTo(old_ctxt);
127 | }
128 |
129 | if (txn_started)
130 | CommitTransactionCommand();
131 | }
132 |
133 | void
134 | call_startup_hook(PGLogicalOutputData *data, List *plugin_params)
135 | {
136 | struct PGLogicalStartupHookArgs args;
137 | MemoryContext old_ctxt;
138 |
139 | if (data->hooks.startup_hook != NULL)
140 | {
141 | bool tx_started = false;
142 |
143 | args.private_data = data->hooks.hooks_private_data;
144 | args.in_params = plugin_params;
145 | args.out_params = NIL;
146 |
147 | elog(DEBUG3, "calling pglogical startup hook");
148 |
149 | if (!IsTransactionState())
150 | {
151 | tx_started = true;
152 | StartTransactionCommand();
153 | }
154 |
155 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
156 | (void) (*data->hooks.startup_hook) (&args);
157 | MemoryContextSwitchTo(old_ctxt);
158 |
159 | if (tx_started)
160 | CommitTransactionCommand();
161 |
162 | data->extra_startup_params = args.out_params;
163 | /* The startup hook might change the private data seg */
164 | data->hooks.hooks_private_data = args.private_data;
165 |
166 | elog(DEBUG3, "called pglogical startup hook");
167 | }
168 | }
169 |
170 | void
171 | call_shutdown_hook(PGLogicalOutputData *data)
172 | {
173 | struct PGLogicalShutdownHookArgs args;
174 | MemoryContext old_ctxt;
175 |
176 | if (data->hooks.shutdown_hook != NULL)
177 | {
178 | args.private_data = data->hooks.hooks_private_data;
179 |
180 | elog(DEBUG3, "calling pglogical shutdown hook");
181 |
182 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
183 | (void) (*data->hooks.shutdown_hook) (&args);
184 | MemoryContextSwitchTo(old_ctxt);
185 |
186 | data->hooks.hooks_private_data = args.private_data;
187 |
188 | elog(DEBUG3, "called pglogical shutdown hook");
189 | }
190 | }
191 |
192 | /*
193 | * Decide if the individual change should be filtered out by
194 | * calling a client-provided hook.
195 | */
196 | bool
197 | call_row_filter_hook(PGLogicalOutputData *data, ReorderBufferTXN *txn,
198 | Relation rel, ReorderBufferChange *change)
199 | {
200 | struct PGLogicalRowFilterArgs hook_args;
201 | MemoryContext old_ctxt;
202 | bool ret = true;
203 |
204 | if (data->hooks.row_filter_hook != NULL)
205 | {
206 | hook_args.change_type = change->action;
207 | hook_args.private_data = data->hooks.hooks_private_data;
208 | hook_args.changed_rel = rel;
209 |
210 | elog(DEBUG3, "calling pglogical row filter hook");
211 |
212 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
213 | ret = (*data->hooks.row_filter_hook) (&hook_args);
214 | MemoryContextSwitchTo(old_ctxt);
215 |
216 | /* Filter hooks shouldn't change the private data ptr */
217 | Assert(data->hooks.hooks_private_data == hook_args.private_data);
218 |
219 | elog(DEBUG3, "called pglogical row filter hook, returned %d", (int) ret);
220 | }
221 |
222 | return ret;
223 | }
224 |
225 | bool
226 | call_txn_filter_hook(PGLogicalOutputData *data, RepOriginId txn_origin)
227 | {
228 | struct PGLogicalTxnFilterArgs hook_args;
229 | bool ret = true;
230 | MemoryContext old_ctxt;
231 |
232 | if (data->hooks.txn_filter_hook != NULL)
233 | {
234 | hook_args.private_data = data->hooks.hooks_private_data;
235 | hook_args.origin_id = txn_origin;
236 |
237 | elog(DEBUG3, "calling pglogical txn filter hook");
238 |
239 | old_ctxt = MemoryContextSwitchTo(data->hooks_mctxt);
240 | ret = (*data->hooks.txn_filter_hook) (&hook_args);
241 | MemoryContextSwitchTo(old_ctxt);
242 |
243 | /* Filter hooks shouldn't change the private data ptr */
244 | Assert(data->hooks.hooks_private_data == hook_args.private_data);
245 |
246 | elog(DEBUG3, "called pglogical txn filter hook, returned %d", (int) ret);
247 | }
248 |
249 | return ret;
250 | }
251 |
--------------------------------------------------------------------------------
/src/pglogical_relid_map.c:
--------------------------------------------------------------------------------
1 | /*-------------------------------------------------------------------------
2 | *
3 | * pglogical_relid_map.c
4 | * Logical Replication map of local Oids to to remote
5 | *
6 | * Portions Copyright (c) 2015-2021, Postgres Professional
7 | * Portions Copyright (c) 2015-2020, PostgreSQL Global Development Group
8 | *
9 | *
10 | * IDENTIFICATION
11 | * pglogical_relid_map.c
12 | *
13 | *-------------------------------------------------------------------------
14 | */
15 | #include "postgres.h"
16 | #include "utils/hsearch.h"
17 | #include "pglogical_relid_map.h"
18 |
19 | static HTAB *relid_map;
20 |
21 | static void
22 | pglogical_relid_map_init(void)
23 | {
24 | HASHCTL ctl;
25 |
26 | Assert(relid_map == NULL);
27 |
28 | MemSet(&ctl, 0, sizeof(ctl));
29 | ctl.keysize = sizeof(Oid);
30 | ctl.entrysize = sizeof(PGLRelidMapEntry);
31 | relid_map = hash_create("pglogical_relid_map", PGL_INIT_RELID_MAP_SIZE, &ctl, HASH_ELEM | HASH_BLOBS);
32 |
33 | Assert(relid_map != NULL);
34 | }
35 |
36 | Oid
37 | pglogical_relid_map_get(Oid relid)
38 | {
39 | if (relid_map != NULL)
40 | {
41 | PGLRelidMapEntry *entry = (PGLRelidMapEntry *) hash_search(relid_map, &relid, HASH_FIND, NULL);
42 |
43 | return entry ? entry->local_relid : InvalidOid;
44 | }
45 | return InvalidOid;
46 | }
47 |
48 | bool
49 | pglogical_relid_map_put(Oid remote_relid, Oid local_relid)
50 | {
51 | bool found;
52 | PGLRelidMapEntry *entry;
53 |
54 | if (relid_map == NULL)
55 | {
56 | pglogical_relid_map_init();
57 | }
58 | entry = hash_search(relid_map, &remote_relid, HASH_ENTER, &found);
59 | if (found)
60 | {
61 | entry->local_relid = local_relid;
62 | return false;
63 | }
64 | entry->local_relid = local_relid;
65 | return true;
66 | }
67 |
68 | void
69 | pglogical_relid_map_reset(void)
70 | {
71 | if (relid_map != NULL)
72 | {
73 | hash_destroy(relid_map);
74 | relid_map = NULL;
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/spill.c:
--------------------------------------------------------------------------------
1 | /*-----------------------------------------------------------------------------
2 | * spill.c
3 | *
4 | * Copyright (c) 2017-2021, Postgres Professional
5 | *
6 | *-----------------------------------------------------------------------------
7 | */
8 | #include "postgres.h"
9 |
10 | #include
11 | #include
12 | #include "storage/fd.h"
13 | #include "spill.h"
14 | #include "pgstat.h"
15 |
16 | #include "multimaster.h"
17 | #include "logger.h"
18 |
19 | void
20 | MtmSpillToFile(int fd, char const *data, size_t size)
21 | {
22 | Assert(fd >= 0);
23 | while (size != 0)
24 | {
25 | int written = write(fd, data, size);
26 |
27 | if (written <= 0)
28 | {
29 | close(fd);
30 | ereport(ERROR,
31 | (errcode_for_file_access(),
32 | MTM_ERRMSG("pglogical_recevier failed to spill transaction to file: %m")));
33 | }
34 | data += written;
35 | size -= written;
36 | }
37 | }
38 |
39 | void
40 | MtmCreateSpillDirectory(int node_id)
41 | {
42 | char path[MAXPGPATH];
43 | struct dirent *spill_de;
44 | DIR *spill_dir;
45 |
46 | mkdir("pg_mtm", S_IRWXU);
47 | sprintf(path, "pg_mtm/%d", node_id);
48 | mkdir(path, S_IRWXU);
49 |
50 | spill_dir = AllocateDir(path);
51 | if (spill_dir == NULL)
52 | {
53 | ereport(PANIC,
54 | (errcode_for_file_access(),
55 | MTM_ERRMSG("pglogical_receiver failed to create spill directory \"%s\": %m",
56 | path)));
57 | }
58 | /* cleanup old files in case of previous crash */
59 | while ((spill_de = ReadDir(spill_dir, path)) != NULL)
60 | {
61 | if (strncmp(spill_de->d_name, "txn", 3) == 0)
62 | {
63 | sprintf(path, "pg_mtm/%d/%s", node_id, spill_de->d_name);
64 |
65 | if (unlink(path) != 0)
66 | ereport(PANIC,
67 | (errcode_for_file_access(),
68 | MTM_ERRMSG("pglogical_receiver could not remove spill file \"%s\": %m",
69 | path)));
70 | }
71 | }
72 | FreeDir(spill_dir);
73 | }
74 |
75 |
/*
 * Create a fresh spill file pg_mtm/<node_id>/txn-<N>.snap and return its
 * file descriptor.  The sequence number N comes from a backend-local
 * static counter and is reported back through *file_id so the apply side
 * can find the file later via MtmOpenSpillFile().  PANICs on failure.
 */
int
MtmCreateSpillFile(int node_id, int *file_id)
{
	/* monotonically increasing per-process counter for file names */
	static int spill_file_id;
	char path[MAXPGPATH];
	int fd;

	sprintf(path, "pg_mtm/%d/txn-%d.snap",
			node_id, ++spill_file_id);
	fd = BasicOpenFile(path,
					   O_CREAT | O_TRUNC | O_WRONLY | O_APPEND | PG_BINARY);
	if (fd < 0)
	{
		ereport(PANIC,
				(errcode_for_file_access(),
				 MTM_ERRMSG("pglogical_receiver could not create spill file \"%s\": %m",
							path)));
	}
	*file_id = spill_file_id;
	return fd;
}
97 |
/*
 * Open the spill file created by MtmCreateSpillFile() for reading and
 * immediately unlink it, so the file is removed automatically when the
 * descriptor is closed.  PANICs if the file cannot be opened; a failed
 * unlink is only logged (the file would merely linger on disk).
 */
int
MtmOpenSpillFile(int node_id, int file_id)
{
	/* NOTE(review): `static` looks unnecessary here -- the buffer is only
	 * used within this call; presumably a leftover, confirm before changing. */
	static char path[MAXPGPATH];
	int fd;

	sprintf(path, "pg_mtm/%d/txn-%d.snap",
			node_id, file_id);
	fd = OpenTransientFile(path,
						   O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		ereport(PANIC,
				(errcode_for_file_access(),
				 MTM_ERRMSG("pglogical_apply could not open spill file \"%s\": %m",
							path)));
	}
	if (unlink(path) < 0)
	{							/* Should remove file on close */
		ereport(LOG,
				(errcode_for_file_access(),
				 MTM_ERRMSG("pglogical_apply failed to unlink spill file: %m")));
	}
	return fd;
}
123 |
124 | void
125 | MtmReadSpillFile(int fd, char *data, size_t size)
126 | {
127 | Assert(fd >= 0);
128 | while (size != 0)
129 | {
130 | int rc = read(fd, data, size);
131 |
132 | if (rc <= 0)
133 | {
134 | CloseTransientFile(fd);
135 | ereport(ERROR,
136 | (errcode_for_file_access(),
137 | MTM_ERRMSG("pglogical_apply failed to read spill file: %m")));
138 | }
139 | data += rc;
140 | size -= rc;
141 | }
142 | }
143 |
144 | void
145 | MtmCloseSpillFile(int fd)
146 | {
147 | if (close(fd) < 0)
148 | ereport(ERROR,
149 | (errcode_for_file_access(),
150 | MTM_ERRMSG("pglogical_recevier failed to close spill file: %m")));
151 | }
152 |
--------------------------------------------------------------------------------
/src/test_bkb.sage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sage
2 | import sys, os
3 |
4 | from sage.all import *
5 | from subprocess import Popen, PIPE, STDOUT
6 | from random import randrange, randint
7 | import unittest
8 |
def run_stdin(input):
    """Pipe *input* (bytes) into the compiled bkb solver and return its stdout decoded."""
    base_dir = os.path.dirname(os.path.realpath(__file__))
    binfile = os.path.join(base_dir, "..", "src", "a.out")

    proc = Popen(binfile, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
    captured = proc.communicate(input=input)[0]
    return captured.decode()
16 |
def run_bkb(g):
    """Encode graph *g* as adjacency bitmask rows, feed it to the solver,
    and return the parsed result (list of ints: clique bitmask, clique size)."""
    n = len(g)
    lines = [str(n)]
    for i in range(n):
        mask = 1 << i  # vertex is considered adjacent to itself
        for j in range(n):
            if g.has_edge(i, j):
                mask |= 1 << j
        lines.append(str(mask))
    payload = "\n".join(lines) + "\n"

    # print(payload)
    output = run_stdin(payload).strip()
    return [int(tok) for tok in output.split(' ')]
32 |
33 |
class TestCliqueBKB(unittest.TestCase):
    """Randomized cross-check of the C bkb max-clique solver against Sage.

    Requires a Sage runtime (graphs.RandomGNM, clique routines) and the
    compiled solver binary at ../src/a.out (see run_stdin).
    """

    # test only that max clique size is ok
    def test_random_graphs_size(self):

        for _ in range(1000):
            # up to 60 vertices -- presumably bounded by the solver's
            # one-bit-per-vertex input rows; TODO confirm
            n_nodes = randint(1, 60)
            n_edges = randrange(1 + (n_nodes * (n_nodes - 1) / 2))
            print("graph |V|={}, |E|={}>".format(n_nodes, n_edges))
            g = graphs.RandomGNM(n_nodes, n_edges)

            clique, clique_size = run_bkb(g)
            # decode the bitmask into a sorted vertex list
            clique_members = []
            for i in range(n_nodes):
                if (clique & (1 << i)) != 0:
                    clique_members.append(i)

            sage_clique_maximum = g.clique_maximum()

            print(clique, clique_members, clique_size, sage_clique_maximum, len(sage_clique_maximum))
            self.assertEqual(clique_size, len(sage_clique_maximum))

    # test that found graph is indeed the clique, much more expensive
    def test_random_graphs(self):

        for _ in range(1000):
            n_nodes = randint(1, 30)
            n_edges = randrange(1 + (n_nodes * (n_nodes - 1) / 2))
            print("graph |V|={}, |E|={}>".format(n_nodes, n_edges))
            g = graphs.RandomGNM(n_nodes, n_edges)

            clique, clique_size = run_bkb(g)
            clique_members = []
            for i in range(n_nodes):
                if (clique & (1 << i)) != 0:
                    clique_members.append(i)

            sage_maxcliques = g.cliques_maximal()
            print(sage_maxcliques[0])

            # the solver's clique must appear among Sage's maximal cliques
            found = False
            for sc in sage_maxcliques:
                if sc == clique_members:
                    found = True
            self.assertTrue(found)

            print(clique, clique_members, clique_size, sage_maxcliques[0], len(sage_maxcliques[0]))
81 |
82 |
83 |
if __name__ == '__main__':
    # Runs both randomized suites; expects the ../src/a.out solver binary.
    unittest.main()
86 |
--------------------------------------------------------------------------------
/t/000_cross._pl:
--------------------------------------------------------------------------------
1 | # based on
2 | # "Distributed snapshot isolation: global transactions pay globally,
3 | # local transactions pay locally"
4 | # by Binnig et al cross-phenomenon.
5 |
6 | use strict;
7 | use warnings;
8 |
9 | use Cluster;
10 | use TestLib;
11 | use Test::More tests => 2;
12 | use IPC::Run qw(start finish);
13 | use Cwd;
14 |
# Two-node cluster with two pgbench clients per node; table t holds one
# row per (node, client) pair: nkeys = nnodes * nclients.
my $nnodes = 2;
my $nclients = 2;
my $nkeys = $nnodes * $nclients;
my $cluster = new Cluster($nnodes);

$cluster->init();
$cluster->configure();
$cluster->start();

my ($rc, $in, $out, $err);

$cluster->await_nodes( [0,1] );

note("preparing the tables");
if ($cluster->psql(0, 'postgres', "create table t (k int primary key, v int)"))
{
	$cluster->bail_out_with_logs('failed to create t');
}

if ($cluster->psql(0, 'postgres', "insert into t (select generate_series(0, $nkeys - 1), 0)"))
{
	$cluster->bail_out_with_logs('failed to fill t');
}
38 |
# Launch an asynchronous pgbench run of tests/appender.pgb against $node,
# returning the IPC::Run handle so the caller can finish() it later.
sub appender
{
	my ($appender_id, $clients, $seconds, $node, $inref, $outref) = @_;

	my @argv = (
		'pgbench',
		'-n',
		-c => $clients,
		-j => $clients,
		-T => $seconds,
		-h => $node->host(),
		-p => $node->port(),
		-D => "appender_id=$appender_id",
		-D => "clients=$clients",
		-f => 'tests/appender.pgb',
		'postgres',
	);

	note("running[" . getcwd() . "]: " . join(' ', @argv));

	return start(\@argv, $inref, $outref);
}
61 |
# Dump a snapshot hash (key -> counter) to the TAP log, for debugging.
sub state_dump
{
	my ($state) = @_;

	note("<<<<<");
	foreach my $key (keys %{$state})
	{
		my $value = $state->{$key};
		note("$key -> $value");
	}
	note(">>>>>");
}
73 |
# Return 1 iff snapshot $lhs is dominated by snapshot $rhs: every key of
# $lhs exists in $rhs with a value that is not smaller.
sub state_leq
{
	my ($lhs, $rhs) = @_;

	foreach my $key (keys %{$lhs})
	{
		unless (exists($rhs->{$key}))
		{
			note("b has no key $key\n");
			return 0;
		}

		my $value = $lhs->{$key};
		if ($rhs->{$key} < $value)
		{
			note($rhs->{$key} . " < $value\n");
			return 0;
		}
	}

	return 1;
}
95 |
# Parse psql output of "select * from t" into a hashref { key => value },
# matching every "<digits>|<digits>" pair in the text.
sub parse_state
{
	my ($str) = @_;
	my %state;

	while ($str =~ /(\d+)\|(\d+)/g)
	{
		$state{$1} = $2;
	}

	return \%state;
}
108 |
note("starting appenders");
note("starting benches");
$in = '';
$out = '';
# one asynchronous pgbench appender per cluster node
my @appenders = ();
my $appender_id = 0;
my $seconds = 30;
foreach my $node (@{$cluster->{nodes}})
{
	push(@appenders, appender($appender_id, $nclients, $seconds, $node, \$in, \$out));
	$appender_id++;
}
121 |
my $selects = 0;
my $anomalies = 0;
my $started = time();
my $node_id = 0;
my $state_a = undef;
my $state_b = undef;
my $out_a = '';
my $out_b = '';
# Poll the nodes round-robin while the appenders run.  Every pair of
# consecutive snapshots must be comparable: one must dominate the other
# per state_leq().  If NEITHER direction holds we observed the "cross"
# anomaly this test exists to detect.
while (time() - $started < $seconds)
{
	$node_id = ($node_id + 1) % $nnodes;
	$state_a = $state_b;
	$out_a = $out_b;
	($rc, $out, $err) = $cluster->psql($node_id, 'postgres', "select * from t;");
	$selects++;
	$state_b = parse_state($out);
	$out_b = $out;
	if (defined $state_a)
	{
		# BUG FIX: the original evaluated state_leq($state_a, $state_b)
		# twice, so the anomaly could never fire; the second test must be
		# the opposite direction.
		if (!state_leq($state_a, $state_b) && !state_leq($state_b, $state_a))
		{
			note("cross anomaly detected:\n===a\n$out_a\n+++b\n$out_b\n---\n");
			$anomalies++;
		}
	}
}
148 |
note("finishing benches");
# wait for every pgbench appender to exit; a non-zero exit aborts the test
foreach my $appender (@appenders)
{
	if (!finish($appender))
	{
		$cluster->dumplogs();
		$cluster->bail_out_with_logs("pgbench exited with $?");
	}
}

is($anomalies, 0, "no cross anomalies after $selects selects");

ok($cluster->stop('fast'), "cluster stops");
1;
163 |
--------------------------------------------------------------------------------
/t/000_deadlock.pl:
--------------------------------------------------------------------------------
1 | # simple deadlock test
2 |
3 | use strict;
4 | use warnings;
5 |
6 | use Cluster;
7 | use TestLib;
8 |
# Test whether we have both DBI and DBD::pg
# (done at runtime via eval so the test file still compiles when the
# modules are absent; the whole test is then skipped).
my $dbdpg_rc = eval
{
	require DBI;
	require DBD::Pg;
	DBD::Pg->import(':async');
	1;
};

# And tell Test::More to skip the test entirely if not
require Test::More;
if (not $dbdpg_rc)
{
	Test::More->import(skip_all => 'DBI and DBD::Pg are not available');
}
else
{
	Test::More->import(tests => 1);
}
28 |
# Run $sql with bind values @keys and return the first column of the first
# row, or undef when no row is returned.
sub query_row
{
	my ($dbi, $sql, @keys) = @_;
	my $sth = $dbi->prepare($sql) || die;
	$sth->execute(@keys) || die;
	# BUG FIX: list-context assignment takes the first column and yields
	# undef on no row.  The previous scalar-context call was
	# driver-dependent, and its "|| undef" clobbered legitimate falsy
	# values such as 0 or ''.
	my ($ret) = $sth->fetchrow_array;
	return $ret;
}
37 |
# Execute $sql synchronously; dies on failure, returns DBI's do() result.
sub query_exec
{
	my ($dbi, $sql) = @_;

	my $result = $dbi->do($sql) || die;
	return $result;
}
44 |
# Execute $sql asynchronously (DBD::Pg PG_ASYNC); dies on failure.
sub query_exec_async
{
	my ($dbi, $sql) = @_;

	# DBD::Pg is loaded via require at runtime, so its PG_ASYNC constant
	# is not importable at compile time; use its numeric value directly.
	my $DBD_PG_PG_ASYNC = 1;

	my $result = $dbi->do($sql, {pg_async => $DBD_PG_PG_ASYNC}) || die;
	return $result;
}
54 |
my $cluster = new Cluster(2);

$cluster->init();
$cluster->start();
$cluster->create_mm('regression');

my ($rc, $out, $err);
sleep(10);

$cluster->safe_psql(0, "create table t(k int primary key, v text)");
$cluster->safe_psql(0, "insert into t values (1, 'hello'), (2, 'world')");

my @conns = map { DBI->connect('DBI:Pg:' . $cluster->connstr($_)) } 0..1;

# Build a classic cross-node deadlock: each session updates rows the other
# one has already locked, then both commit asynchronously.
query_exec($conns[0], "begin");
query_exec($conns[1], "begin");

query_exec($conns[0], "update t set v = 'asd' where k = 1");
query_exec($conns[1], "update t set v = 'bsd'");

query_exec($conns[0], "update t set v = 'bar' where k = 2");
query_exec($conns[1], "update t set v = 'foo'");

query_exec_async($conns[0], "commit");
query_exec_async($conns[1], "commit");

# Poll both connections for up to ~16s; the deadlock detector should abort
# one transaction and let both commits complete.
my $timeout = 16;
while (--$timeout > 0)
{
	my $r0 = $conns[0]->pg_ready();
	my $r1 = $conns[1]->pg_ready();
	if ($r0 && $r1) {
		last;
	}
	sleep(1);
}

if ($timeout > 0)
{
	# NOTE(review): $succeeded is computed but not asserted on; the test
	# only checks that both queries finished at all.
	my $succeeded = 0;
	$succeeded++ if $conns[0]->pg_result();
	$succeeded++ if $conns[1]->pg_result();

	pass("queries finished");
}
else
{
	$conns[0]->pg_cancel() unless $conns[0]->pg_ready();
	$conns[1]->pg_cancel() unless $conns[1]->pg_ready();

	fail("queries timed out");
}

query_row($conns[0], "select * from t where k = 1");

$cluster->stop('fast');
111 |
--------------------------------------------------------------------------------
/t/000_init._pl:
--------------------------------------------------------------------------------
1 | # test that after create_mm awaited nodes we won't get non-online state
2 | # immediately later. Catches races in MtmGetCurrentStatus logic.
3 | # It is expensive, so not run in the regular suite.
4 |
use Cluster;
use Test::More tests => 1;

my $cluster = new Cluster(3);
$cluster->init(q{
});
$cluster->start();
$cluster->create_mm('regression');

# Hammer all three nodes with trivial queries right after create_mm; any
# transient non-online state would make safe_psql die and fail the test.
foreach(0..1000) # hopefully enough to catch all related races
{
	foreach (0..2)
	{
		$cluster->safe_psql($_, "select 42");
	}
}

is(0, 0, "dummy"); # Test::More doesn't like 0 tests, ha
23 |
--------------------------------------------------------------------------------
/t/001_regress.pl:
--------------------------------------------------------------------------------
1 | # run core regression tests on multimaster
2 |
3 | # tests known to fail currently and failure reasons:
4 | # - create_index (CREATE INDEX CONCURRENTLY not supported due to deadlock
5 | # issues, see ddl.c)
6 | # - same for index_including, index_including_gist
7 | # - create_table (due to CTAS prepared statement)
8 | # - sanity check (due to pg_publication/subscription masking and other mtm tables)
9 | # - transactions (lack of COMMIT AND CHAIN support)
10 | # - rowsecurity
11 | # - atx, atx5
12 | # - rules (_pg_prepared_xacts and similar)
13 | # - publication, subscription (_pg_publication/subscription masking)
14 | # - prepare (CTAS prepared statement)
15 | # - indexing (again CIC).
16 | #
17 | # original test output/diffs are at $ENV{TESTDIR}/tmp_check/regress_outdir;
18 | # (in normal build TESTDIR is just mmts/; in vpath it is 'external' mmts/)
19 | # then diff is censored and copied to $ENV{TESTDIR}/results.
20 |
21 | use Cluster;
22 | use File::Basename;
23 | use IPC::Run 'run';
24 | use Test::More;
25 |
26 | # With PGXS the sources are unavailable, so we can't obtain schedules and core
27 | # test themselves.
28 | if ($ENV{'PGXS'})
29 | {
30 | # Test::More doesn't like no tests at all
31 | is(0, 0, "dummy");
32 | done_testing();
33 | exit(0);
34 | }
35 |
36 | # determenistic ports for expected files
37 | $PostgresNode::last_port_assigned = 55431;
38 |
39 | my $cluster = new Cluster(3);
40 | $cluster->init(q{
41 | multimaster.volkswagen_mode = on
42 | # allow to spoof pg_prepared_xacts view
43 | allow_system_table_mods = on
44 | });
45 | $cluster->start();
46 | $cluster->create_mm('regression');
47 |
48 | ###############################################################################
49 | # postgres regression tests
50 | ###############################################################################
51 |
52 | # configure db output format like pg_regress
53 | # In particular, pg_regress explicitly sets PGTZ=PST8PDT, and it turns out some
54 | # tests (including DDL! (see volatile_partbound_test)) depend on current_time,
55 | # so mtm receiver ought to use the same timezone to pass them.
56 | $cluster->{nodes}->[0]->safe_psql('regression', q{
57 | ALTER DATABASE "regression" SET lc_messages TO 'C';
58 | ALTER DATABASE "regression" SET lc_monetary TO 'C';
59 | ALTER DATABASE "regression" SET lc_numeric TO 'C';
60 | ALTER DATABASE "regression" SET lc_time TO 'C';
61 | ALTER DATABASE "regression" SET timezone_abbreviations TO 'Default';
62 | ALTER DATABASE "regression" SET TimeZone TO 'PST8PDT';
63 | });
64 |
65 | # do not show transaction from concurrent backends in pg_prepared_xacts
66 | $cluster->{nodes}->[0]->safe_psql('regression', q{
67 | ALTER VIEW pg_prepared_xacts RENAME TO _pg_prepared_xacts;
68 | CREATE VIEW pg_prepared_xacts AS
69 | select * from _pg_prepared_xacts where gid not like 'MTM-%'
70 | ORDER BY transaction::text::bigint;
71 | ALTER TABLE pg_publication RENAME TO _pg_publication;
72 | CREATE VIEW pg_catalog.pg_publication AS SELECT * FROM pg_catalog._pg_publication WHERE pubname<>'multimaster';
73 | ALTER TABLE pg_subscription RENAME TO _pg_subscription;
74 | CREATE VIEW pg_catalog.pg_subscription AS SELECT * FROM pg_catalog._pg_subscription WHERE subname NOT LIKE 'mtm_sub_%';
75 | });
76 |
77 | $cluster->{nodes}->[0]->safe_psql('regression', q{
78 | ALTER SYSTEM SET allow_system_table_mods = 'off';
79 | });
80 | foreach my $node (@{$cluster->{nodes}}){
81 | $node->restart;
82 | }
83 | $cluster->await_nodes( [0,1,2] );
84 |
85 | # load schedule without tablespace test which is not expected
86 | # to work with several postgreses on a single node
87 | my $schedule = TestLib::slurp_file('../../src/test/regress/parallel_schedule');
88 | $schedule =~ s/test: tablespace/#test: tablespace/g;
89 | $schedule =~ s/test: cfs/#test: cfs/g;
90 | $schedule =~ s/test: largeobject//; # serial schedule
91 | $schedule =~ s/largeobject//; # parallel schedule
92 | $schedule =~ s/atx0//; # parallel schedule
93 | unlink('parallel_schedule');
94 | TestLib::append_to_file('parallel_schedule', $schedule);
95 |
96 | my $regress_shlib = $ENV{REGRESS_SHLIB};
97 | my $regress_libdir = dirname($regress_shlib);
98 | my $regress_outdir = "$ENV{TESTDIR}/tmp_check/regress_outdir";
99 | mkdir($regress_outdir);
100 | # REMOVEME: not needed in 14+, pg_regress fixed in upstream
101 | mkdir("${regress_outdir}/sql");
102 | mkdir("${regress_outdir}/expected");
103 | TestLib::system_log($ENV{'PG_REGRESS'},
104 | '--host=' . $cluster->{nodes}->[0]->host, '--port=' . $cluster->{nodes}->[0]->port,
105 | '--use-existing', '--bindir=',
106 | '--schedule=parallel_schedule',
107 | "--dlpath=${regress_libdir}",
108 | '--inputdir=../../src/test/regress',
109 | "--outputdir=${regress_outdir}");
110 | unlink('parallel_schedule');
111 |
112 | # rename s/diffs/diff as some upper level testing systems are searching for all
113 | # *.diffs files.
114 | rename "${regress_outdir}/regression.diffs", "${regress_outdir}/regression.diff"
115 | or die "cannot rename file: $!";
116 |
117 | # strip absolute paths and dates out of resulted regression.diffs
118 | my $res_diff = TestLib::slurp_file("${regress_outdir}/regression.diff");
119 | # In <= 11 default diff format was context, since 12 unified; handing lines
120 | # starting with ---|+++|*** covers both.
121 | # To make someone's life easier, we prepend .. to make relative paths correct.
122 | # (it allows goto file comparison in editors)
123 | # This of course unfortunately doesn't work for VPATH.
124 | $res_diff =~ s/(--- |\+\+\+ |\*\*\* ).+contrib\/mmts(.+\.out)\t.+\n/$1..$2\tCENSORED\n/g;
125 | # Since 12 header like
126 | # diff -U3 /blabla/contrib/mmts/../../src/test/regress/expected/opr_sanity.out /blabla/mmts/../../src/test/regress/results/opr_sanity.out
127 | # was added to each file diff
128 | $res_diff =~ s/(diff ).+contrib\/mmts(.+\.out).+contrib\/mmts(.+\.out\n)/$1..$2 ..$3/g;
129 | $res_diff =~ s/(lo_import[ \(]')\/[^']+\//$1\/CENSORED\//g;
130 | #SELECT lo_export(loid, '/home/alex/projects/ppro/postgrespro/contrib/mmts/../../src/test/regress/results/lotest.txt') FROM lotest_stash_values;
131 | $res_diff =~ s/(lo_export.*\'\/).+\//$1CENSORED\//g;
132 | mkdir("$ENV{TESTDIR}/results");
133 | unlink("$ENV{TESTDIR}/results/regression.diff");
134 |
# finally compare regression.diffs with our version
# Do not use diffs extension as some upper level testing systems are searching for all
# *.diffs files.
TestLib::append_to_file("$ENV{TESTDIR}/results/regression.diff", $res_diff);
# TODO: work with diffs on per-test basis
my $expected_file;
if (Cluster::is_ee())
{
	$expected_file = "expected/regression_ee.diff"
}
else
{
	$expected_file = "expected/regression_vanilla.diff"
}
# NOTE(review): the diff runs twice -- once through system_log (which logs
# the output; $diff itself is never used afterwards and is not declared
# with 'my', the file does not 'use strict') and once through IPC::Run to
# capture it into regression.diff.diff.  $res comes from the second run.
$diff = TestLib::system_log("diff -U3 ${expected_file} $ENV{TESTDIR}/results/regression.diff");
run [ "diff", "-U3", "${expected_file}", "$ENV{TESTDIR}/results/regression.diff" ], ">", "$ENV{TESTDIR}/regression.diff.diff";
my $res = $?;

is($res, 0, "postgres regress");
156 |
--------------------------------------------------------------------------------
/t/002_regressmm.pl:
--------------------------------------------------------------------------------
1 | # run sql/multimaster.sql tests
2 | use Cluster;
3 | use Test::More tests => 1;
4 |
# determenistic ports for expected files
$PostgresNode::last_port_assigned = 55431;

my $cluster = new Cluster(3);
$cluster->init(q{
	multimaster.volkswagen_mode = off
});
$cluster->start();
$cluster->create_mm('regression');

###############################################################################
# multimaster regression tests
###############################################################################

my @tests = ('multimaster');
# run atx test only on ee
if (Cluster::is_ee())
{
	push @tests, 'atx';
}

# run pg_regress against node 0 of the already-created cluster
my $ret = TestLib::system_log($ENV{'PG_REGRESS'},
	'--host=' . $cluster->{nodes}->[0]->host, '--port=' . $cluster->{nodes}->[0]->port,
	'--use-existing', '--bindir=', @tests);
if ($ret != 0)
{
	print "### Got regression! \n", TestLib::slurp_file('regression.diffs');
}
is($ret, 0, "multimaster regress");
34 |
--------------------------------------------------------------------------------
/t/003_basic_recovery.pl:
--------------------------------------------------------------------------------
1 | # Basic recovery: some inserts, get node down, some inserts, get node up, some
2 | # inserts. There is no failures with concurrent load, so an easy variant.
3 |
4 | use strict;
5 | use warnings;
6 | use Cluster;
7 | use TestLib;
8 | use Test::More tests => 4;
9 |
my $cluster = new Cluster(3);
$cluster->init();
$cluster->start();
$cluster->create_mm();

my $ret;
my $psql_out;

###############################################################################
# Replication check
###############################################################################

$cluster->{nodes}->[0]->safe_psql('postgres', q{
	create table if not exists t(k int primary key, v int);
	insert into t values(1, 10);
	});
$psql_out = $cluster->{nodes}->[2]->safe_psql('postgres', q{
	select v from t where k=1;
	});
is($psql_out, '10', "Check replication while all nodes are up.");

###############################################################################
# Isolation regress checks
###############################################################################

# we can call pg_regress here

###############################################################################
# Work after node stop
###############################################################################

note("stopping node 2");
$cluster->{nodes}->[2]->stop;

$cluster->await_nodes_after_stop( [0,1] );

# the surviving majority (nodes 0 and 1) must keep accepting writes
$cluster->safe_psql(0, "insert into t values(2, 20);");
$cluster->safe_psql(1, "insert into t values(3, 30);");
$cluster->safe_psql(0, "insert into t values(4, 40);");
$cluster->safe_psql(1, "insert into t values(5, 50);");

$psql_out = $cluster->safe_psql(0, "select v from t where k=4;");
is($psql_out, '40', "Check replication after node failure.");

###############################################################################
# Work after node start
###############################################################################

note("starting node 2");
$cluster->{nodes}->[2]->start;

# intentionally start from 2
$cluster->await_nodes( [2,0,1] );

$cluster->safe_psql(0, "insert into t values(6, 60);");
$cluster->safe_psql(1, "insert into t values(7, 70);");
$cluster->safe_psql(0, "insert into t values(8, 80);");
$cluster->safe_psql(1, "insert into t values(9, 90);");

# new rows must reach the recovered node...
$psql_out = $cluster->safe_psql(2, "select v from t where k=8;");
is($psql_out, '80', "Check replication after failed node recovery.");

# ...as well as rows written while it was down
$psql_out = $cluster->safe_psql(2, "select v from t where k=5;");
is($psql_out, '50', "Check replication after failed node recovery.");

$cluster->stop();

1;
78 |
--------------------------------------------------------------------------------
/t/004_recovery.pl:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 |
4 | use Cluster;
5 | use TestLib;
6 | use Test::More tests => 6;
7 |
8 | my $cluster = new Cluster(3);
9 | $cluster->init();
10 | $cluster->start();
11 | $cluster->create_mm();
12 |
13 | ########################################################
14 | # Check data integrity before and after recovery of single node.
15 | # Easy variant: sequential pgbenches, recovery without concurrent load.
16 | ########################################################
17 |
18 | my $hash0; my $hash1; my $hash2; my $oldhash;
19 | my $hash_query = q{
20 | select
21 |     md5('(' || string_agg(aid::text || ', ' || abalance::text , '),(') || ')')
22 | from
23 |     (select * from pgbench_accounts order by aid) t;
24 | };
25 |
26 | $cluster->pgbench(1, ('-i', -s => '10') ); # init pgbench schema at scale 10
27 | $cluster->pgbench(0, ('-n','-N', -T => '4') );
28 | $cluster->pgbench(1, ('-n','-N', -T => '4') );
29 | $cluster->pgbench(2, ('-n','-N', -T => '4') );
30 |
31 | $cluster->{nodes}->[2]->stop('fast'); # take node 3 down; 2-of-3 majority keeps working
32 | $cluster->await_nodes_after_stop( [0,1] );
33 |
34 | $cluster->pgbench(0, ('-n','-N', -T => '4') );
35 | $cluster->pgbench(1, ('-n','-N', -T => '4') );
36 |
37 | $cluster->await_nodes( [0,1] ); # just in case we've faced random timeout before
38 | $hash0 = $cluster->safe_psql(0, $hash_query);
39 | $hash1 = $cluster->safe_psql(1, $hash_query);
40 | is($hash0, $hash1, "Check that hash is the same before recovery");
41 |
42 | $cluster->{nodes}->[2]->start;
43 | $cluster->await_nodes( [2,0,1] );
44 |
45 | $oldhash = $hash0; # remember pre-recovery state; no load ran meanwhile, so it must not change
46 | $hash0 = $cluster->safe_psql(0, $hash_query);
47 | $hash1 = $cluster->safe_psql(1, $hash_query);
48 | $hash2 = $cluster->safe_psql(2, $hash_query);
49 |
50 | note("$oldhash, $hash0, $hash1, $hash2");
51 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2) and ($oldhash eq $hash0)) , 1,
52 | "Check that hash is the same after recovery");
53 |
54 | ########################################################
55 | # Check start after all nodes were disconnected
56 | ########################################################
57 |
58 | $cluster->safe_psql(0, "create table if not exists t(k int primary key, v int);");
59 |
60 | $cluster->safe_psql(0, "insert into t values(1, 10);");
61 | $cluster->safe_psql(1, "insert into t values(2, 20);");
62 | $cluster->safe_psql(2, "insert into t values(3, 30);");
63 |
64 | my $sum0; my $sum1; my $sum2;
65 |
66 | $cluster->{nodes}->[1]->stop('fast');
67 | $cluster->{nodes}->[2]->stop('fast'); # with two nodes down the cluster has lost its majority
68 |
69 | $cluster->{nodes}->[1]->start;
70 | $cluster->{nodes}->[2]->start;
71 |
72 | $cluster->await_nodes( [1,2,0] );
73 |
74 | $sum0 = $cluster->safe_psql(0, "select sum(v) from t;");
75 | $sum1 = $cluster->safe_psql(1, "select sum(v) from t;");
76 | $sum2 = $cluster->safe_psql(2, "select sum(v) from t;");
77 | is( (($sum0 == 60) and ($sum1 == $sum0) and ($sum2 == $sum0)) , 1,
78 | "Check that nodes are working and sync");
79 |
80 | ########################################################
81 | # Check recovery during some load
82 | ########################################################
83 |
84 | $cluster->pgbench(0, ('-i', -s => '10') );
85 | $cluster->pgbench(0, ('-N', -T => '1') );
86 | $cluster->pgbench(1, ('-N', -T => '1') );
87 | $cluster->pgbench(2, ('-N', -T => '1') );
88 |
89 | # kill node while neighbour is under load
90 | my $pgb_handle = $cluster->pgbench_async(1, ('-N', -T => '20', -c => '5') );
91 | sleep(5);
92 | $cluster->{nodes}->[2]->stop('fast');
93 | $cluster->pgbench_await($pgb_handle);
94 |
95 | # start node while neighbour is under load
96 | $pgb_handle = $cluster->pgbench_async(0, ('-N', -T => '20', -c => '5') );
97 | sleep(5);
98 | $cluster->{nodes}->[2]->start;
99 | $cluster->pgbench_await($pgb_handle);
100 |
101 | # await recovery
102 | $cluster->await_nodes( [2,0,1] );
103 |
104 | # check data identity
105 | $hash0 = $cluster->safe_psql(0, $hash_query);
106 | $hash1 = $cluster->safe_psql(1, $hash_query);
107 | $hash2 = $cluster->safe_psql(2, $hash_query);
108 | note("$hash0, $hash1, $hash2");
109 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1, "Check that hash is the same");
110 |
111 | $sum0 = $cluster->safe_psql(0, "select sum(abalance) from pgbench_accounts;");
112 | $sum1 = $cluster->safe_psql(1, "select sum(abalance) from pgbench_accounts;");
113 | $sum2 = $cluster->safe_psql(2, "select sum(abalance) from pgbench_accounts;");
114 |
115 | note("Sums: $sum0, $sum1, $sum2");
116 | is($sum2, $sum0, "Check that sum_2 == sum_0");
117 | is($sum2, $sum1, "Check that sum_2 == sum_1");
118 |
119 | $sum0 = $cluster->safe_psql(0, "select count(*) from pg_prepared_xacts;");
120 | $sum1 = $cluster->safe_psql(1, "select count(*) from pg_prepared_xacts;");
121 | $sum2 = $cluster->safe_psql(2, "select count(*) from pg_prepared_xacts;");
122 |
123 | note("Number of prepared tx: $sum0, $sum1, $sum2"); # informational only, not asserted
124 |
125 | $cluster->stop;
126 |
--------------------------------------------------------------------------------
/t/005_pgbench.pl:
--------------------------------------------------------------------------------
1 | # Kinda bank test: on each node multiple clients transfer money from one acc to
2 | # another, another bunch of clients make sure sum is constant always.
3 |
4 | use strict;
5 | use warnings;
6 |
7 | use Cluster;
8 | use TestLib;
9 | use Test::More tests => 2;
10 |
11 | my $cluster = new Cluster(2);
12 | $cluster->init();
13 | $cluster->start();
14 | $cluster->create_mm();
15 |
16 | $cluster->safe_psql(0, q{
17 | create table t (k int primary key, v int);
18 | insert into t (select generate_series(0, 999), 0);
19 | create table reader_log (v int);
20 | });
21 |
22 | my $clients = 5;
23 | my $seconds = 30;
24 | my @benches = ();
25 | foreach (0..$#{$cluster->{nodes}})
26 | {
27 | push @benches, $cluster->pgbench_async($_, # readers record observed sums into reader_log
28 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/reader.pgb'));
29 | push @benches, $cluster->pgbench_async($_, # writers are throttled to 10 tps via -R
30 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/writer.pgb', -R => 10));
31 | }
32 |
33 | $cluster->pgbench_await($_) foreach @benches;
34 |
35 | my $out;
36 |
37 | $out = $cluster->safe_psql(0, # any non-zero row means a reader saw a non-constant sum
38 | "select count(*) from reader_log where v != 0");
39 | is($out, 0, "there is nothing except zeros in reader_log");
40 |
41 | $out = $cluster->safe_psql(0, # sanity check: readers actually ran and logged something
42 | "select count(*) from reader_log where v = 0");
43 | isnt($out, 0, "reader_log is not empty");
44 |
45 | $cluster->stop;
46 |
--------------------------------------------------------------------------------
/t/006_pgbenchdl.pl:
--------------------------------------------------------------------------------
1 | # Like pgbench.pl, but the probability of deadlocks is much higher; check that
2 | # they get detected.
3 |
4 | use strict;
5 | use warnings;
6 |
7 | use Cluster;
8 | use TestLib;
9 | use Test::More tests => 1;
10 | use Data::Dumper;
11 |
12 | use POSIX ":sys_wait_h";
13 |
14 | my $cluster = new Cluster(3);
15 | $cluster->init();
16 | $cluster->start();
17 | $cluster->create_mm();
18 |
19 | $cluster->safe_psql(0, q{
20 | create table transactions (id SERIAL primary key, dt timestamp default now(), uid int, amount int);
21 | create index on transactions using btree(uid);
22 | create table users (uid int primary key, sum bigint);
23 | });
24 |
25 | my $clients = 10;
26 | my $seconds = 90;
27 | my @benches = ();
28 | foreach (0..$#{$cluster->{nodes}})
29 | {
30 | push @benches, $cluster->pgbench_async($_,
31 | ('-n', -T => $seconds, -c => $clients, -f => 'tests/deadl.pgb'))
32 | }
33 |
34 | sub isalive { # true while at least one async pgbench child is still running
35 | my $benches = $_[0]; # arrayref of handles returned by pgbench_async
36 | my $any_alive = 0;
37 | waitpid(-1, WNOHANG); # reap finished children first so the kill(0) probe is accurate
38 | $any_alive = ($any_alive or (kill 0,$_->{'KIDS'}->[0]->{'PID'})) foreach @{$benches}; # kill 0 only probes existence
39 | return $any_alive;
40 | }
41 |
42 | # ensure num of successful xacts steadily goes up, i.e. deadlocks are detected
43 | # in time.
44 | my $ptrans = 0;
45 | my $dead_count = 0;
46 | while (isalive(\@benches)) {
47 | my $trans = $cluster->safe_psql(0,
48 | "select count(*) from transactions");
49 | if ($ptrans == 0) {
50 | $ptrans = $trans;
51 | } elsif ($ptrans == $trans) {
52 | $dead_count++;
53 | } else {
54 | $dead_count = 0;
55 | $ptrans = $trans;
56 | }
57 | if ($dead_count >=3) { # ~6s (3 polls x 2s sleep) with no progress => give up
58 | last;
59 | }
60 | sleep 2;
61 | }
62 |
63 | ok($dead_count < 3, 'at least one xact was committed during 6 seconds');
64 | $cluster->stop;
65 |
--------------------------------------------------------------------------------
/t/007_add_stop_node.pl:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 |
4 | use Carp;
5 | use PostgresNode;
6 | use Cluster;
7 | use TestLib;
8 | use Test::More tests => 8;
9 |
10 | # Generally add node with concurrent load (and failures) is not supported
11 | # because of at least
12 | # 1) it is not clear why non-donor nodes should properly keep WAL for new node;
13 | # 2) if donor fails, it is not clear whether new node will obtain suitable
14 | # syncpoints to pull from non-donors;
15 | # 3) A problem with slot creation and receiver start deadlocking each other,
16 | # see PGPRO-3618.
17 | #
18 | # drop_node with concurrent load is not safe at least because once it is done we
19 | # can't determine origin node properly, so none of its xacts would be replicated.
20 | #
21 | # An option is left for experiments/future work.
22 | my $concurrent_load = 0;
23 |
24 | my $cluster = new Cluster(3);
25 | $cluster->init();
26 | $cluster->start();
27 |
28 | # XXXX: delete all '-n' ?
29 |
30 | ################################################################################
31 | # manually setup nodes with sparse node_id's
32 | ################################################################################
33 |
34 | foreach (0..$#{$cluster->{nodes}})
35 | {
36 | my $node = $cluster->{nodes}->[$_];
37 | $node->{dbname} = 'postgres';
38 | }
39 |
40 | foreach (0..$#{$cluster->{nodes}})
41 | {
42 | my $node = $cluster->{nodes}->[$_];
43 |
44 | note($cluster->connstr($_));
45 |
46 | $cluster->safe_psql($_, qq{
47 | create extension multimaster;
48 | select mtm.state_create('{2, 4, 5}');
49 | insert into mtm.cluster_nodes values
50 | (2, \$\$@{[ $cluster->connstr(0) ]}\$\$, '@{[ $_ == 0 ? 't' : 'f' ]}'),
51 | (4, \$\$@{[ $cluster->connstr(1) ]}\$\$, '@{[ $_ == 1 ? 't' : 'f' ]}'),
52 | (5, \$\$@{[ $cluster->connstr(2) ]}\$\$, '@{[ $_ == 2 ? 't' : 'f' ]}');
53 | });
54 | }
55 |
56 | $cluster->await_nodes( [0..$#{$cluster->{nodes}}] );
57 |
58 | $cluster->pgbench(0, ('-i', '-n', -s => '10') );
59 | $cluster->pgbench(0, ('-N', '-n', -t => '100') );
60 | $cluster->pgbench(1, ('-N', '-n', -t => '100') ); # XXX: pgbench gets stuck here for quite a long time
61 | $cluster->pgbench(2, ('-N', '-n', -t => '100') );
62 |
63 | ################################################################################
64 | # auto recovery
65 | ################################################################################
66 |
67 | $cluster->{nodes}->[2]->stop('fast');
68 | $cluster->await_nodes_after_stop( [0,1] );
69 | $cluster->pgbench(0, ('-N', '-n', -T => '1') );
70 | $cluster->{nodes}->[2]->start;
71 |
72 | $cluster->await_nodes( [2,0,1] );
73 | is($cluster->is_data_identic( (0,1,2) ), 1, "check auto recovery");
74 |
75 | ################################################################################
76 | # add basebackuped node
77 | ################################################################################
78 |
79 | # add table with sequence to check sequences after n_nodes change
80 | $cluster->safe_psql(0, "create table test_seq(id serial primary key)");
81 | $cluster->safe_psql(0, "insert into test_seq values(DEFAULT)");
82 | $cluster->safe_psql(1, "insert into test_seq values(DEFAULT)");
83 | $cluster->safe_psql(2, "insert into test_seq values(DEFAULT)");
84 |
85 | my $pgb1;
86 | my $pgb2;
87 | if ($concurrent_load)
88 | {
89 | $pgb1= $cluster->pgbench_async(0, ('-N', '-n', -T => '3600', -c => '2') );
90 | $pgb2= $cluster->pgbench_async(1, ('-N', '-n', -T => '3600', -c => '2') );
91 | }
92 |
93 | my $new_node_off = $cluster->add_node();
94 | $cluster->{nodes}->[$new_node_off]->{dbname} = 'postgres';
95 | my $connstr = $cluster->connstr($new_node_off);
96 | my $new_node_id = $cluster->safe_psql(0, "SELECT mtm.add_node(\$\$$connstr\$\$)");
97 |
98 | is($new_node_id, 1, "sparse id assignment"); # id 1 is free since initial ids are {2,4,5}
99 | is($new_node_off, 3, "sparse id assignment");
100 | if ($concurrent_load)
101 | {
102 | $cluster->pgbench(0, ('-N', '-n', -t => '100') );
103 | }
104 | # Ensure monitor creates slot for new node on donor. We don't use it for
105 | # basebackup anymore, but this is still a good idea (it would be even better to
106 | # wait for logical slot creation too).
107 | $cluster->poll_query_until(0, "select exists(select * from pg_replication_slots where slot_name = 'mtm_filter_slot_${new_node_id}');")
108 | or croak "timed out waiting for slot creation";
109 | my $end_lsn = $cluster->backup_and_init(0, $new_node_off, $new_node_id);
110 |
111 | # Prevent recovery of new node further than the end point returned by
112 | # basebackup as streaming will be requested since it, so not doing this might
113 | # result in attempting to receive already existing data. This realistically
114 | # happens with syncpoint rows, leading to insertion conflict.
115 | #
116 | # It would be much nicer to learn the correct (end of recovery) LSN at the new
117 | # node itself and not burden user with carrying it around, but there seems no
118 | # easy way to do that without core changes.
119 | $cluster->{nodes}->[$new_node_off]->append_conf(
120 | "postgresql.conf", qq(
121 | restore_command = 'false'
122 | recovery_target = 'immediate'
123 | recovery_target_action = 'promote'
124 | ));
125 | # create recovery.signal
126 | $cluster->{nodes}->[$new_node_off]->set_recovery_mode();
127 | $cluster->{nodes}->[$new_node_off]->start;
128 | $cluster->await_nodes([3,0,1,2], 0);
129 | $cluster->safe_psql(0, "SELECT mtm.join_node('$new_node_id', '$end_lsn')");
130 | note("join_node done");
131 |
132 | if ($concurrent_load)
133 | {
134 | sleep(5);
135 | IPC::Run::kill_kill($pgb1);
136 | IPC::Run::kill_kill($pgb2);
137 | }
138 |
139 | $cluster->await_nodes( [3,0,1,2] );
140 | $cluster->pgbench(0, ('-N', '-n', -t => '100') );
141 | $cluster->pgbench(3, ('-N', '-n', -t => '100') );
142 |
143 | is($cluster->is_data_identic( (0,1,2,3) ), 1, "add basebackuped node");
144 |
145 | my $bb_keycount = $cluster->safe_psql(3, q{
146 | select count(*) from mtm.config where key='basebackup'
147 | });
148 |
149 | is($bb_keycount, 0, "basebackup key was deleted");
150 |
151 | # check that sequences in proper state
152 | $cluster->safe_psql(0, "insert into test_seq values(DEFAULT)");
153 | $cluster->safe_psql(1, "insert into test_seq values(DEFAULT)");
154 | $cluster->safe_psql(2, "insert into test_seq values(DEFAULT)");
155 | $cluster->safe_psql(3, "insert into test_seq values(DEFAULT)");
156 |
157 | ################################################################################
158 | # basic check of recovery after add node succeeded
159 | ################################################################################
160 |
161 | $cluster->{nodes}->[0]->stop('fast');
162 | $cluster->await_nodes_after_stop( [1,2,3] );
163 | $cluster->pgbench(3, ('-N', '-n', -T => '1') );
164 | $cluster->{nodes}->[0]->start;
165 |
166 | $cluster->await_nodes( [2,0,1] ); # NOTE(review): node 3 is not awaited here — confirm this is intentional
167 | is($cluster->is_data_identic((0,1,2,3)), 1, "check recovery after add_node");
168 |
169 | ################################################################################
170 | # drop one of the initial nodes
171 | ################################################################################
172 |
173 | $cluster->{nodes}->[0]->stop('fast');
174 | $cluster->await_nodes_after_stop( [1,2,3] );
175 | $cluster->safe_psql(1, "select mtm.drop_node(2)");
176 |
177 | # check basic recovery after drop_node
178 | $cluster->{nodes}->[1]->stop('fast');
179 | $cluster->await_nodes_after_stop( [2,3] );
180 | $cluster->pgbench(3, ('-N', '-n', -T => '1') );
181 | $cluster->pgbench(2, ('-N', '-n', -T => '1') );
182 | $cluster->{nodes}->[1]->start;
183 | $cluster->await_nodes( [3,2,1] );
184 | is($cluster->is_data_identic((1,2,3)), 1, "check recovery after drop_node");
185 |
186 |
187 | # TODO: check that WALs are not kept for dropped node anymore
188 |
189 | ################################################################################
190 | # XXX: check remove/add of same node
191 | ################################################################################
192 |
193 | ################################################################################
194 | # XXX: check self remove
195 | ################################################################################
196 |
--------------------------------------------------------------------------------
/t/008_bugfixes.pl:
--------------------------------------------------------------------------------
1 | use Carp;
2 | use POSIX;
3 | use strict;
4 | use Test::More; # re-imported below with a test plan
5 | use TestLib;
6 | use Time::HiRes qw(usleep);
7 | use warnings;
8 |
9 | use PostgresNode;
10 | use Cluster;
11 |
12 | use Test::More tests => Cluster::is_ee() ? 6 : 5;
13 |
14 | my $cluster = new Cluster(3);
15 | $cluster->init();
16 | $cluster->start();
17 | $cluster->create_mm();
18 |
19 | my $hash0; my $hash1; my $hash2; my $hash_query;
20 |
21 | # run pathman test only on ee
22 | if (Cluster::is_ee())
23 | {
24 | $cluster->safe_psql(0, q{
25 | CREATE EXTENSION pg_pathman;
26 | CREATE SCHEMA test_update_node;
27 | SET pg_pathman.enable_partitionrouter = ON;
28 |
29 | CREATE TABLE test_update_node.test_range(val NUMERIC NOT NULL, comment TEXT);
30 | CREATE INDEX val_idx ON test_update_node.test_range (val);
31 | INSERT INTO test_update_node.test_range SELECT i, i FROM generate_series(1, 100) i;
32 | SELECT create_range_partitions('test_update_node.test_range', 'val', 1, 10);
33 |
34 | ALTER TABLE test_update_node.test_range DROP COLUMN comment CASCADE;
35 |
36 | UPDATE test_update_node.test_range SET val = 115 WHERE val = 55;
37 | });
38 |
39 | $hash_query = q{
40 | select
41 | md5('(' || string_agg(val::text, '),(') || ')')
42 | from
43 | (select * from test_update_node.test_range order by val) t;
44 | };
45 | $hash0 = $cluster->safe_psql(0, $hash_query);
46 | $hash1 = $cluster->safe_psql(1, $hash_query);
47 | $hash2 = $cluster->safe_psql(2, $hash_query);
48 | note("$hash0, $hash1, $hash2");
49 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1,
50 | "Check that hash is the same after query");
51 | }
52 |
53 | $cluster->safe_psql(0, q{
54 | CREATE TABLE unique_tbl (i int UNIQUE DEFERRABLE, t text);
55 | INSERT INTO unique_tbl VALUES (0, 'one');
56 | INSERT INTO unique_tbl VALUES (1, 'two');
57 | INSERT INTO unique_tbl VALUES (2, 'tree');
58 | INSERT INTO unique_tbl VALUES (3, 'four');
59 | INSERT INTO unique_tbl VALUES (4, 'five');
60 | });
61 | $cluster->{nodes}->[1]->psql($cluster->{nodes}->[1]->{dbname}, q{ # plain psql: this statement is expected to fail
62 | -- default is immediate so this should fail right away
63 | UPDATE unique_tbl SET i = 1 WHERE i = 0;
64 | });
65 | $cluster->safe_psql(0, q{
66 | UPDATE unique_tbl SET i = i+1;
67 | });
68 |
69 | $hash_query = q{
70 | select
71 | md5('(' || string_agg(i::text || ', ' || t::text , '),(') || ')')
72 | from
73 | (select * from unique_tbl order by i) t;
74 | };
75 | $hash0 = $cluster->safe_psql(0, $hash_query);
76 | $hash1 = $cluster->safe_psql(1, $hash_query);
77 | $hash2 = $cluster->safe_psql(2, $hash_query);
78 | note("$hash0, $hash1, $hash2");
79 | is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1,
80 | "Check that hash is the same after query");
81 |
82 | # ##############################################################################
83 | #
84 | # Check the PGPRO-3146 bug. Hard crash of backend causes restart of all postgres
85 | # processes. The multimaster node must survive the crash and be included into
86 | # the multimaster after recovery.
87 | #
88 | # ##############################################################################
89 |
90 | # Set GUC restart_after_crash in 'on' value
91 | $cluster->stop();
92 | foreach (0..$#{$cluster->{nodes}})
93 | {
94 | $cluster->{nodes}->[$_]->append_conf('postgresql.conf', q{restart_after_crash = on});
95 | }
96 | $cluster->start();
97 | $cluster->await_nodes( [0,1,2] );
98 |
99 | # Simulate payload
100 | $cluster->pgbench(0, ('-i', '-n', -s => '1') );
101 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": starting async pgbench" );
102 | my $pgb1 = $cluster->pgbench_async(0, ('-n', -T => '25', -j => '1', -c => '5') );
103 |
104 | my $pid0;
105 | my $attempts = 0;
106 |
107 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": starting polling of backend pid" );
108 | while (1)
109 | {
110 | $pid0 = $cluster->safe_psql(0, "SELECT pid FROM pg_stat_activity
111 | WHERE backend_type LIKE 'client backend'
112 | AND query LIKE 'UPDATE%' LIMIT 1;");
113 |
114 | # bf says we might be really unlucky to find no backend doing update
115 | # It does not make much sense to try longer than pgbench run lasts,
116 | # since we need an active backend to kill. So let it be 25 seconds
117 | # both for pgbench_async() and this pg_stat_activity polling.
118 | if ( ($pid0 ne "") || $attempts >= 25*10 )
119 | {
120 | last;
121 | }
122 |
123 | # Wait 0.1 second before retrying.
124 | usleep(100_000);
125 | $attempts++;
126 | }
127 | note( strftime('%Y-%m-%d %H:%M:%S', localtime) . ": finished polling of backend pid" );
128 | is( ($pid0 ne ""), 1,
129 | "found an active backend doing UPDATE" );
130 |
131 | # Simulate hard crash
132 | note("Simulate hard crash of a backend by SIGKILL to $pid0");
133 | kill -9, $pid0; # NOTE(review): in Perl a negative signal targets the process *group*; kill 'KILL', $pid0 would hit only this backend — confirm intent
134 |
135 | $cluster->pgbench_await($pgb1);
136 | $cluster->await_nodes( [0,1,2] );
137 | is($cluster->is_data_identic( (0,1,2) ), 1, "check consistency after crash");
138 |
139 |
140 | # ##############################################################################
141 | #
142 | # [PGPRO-3047] Test ALTER DOMAIN .. CONSTRAINT .. NOT VALID
143 | #
144 | # ##############################################################################
145 |
146 | $hash0 = $cluster->safe_psql(0, "
147 | CREATE DOMAIN things AS INT;
148 | CREATE TABLE thethings (stuff things);
149 | INSERT INTO thethings (stuff) VALUES (55);
150 | ALTER DOMAIN things ADD CONSTRAINT meow CHECK (VALUE < 11) NOT VALID;
151 | UPDATE thethings SET stuff = 10;
152 | ALTER DOMAIN things VALIDATE CONSTRAINT meow;
153 | ");
154 | my $result0 = $cluster->safe_psql(0, "SELECT * FROM thethings");
155 | my $result1 = $cluster->safe_psql(1, "SELECT * FROM thethings");
156 | my $result2 = $cluster->safe_psql(2, "SELECT * FROM thethings");
157 | note("Value in the stuff column of thethings table is $result0 at the node1 and match to corresponding values from another nodes: 2 - $result1 and 3 - $result2 ");
158 | is( (($result0 eq 10) and ($result0 eq $result1) and ($result1 eq $result2)), 1,
159 | "Check that update not aborted by violation of constraint on old tuple value");
160 |
161 | # ##############################################################################
162 | #
163 | # [PGPRO-3047] Check for problems with different OIDs on multimaster nodes
164 | # during logical replication of tuples contained attribute with domain over
165 | # arrays of composite.
166 | #
167 | # ##############################################################################
168 |
169 | # Check that OIDs are different.
170 | $result0 = $cluster->safe_psql(0,
171 | "select oid from pg_class where relname like 'thethings';");
172 | $result1 = $cluster->safe_psql(1,
173 | "select oid from pg_class where relname like 'thethings';");
174 | $result2 = $cluster->safe_psql(2,
175 | "select oid from pg_class where relname like 'thethings';");
176 | note("OIDS of the thethings relation: node1 - $result0, node2 - $result1, node3 - $result2");
177 | is( ( ($result0 ne $result1) and ($result0 ne $result2) and ($result1 ne $result2) ), 1,
178 | "Check that oid of the thethings relation are different on each node");
179 |
180 | # Do the test. Insertion of array type must be passed successfully.
181 | # Source: regression test domain.sql
182 | $cluster->safe_psql(0, "
183 | CREATE TYPE comptype AS (r float8, i float8);
184 | CREATE domain dcomptypea AS comptype[];
185 | CREATE table dcomptable (d1 dcomptypea UNIQUE);
186 | INSERT INTO dcomptable VALUES (array[row(1,2)]::dcomptypea);
187 | ");
188 |
189 | $cluster->stop();
190 |
191 | done_testing(); # NOTE(review): a fixed plan (tests => ...) was already declared above — confirm Test::More accepts both
192 |
--------------------------------------------------------------------------------
/t/009_identity_func.pl:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | use PostgresNode;
4 | use Cluster;
5 | use TestLib;
6 | use Test::More tests => 29;
7 |
8 | my $cluster = new Cluster(3);
9 | $cluster->init();
10 | $cluster->start();
11 | $cluster->create_mm(undef);
12 |
13 | my $dbname = $cluster->{nodes}->[0]->{dbname};
14 | my $nodes = $cluster->{nodes};
15 | my $output;
16 | my $err_out;
17 |
18 | # ##############################################################################
19 | #
20 | # Incorrect query
21 | #
22 | # ##############################################################################
23 | my $invalid_expr_pattern =
24 | ".*failed to run query on node[0-9]+, snapshot .*: "
25 | . "ERROR: relation \"t1\" does not exist\n";
26 |
27 | # test node 1
28 | $nodes->[0]->psql($dbname,
29 | "SELECT mtm.check_query('SELECT * FROM t1')",
30 | stdout => \$output, stderr => \$err_out);
31 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error");
32 | like($err_out, qr{$invalid_expr_pattern}, "node1: check error output correctness");
33 |
34 | # test node 2
35 | $nodes->[1]->psql($dbname,
36 | "SELECT mtm.check_query('SELECT * FROM t1')",
37 | stdout => \$output, stderr => \$err_out);
38 | is ( (($output eq '') and ($err_out ne '')), 1, "node2: check zero out on error");
39 | like($err_out, qr{$invalid_expr_pattern}, "node2: check error output correctness");
40 |
41 | # test node 3
42 | $nodes->[2]->psql($dbname,
43 | "SELECT mtm.check_query('SELECT * FROM t1')",
44 | stdout => \$output, stderr => \$err_out);
45 | is ( (($output eq '') and ($err_out ne '')), 1, "node3: check zero out on error");
46 | like($err_out, qr{$invalid_expr_pattern}, "node3: check error output correctness");
47 |
48 | # Substep: check no problems without one node
49 | $nodes->[2]->stop();
50 | $cluster->await_nodes_after_stop( [0,1] );
51 | $nodes->[0]->psql($dbname,
52 | "SELECT mtm.check_query('SELECT * FROM t1')",
53 | stdout => \$output, stderr => \$err_out);
54 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error");
55 | like($err_out, qr{$invalid_expr_pattern}, "node1: check error output correctness");
56 |
57 | $nodes->[1]->psql($dbname,
58 | "SELECT mtm.check_query('SELECT * FROM t1')",
59 | stdout => \$output, stderr => \$err_out);
60 | is ( (($output eq '') and ($err_out ne '')), 1, "node2: check zero out on error");
61 | like($err_out, qr{$invalid_expr_pattern}, "node2: check error output correctness");
62 |
63 | # Substep: node1 will be isolated
64 | my $isolation_pattern = ".*node is not online\: current status .*";
65 | $nodes->[1]->stop(); # with nodes 2 and 3 both down, node 1 loses the majority
66 | $nodes->[0]->psql($dbname,
67 | "SELECT mtm.check_query('SELECT * FROM t1')",
68 | stdout => \$output, stderr => \$err_out);
69 | is ( (($output eq '') and ($err_out ne '')), 1, "node1: check zero out on error");
70 | like($err_out, qr{$isolation_pattern}, "Check access to isolated node");
71 |
72 | $nodes->[1]->start();
73 | $nodes->[2]->start();
74 | $cluster->await_nodes( [2,0,1] );
75 |
76 | # ##############################################################################
77 | #
78 | # Interface functions protection.
79 | #
80 | # ##############################################################################
81 | my $protection_pattern = "this function should only be called by mtm.check_query()";
82 | $nodes->[0]->psql($dbname,
83 | "SELECT mtm.hold_backends();",
84 | stdout => \$output, stderr => \$err_out);
85 | is ( (($output eq '') and ($err_out ne '')), 1, "hold_all() protection");
86 | like($err_out, qr{$protection_pattern}, "Check error output");
87 |
88 | $nodes->[0]->psql($dbname,
89 | "SELECT mtm.release_backends();",
90 | stdout => \$output, stderr => \$err_out);
91 | is ( (($output eq '') and ($err_out ne '')), 1, "release_all() protection");
92 | like($err_out, qr{$protection_pattern}, "Check error output");
93 |
94 | $cluster->safe_psql(0, "CREATE TABLE t1 (a int PRIMARY KEY, b text);");
95 | $nodes->[0]->psql($dbname,
96 | "SELECT mtm.check_query('SELECT * FROM t1')",
97 | stdout => \$output);
98 | is( (($output eq 't')) , 1, "Check tables equivalence with no tuples");
99 |
100 | # Check consistency in the case of two nodes
101 | $nodes->[1]->stop();
102 | $cluster->await_nodes_after_stop( [0,2] );
103 | $nodes->[0]->psql($dbname,
104 | "SELECT mtm.check_query('SELECT * FROM t1')",
105 | stdout => \$output);
106 | is( (($output eq 't')) , 1, "Check tables equivalence with one off node");
107 |
108 | $cluster->safe_psql(0, "INSERT INTO t1 (a, b) VALUES (1, NULL);");
109 | $nodes->[0]->psql($dbname,
110 | "SELECT mtm.check_query('SELECT * FROM t1')",
111 | stdout => \$output);
112 |
113 | is( (($output eq 't')) , 1, "Check primitive table");
114 | $nodes->[1]->start();
115 | $cluster->await_nodes( [2,0,1] );
116 |
117 | $cluster->safe_psql(0,
118 | "INSERT INTO t1 (a,b) (SELECT *, 'test' FROM generate_series(2,100) AS x1);
119 | ");
120 | $nodes->[0]->psql($dbname,
121 | "SELECT mtm.check_query('SELECT * FROM t1 ORDER BY a')",
122 | stdout => \$output);
123 | is( (($output eq 't')) , 1, "Check big table");
124 | $nodes->[0]->psql($dbname,
125 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,''''))
126 | FROM (SELECT * FROM t1 ORDER BY a) AS x1');",
127 | stdout => \$output);
128 | is( (($output eq 't')) , 1, "Another approach to check big table");
129 |
130 | $nodes->[0]->psql($dbname,
131 | "SELECT mtm.check_query('SELECT mtm.status();');",
132 | stdout => \$output);
133 | note("Check result: $output");
134 | is( (($output eq 'f')) , 1, "Unsuccessful check");
135 |
136 | $nodes->[2]->stop();
137 | $cluster->await_nodes_after_stop( [0,1] );
138 | $nodes->[0]->psql($dbname,
139 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,''''))
140 | FROM (SELECT * FROM t1 ORDER BY a) AS x1');",
141 | stdout => \$output);
142 | is( (($output eq 't')) , 1, "Check tables identity after one node was down");
143 |
144 | $nodes->[2]->start();
145 | $cluster->await_nodes( [2,0,1] );
146 | $nodes->[0]->psql($dbname,
147 | "SELECT mtm.check_query('SELECT my_node_id FROM mtm.status();');",
148 | stdout => \$output);
149 | is( (($output eq 'f')) , 1, "Check warning message on mismatch");
150 |
151 | $nodes->[2]->psql($dbname,
152 | "SELECT mtm.check_query('SELECT a,b FROM t1, mtm.status() AS ms WHERE a > ms.my_node_id');",
153 | stdout => \$output, stderr => \$err_out);
154 | note("Check result: $output");
155 | is( (($output eq 'f')) , 1, "Check warning message on difference in rows number");
156 | like($err_out,
157 | qr{.*query results mismatch\: 99 rows and 2 columns on node1\, 98 rows and 2 columns on node2},
158 | "Check format of the error message");
159 |
160 | $nodes->[2]->psql($dbname,
161 | "SELECT mtm.check_query('SELECT b FROM t1 WHERE a = 1');",
162 | stdout => \$output);
163 | note("Check result: $output");
164 | is( (($output eq 't')) , 1, "Check equivalence of nulls");
165 |
166 | $nodes->[0]->psql($dbname,
167 | "SELECT mtm.check_query('SELECT b FROM t1, mtm.status() AS ms WHERE a = ms.my_node_id');",
168 | stdout => \$output, stderr => \$err_out);
169 | note("Check result: $output");
170 | is( (($output eq 'f')) , 1, "Check warning message on difference in null and not null values");
171 | like($err_out,
172 | qr{.*mismatch in column \'b\' of row 0\: null on node1\, test on node2},
173 | "Check format of the error message");
174 |
175 | exit(0); # everything below is unreachable: the full pgbench test is disabled by this early exit
176 |
177 | # Full pgbench test
178 | $cluster->pgbench(0, ('-i', -s => '10') );
179 | my $pgb0 = $cluster->pgbench_async(0, ('-N', -T => '30', -c => '5') );
180 | my $pgb1 = $cluster->pgbench_async(1, ('-N', -T => '30', -c => '5') );
181 | my $pgb2 = $cluster->pgbench_async(2, ('-N', -T => '30', -c => '5') );
182 |
183 | $output='t';
184 | for (my $i = 0; ($i < 3) and ($output eq 't'); $i++)
185 | {
186 | $nodes->[0]->psql($dbname,
187 | "SELECT mtm.check_query('SELECT md5(string_agg(x1::text,''''))
188 | FROM (SELECT * FROM pgbench_accounts ORDER BY aid) AS x1');",
189 | stdout => \$output);
190 | note("check iteration $i, result: $output");
191 | is( (($output eq 't')) , 1, "Data on nodes are identic");
192 | sleep(6);
193 | }
194 |
195 | $cluster->pgbench_await($pgb0);
196 | $cluster->pgbench_await($pgb1);
197 | $cluster->pgbench_await($pgb2);
198 |
199 | $cluster->stop();
200 |
--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | .blockade
2 | .vagrant
3 | *.swp
4 | *.pyc
5 |
--------------------------------------------------------------------------------
/tests/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | aiopg = "==1.0.0"
10 | aioprocessing = "==1.0.1"
11 | docker-compose = "==1.26.2"
12 | docker = "*"
13 |
14 | [requires]
15 | python_version = "3.7"
16 |
--------------------------------------------------------------------------------
/tests/deadl.pgb:
--------------------------------------------------------------------------------
1 | \set fromuser random(1,64)
2 | \set touser random(1,64)
3 | \set amount random(1,10000)
4 | BEGIN;
5 | INSERT INTO transactions (uid,amount) VALUES (:fromuser, -:amount);
6 | INSERT INTO transactions (uid,amount) VALUES (:touser, :amount);
7 | INSERT INTO users (uid,sum) VALUES (:fromuser, -:amount) ON CONFLICT(uid) DO UPDATE SET sum=users.sum-:amount WHERE users.uid=:fromuser;
8 | INSERT INTO users (uid,sum) VALUES (:touser, :amount) ON CONFLICT(uid) DO UPDATE SET sum=users.sum+:amount WHERE users.uid=:touser;
9 | END;
10 |
11 |
--------------------------------------------------------------------------------
/tests/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.2'
2 |
3 | services:
4 |
5 | node1:
6 | container_name: node1
7 | build: ..
8 | shm_size: '1024mb'
9 | privileged: true
10 | ulimits:
11 | core: 14294967296
12 | environment:
13 | POSTGRES_USER: 'pg'
14 | POSTGRES_DB: 'regression'
15 | NODE_ID: 1
16 | CONNSTRS: >-
17 | dbname=regression user=pg host=node1,
18 | dbname=regression user=pg host=node2,
19 | dbname=regression user=pg host=node3
20 | ports:
21 | - "15432:5432"
22 | networks:
23 | mtm_bridge:
24 | ipv4_address: 192.168.253.1
25 |
26 | node2:
27 | container_name: node2
28 | build: ..
29 | shm_size: '1024mb'
30 | privileged: true
31 | ulimits:
32 | core: 14294967296
33 | environment:
34 | POSTGRES_USER: 'pg'
35 | POSTGRES_DB: 'regression'
36 | NODE_ID: 2
37 | CONNSTRS: >-
38 | dbname=regression user=pg host=node1,
39 | dbname=regression user=pg host=node2,
40 | dbname=regression user=pg host=node3
41 | ports:
42 | - "15433:5432"
43 | networks:
44 | mtm_bridge:
45 | ipv4_address: 192.168.253.2
46 |
47 | node3:
48 | container_name: node3
49 | build: ..
50 | shm_size: '1024mb'
51 | privileged: true
52 | ulimits:
53 | core: 14294967296
54 | environment:
55 | POSTGRES_USER: 'pg'
56 | POSTGRES_DB: 'regression'
57 | NODE_ID: 3
58 | CONNSTRS: >-
59 | dbname=regression user=pg host=node1,
60 | dbname=regression user=pg host=node2,
61 | dbname=regression user=pg host=node3
62 | ports:
63 | - "15434:5432"
64 | networks:
65 | mtm_bridge:
66 | ipv4_address: 192.168.253.3
67 |
68 | # toxi:
69 | # image: kelvich/toxiproxy
70 | # ports:
71 | # - "8474:8474"
72 |
73 | # toxi_seed:
74 | # image: kelvich/toxiproxy
75 | # depends_on:
76 | # - toxi
77 | # entrypoint: |
78 | # curl
79 | # -X POST 'http://toxi:8474/populate'
80 | # -H 'Content-Type: application/json; charset=utf-8'
81 | # -d
82 | # '[
83 | # {"name": "rep12", "listen": "0.0.0.0:12000", "upstream": "node2:5432"},
84 | # {"name": "arb12", "listen": "0.0.0.0:12001", "upstream": "node2:5433"},
85 | # {"name": "rep13", "listen": "0.0.0.0:13000", "upstream": "node3:5432"},
86 | # {"name": "arb13", "listen": "0.0.0.0:13001", "upstream": "node3:5433"},
87 |
88 | # {"name": "rep21", "listen": "0.0.0.0:21000", "upstream": "node1:5432"},
89 | # {"name": "arb21", "listen": "0.0.0.0:21001", "upstream": "node1:5433"},
90 | # {"name": "rep23", "listen": "0.0.0.0:23000", "upstream": "node3:5432"},
91 | # {"name": "arb23", "listen": "0.0.0.0:23001", "upstream": "node3:5433"},
92 |
93 | # {"name": "rep31", "listen": "0.0.0.0:31000", "upstream": "node1:5432"},
94 | # {"name": "arb31", "listen": "0.0.0.0:31001", "upstream": "node1:5433"},
95 | # {"name": "rep32", "listen": "0.0.0.0:32000", "upstream": "node2:5432"},
96 | # {"name": "arb32", "listen": "0.0.0.0:32001", "upstream": "node2:5433"}
97 | # ]'
98 |
99 |
100 | networks:
101 | mtm_bridge:
102 | driver: bridge
103 | ipam:
104 | config:
105 | - subnet: 192.168.253.0/24
106 | gateway: 192.168.253.254
107 |
--------------------------------------------------------------------------------
/tests/docker-entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# Container entrypoint for the multimaster test images: when invoked with
# "postgres" and $PGDATA is still uninitialized, initdb the cluster, open
# pg_hba for the isolated test network, append multimaster-related GUCs,
# create the requested role/database, stop the temporary server again and
# finally exec the original command.

if [ "$1" = 'postgres' ]; then
	mkdir -p "$PGDATA"
	mkdir -p /pg/archive/
	mkdir -p /pg/src/src/test/regress/testtablespace

	# look specifically for PG_VERSION, as it is expected in the DB dir
	if [ ! -s "$PGDATA/PG_VERSION" ]; then
		initdb --nosync

		# trust everyone: containers live on a private bridge network
		{ echo; echo "host all all 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf"
		{ echo; echo "host replication all 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf"

		cat <<-EOF >> $PGDATA/postgresql.conf
		listen_addresses='*'
		log_line_prefix = '%m [%p] [[%a]]: '
		archive_mode = on
		archive_command = 'cp %p /pg/archive/%f'

		fsync = on

		max_prepared_transactions = 100
		wal_level = logical
		max_worker_processes = 100
		max_replication_slots = 10
		max_wal_senders = 10
		log_statement = all
		log_connections = true
		log_lock_waits = true

		shared_preload_libraries = 'multimaster'
		multimaster.volkswagen_mode = off
		multimaster.max_workers = 30

		multimaster.connect_timeout = 10
		# Be careful; tests expect commits on live
		# nodes during others failures, and failure time is ~10s;
		# if we simulate network loss, failure won't be
		# detected until this timeout passes.
		# OTOH, setting it too low might lead to node
		# exclusions on weak machines during normal work.
		# It was also noticed that if extensive logging is enabled
		# (older, at least pre #6392) journald might not be able
		# to swallow logs in time which also provoked exclusions
		# with 2s timeout
		multimaster.heartbeat_recv_timeout = 2000
		multimaster.heartbeat_send_timeout = 200
		# Heavily loaded receiver won't send progress until
		# walsender requires it which happens at
		# wal_sender_timeout / 2, so keep it relatively low
		# for syncpoint test.
		wal_sender_timeout = 60s
		wal_receiver_status_interval = 10s

		# extensive logging for tests
		multimaster.TxTrace_log_level = LOG
		multimaster.TxFinish_log_level = LOG

		multimaster.CoordinatorTrace_log_level = LOG

		multimaster.BgwPoolEventDebug_log_level = LOG

		multimaster.ReceiverStateDebug_log_level = LOG
		multimaster.ApplyMessage_log_level = LOG
		multimaster.ApplyTrace_log_level = LOG
		multimaster.ReceiverFeedback_log_level = LOG

		multimaster.StateDebug_log_level = LOG

		EOF

		# referee configuration is only present in the two-node setup
		if [ -n "$REFEREE_CONNSTR" ]; then
			echo "multimaster.referee_connstring = '$REFEREE_CONNSTR'" >> $PGDATA/postgresql.conf
		fi

		# internal start of server in order to allow set-up using psql-client
		# does not listen on TCP/IP and waits until start finishes
		pg_ctl -D "$PGDATA" \
			-o "-c listen_addresses=''" \
			-w start

		: ${POSTGRES_USER:=postgres}
		: ${POSTGRES_DB:=$POSTGRES_USER}
		export POSTGRES_USER POSTGRES_DB

		if [ "$POSTGRES_DB" != 'postgres' ]; then
			psql -U `whoami` postgres <<-EOSQL
				CREATE DATABASE "$POSTGRES_DB" ;
			EOSQL
			echo
		fi

		# initdb already created a role named after the OS user; alter it
		# in that case, otherwise create the requested one
		if [ "$POSTGRES_USER" = `whoami` ]; then
			op='ALTER'
		else
			op='CREATE'
		fi

		psql -U `whoami` postgres <<-EOSQL
			$op USER "$POSTGRES_USER" WITH SUPERUSER PASSWORD '';
		EOSQL
		echo

		# psql -U `whoami` $POSTGRES_DB -c 'CREATE EXTENSION multimaster;';
		# psql -U `whoami` $POSTGRES_DB -c "select mtm.init_node($NODE_ID, '{$CONNSTRS}');"

		pg_ctl -D "$PGDATA" -m fast -w stop
	fi
fi

# hand control over to the container command (normally the postgres server)
"$@"
113 |
--------------------------------------------------------------------------------
/tests/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/postgrespro/mmts/baab9238f784d428481ecfa1294e3f9a3910b2d2/tests/lib/__init__.py
--------------------------------------------------------------------------------
/tests/lib/failure_injector.py:
--------------------------------------------------------------------------------
1 | import docker
2 | import os
3 |
class FailureInjector(object):
    """Base class for fault injectors.

    Owns a docker client; subclasses implement start() (inject the fault)
    and stop() (lift it).
    """

    def __init__(self, node=None):
        # DOCKER_CLIENT_TIMEOUT (seconds) overrides the client default.
        raw_timeout = os.environ.get('DOCKER_CLIENT_TIMEOUT')
        self.docker_api = docker.from_env(
            timeout=None if raw_timeout is None else int(raw_timeout))

    def container_exec(self, node, command):
        """Run *command* as root inside the container named *node*."""
        self.docker_api.containers.get(node).exec_run(command, user='root')
15 |
16 |
class NoFailure(FailureInjector):
    """Control injector: both phases are deliberate no-ops."""

    def start(self):
        return None

    def stop(self):
        return None
24 |
25 |
class SingleNodePartition(FailureInjector):
    """Isolate one node by silently dropping all of its traffic."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    def _toggle(self, action):
        # action is '-A' to install the rules, '-D' to remove them
        self.container_exec(self.node, "iptables {} INPUT -j DROP".format(action))
        self.container_exec(self.node, "iptables {} OUTPUT -j DROP".format(action))

    def start(self):
        self._toggle('-A')

    def stop(self):
        self._toggle('-D')
39 |
class SingleNodePartitionReject(FailureInjector):
    """Isolate one node, actively rejecting (not dropping) its traffic."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    def _toggle(self, action):
        # action is '-A' to install the rules, '-D' to remove them
        self.container_exec(self.node, "iptables {} INPUT -j REJECT".format(action))
        self.container_exec(self.node, "iptables {} OUTPUT -j REJECT".format(action))

    def start(self):
        self._toggle('-A')

    def stop(self):
        self._toggle('-D')
53 |
54 |
class EdgePartition(FailureInjector):
    """Cut a single link: nodeA stops exchanging packets with nodeB.

    Rules are installed on nodeA only, covering both directions.
    """

    def __init__(self, nodeA, nodeB):
        super().__init__()
        self.nodeA = nodeA
        self.nodeB = nodeB

    def _apply(self, action):
        inbound = "iptables {} INPUT -s {} -j DROP".format(action, self.nodeB)
        outbound = "iptables {} OUTPUT -d {} -j DROP".format(action, self.nodeB)
        self.container_exec(self.nodeA, inbound)
        self.container_exec(self.nodeA, outbound)

    def start(self):
        self._apply('-A')

    def stop(self):
        self._apply('-D')
75 |
76 |
class RestartNode(FailureInjector):
    """Gracefully stop a container; recovery boots it back up."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    def _container(self):
        return self.docker_api.containers.get(self.node)

    # Naming is inverted on purpose: injecting the failure (start) stops
    # the container, lifting it (stop) starts the container again.
    def start(self):
        self._container().stop()

    def stop(self):
        self._container().start()
89 |
90 |
class FreezeNode(FailureInjector):
    """SIGSTOP-style failure: pause every process in the container."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    def _container(self):
        return self.docker_api.containers.get(self.node)

    def start(self):
        self._container().pause()

    def stop(self):
        self._container().unpause()
102 |
103 |
class CrashRecoverNode(FailureInjector):
    """Hard-kill a container; recovery starts it again from scratch."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    def _container(self):
        return self.docker_api.containers.get(self.node)

    def start(self):
        self._container().kill()

    def stop(self):
        self._container().start()
115 |
116 |
class SkewTime(FailureInjector):
    # NOTE(review): no start()/stop() are defined here, so this injector
    # cannot be toggled by the failure-driving helpers like the others;
    # presumably the clock skew itself is applied externally (see
    # support/bumptime.c) — confirm before using.

    def __init__(self, node):
        self.node = node
        super().__init__()
122 |
class StopNode(FailureInjector):
    """One-way failure: stop the container and leave it down."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    # Inverted naming, as elsewhere: injecting the failure stops the node.
    def start(self):
        self.docker_api.containers.get(self.node).stop()

    def stop(self):
        # Intentionally a no-op: the node stays stopped.
        return
135 |
136 |
class StartNode(FailureInjector):
    """Counterpart of StopNode: the 'failure' phase does nothing and the
    recovery phase starts the container."""

    def __init__(self, node):
        super().__init__()
        self.node = node

    def start(self):
        # Intentionally a no-op: nothing to inject.
        return

    def stop(self):
        self.docker_api.containers.get(self.node).start()
150 |
# Catalogues used by the randomized tests to draw a failure at random:
# injectors constructed from a single node name vs. a pair of nodes.
ONE_NODE_FAILURES = [SingleNodePartition, SingleNodePartitionReject,
                     RestartNode, CrashRecoverNode, FreezeNode]
TWO_NODE_FAILURES = [EdgePartition]
154 |
--------------------------------------------------------------------------------
/tests/lib/log_helper.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.config
3 | import sys
4 | import time
5 |
6 | # FWIW I've attempted to keep the cfg in json/yaml file but sank in 'could not
7 | # resolve UTCFormatter class' issue
8 |
9 | # timestamp in UTC+-00:00 aka GMT
class UTCFormatter(logging.Formatter):
    """Formatter whose %(asctime)s is rendered in UTC (gmtime), not local time."""
    converter = time.gmtime
12 |
# dictConfig schema for the whole test suite: everything goes to stderr
# through a single UTC-timestamped console handler.
LOGGING = {
    "version": 1,
    "formatters": {
        "defaultFormatter": {
            # "()" makes dictConfig instantiate our custom factory class.
            "()": UTCFormatter,
            "format": "%(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S"
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "defaultFormatter",
            "level": "DEBUG",
            "stream": "ext://sys.stderr"
        }
    },
    "loggers": {
        "root": {
            "level": "DEBUG",
            "handlers": ["console"]
        },
        # Chatty helper modules are capped at INFO to keep output readable.
        "root.test_helper": {
            "level": "INFO"
        },
        "root.bank_client": {
            "level": "INFO"
        }
    }
}

# Applied at import time: importing this module is enough to set up logging.
logging.config.dictConfig(LOGGING)
45 |
--------------------------------------------------------------------------------
/tests/reader.pgb:
--------------------------------------------------------------------------------
1 | begin;
2 | insert into reader_log select sum(v) from t;
3 | commit;
--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | aiopg==1.0.0
2 | aioprocessing==1.0.1
3 | attrs==20.1.0
4 | bcrypt==3.2.0
5 | cached-property==1.5.1
6 | certifi==2020.6.20
7 | cffi==1.14.2
8 | chardet==3.0.4
9 | cryptography==3.1
10 | distro==1.5.0
11 | docker==4.3.1
12 | docker-compose==1.26.2
13 | dockerpty==0.4.1
14 | docopt==0.6.2
15 | idna==2.10
16 | importlib-metadata==1.7.0
17 | jsonschema==3.2.0
18 | paramiko==2.7.1
19 | psycopg2-binary==2.8.5
20 | pycparser==2.20
21 | PyNaCl==1.4.0
22 | pyrsistent==0.16.0
23 | python-dotenv==0.14.0
24 | PyYAML==5.3.1
25 | requests==2.24.0
26 | six==1.15.0
27 | texttable==1.6.2
28 | urllib3==1.25.10
29 | websocket-client==0.57.0
30 | zipp==3.1.0
31 |
--------------------------------------------------------------------------------
/tests/support/bumptime.c:
--------------------------------------------------------------------------------
1 | /*
2 | * His (Aphyr) Majesty Script Bumptime.
3 | *
4 | * https://raw.githubusercontent.com/jepsen-io/jepsen/master/cockroachdb/resources/bumptime.c
5 | *
6 | */
7 |
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
12 |
13 | int
14 | main(int argc, char **argv)
15 | {
16 | if (argc < 2)
17 | {
18 | fprintf(stderr, "usage: %s , where delta is in ms\n", argv[0]);
19 | return 1;
20 | }
21 |
22 | /* Compute offset from argument */
23 | int64_t delta = atof(argv[1]) * 1000;
24 | int64_t delta_us = delta % 1000000;
25 | int64_t delta_s = (delta - delta_us) / 1000000;
26 |
27 | /* Get current time */
28 | struct timeval time;
29 | struct timezone tz;
30 |
31 | if (0 != gettimeofday(&time, &tz))
32 | {
33 | perror("gettimeofday");
34 | return 1;
35 | }
36 |
37 | /* Update time */
38 | time.tv_usec += delta_us;
39 | time.tv_sec += delta_s;
40 | /* Overflow */
41 | while (time.tv_usec <= 1000000)
42 | {
43 | time.tv_sec -= 1;
44 | time.tv_usec += 1000000;
45 | }
46 | while (1000000 <= time.tv_usec)
47 | {
48 | time.tv_sec += 1;
49 | time.tv_usec -= 1000000;
50 | }
51 |
52 | /* Set time */
53 | if (0 != settimeofday(&time, &tz))
54 | {
55 | perror("settimeofday");
56 | return 2;
57 | }
58 |
59 | return 0;
60 | }
61 |
--------------------------------------------------------------------------------
/tests/support/docker-regress.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | cd /pg/src/src/test/regress
4 |
5 | psql -U postgres regression <<-SQL
6 | ALTER DATABASE "postgres" SET lc_messages TO 'C';
7 | ALTER DATABASE "postgres" SET lc_monetary TO 'C';
8 | ALTER DATABASE "postgres" SET lc_numeric TO 'C';
9 | ALTER DATABASE "postgres" SET lc_time TO 'C';
10 | ALTER DATABASE "postgres" SET timezone_abbreviations TO 'Default';
11 | SQL
12 |
13 | ./pg_regress --use-existing \
14 | --schedule=serial_schedule \
15 | --host=node1 \
16 | --user=postgres
17 |
18 | STATUS=$?
19 |
20 | if [ -f "regression.diffs" ]
21 | then
22 | cat regression.diffs
23 | fi
24 |
25 | exit $STATUS
26 |
--------------------------------------------------------------------------------
/tests/support/two_nodes.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 |
5 | node1:
6 | container_name: node1
7 | build: ../..
8 | shm_size: '512mb'
9 | privileged: true
10 | ulimits:
11 | core: 14294967296
12 | environment:
13 | POSTGRES_USER: 'pg'
14 | POSTGRES_DB: 'regression'
15 | NODE_ID: 1
16 | CONNSTRS: >-
17 | dbname=regression user=pg host=node1,
18 | dbname=regression user=pg host=node2
19 | REFEREE_CONNSTR: 'dbname=regression user=pg host=referee'
20 | ports:
21 | - "15432:5432"
22 | networks:
23 | mtm_bridge:
24 | ipv4_address: 192.168.253.1
25 |
26 | node2:
27 | container_name: node2
28 | build: ../..
29 | shm_size: '512mb'
30 | privileged: true
31 | ulimits:
32 | core: 14294967296
33 | environment:
34 | POSTGRES_USER: 'pg'
35 | POSTGRES_DB: 'regression'
36 | NODE_ID: 2
37 | CONNSTRS: >-
38 | dbname=regression user=pg host=node1,
39 | dbname=regression user=pg host=node2
40 | REFEREE_CONNSTR: 'dbname=regression user=pg host=referee'
41 | ports:
42 | - "15433:5432"
43 | networks:
44 | mtm_bridge:
45 | ipv4_address: 192.168.253.2
46 |
47 | referee:
48 | container_name: referee
49 | build: ../..
50 | shm_size: '512mb'
51 | privileged: true
52 | ulimits:
53 | core: 14294967296
54 | environment:
55 | POSTGRES_USER: 'pg'
56 | POSTGRES_DB: 'regression'
57 | NODE_ID: 1
58 | ports:
59 | - "15435:5432"
60 | networks:
61 | mtm_bridge:
62 | ipv4_address: 192.168.253.3
63 |
64 | networks:
65 | mtm_bridge:
66 | driver: bridge
67 | ipam:
68 | config:
69 | - subnet: 192.168.253.0/24
70 | gateway: 192.168.253.254
71 |
--------------------------------------------------------------------------------
/tests/test_bkb.sage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sage
2 | import sys, os
3 |
4 | from sage.all import *
5 | from subprocess import Popen, PIPE, STDOUT
6 | import unittest
7 |
def run_stdin(input):
    """Feed *input* (bytes) to the solver binary ../src/a.out on stdin and
    return its combined stdout+stderr, decoded to str."""
    here = os.path.dirname(os.path.realpath(__file__))
    solver = here + "/../src/a.out"

    proc = Popen(solver, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
    captured = proc.communicate(input=input)[0]
    return captured.decode()
15 |
def run_bkb(g):
    """Run the clique solver on graph *g*.

    Encodes g as one bitmask adjacency row per vertex (each vertex is
    marked adjacent to itself), pipes them to the binary via run_stdin()
    and returns the answer parsed as a list of ints.
    """
    n = len(g)
    rows = [str(n)]
    for i in range(n):
        mask = 1 << i  # every vertex counts as adjacent to itself
        for j in range(n):
            if g.has_edge(i, j):
                mask |= 1 << j
        rows.append(str(mask))
    params = "\n".join(rows) + "\n"

    print(params)
    answer = run_stdin(params).strip()
    return [int(tok) for tok in answer.split(' ')]
31 |
32 |
class TestCliqueBKB(unittest.TestCase):
    """Exercise the solver on random graphs; sage provides the reference
    maximum clique for eyeballing the printed output."""

    def test_random_graphs(self):

        for _ in range(1000):
            # draw until we get a connected random graph
            while True:
                g = graphs.RandomGNM(60,1700)
                if g.is_connected():
                    break

            clique, clique_size = run_bkb(g)

            print(clique, clique_size, len(g.clique_maximum()))
46 |
47 |
48 |
49 | if __name__ == '__main__':
50 | unittest.main()
51 |
--------------------------------------------------------------------------------
/tests/test_recovery_random.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | #
4 | # Based on Aphyr's test for CockroachDB.
5 | #
6 | # Randomized recovery test for multimaster. Currently it picks a random node,
7 | # crash-recovers it or drops/rejects packets to and from it under load and
8 | # checks that things are ok, i.e. the rest two continue working and after
# eliminating the failure the victim successfully recovers, with no hung
# prepares and data being identical everywhere. Lather, rinse, repeat.
11 |
12 | import datetime
13 | import docker
14 | import os
15 | import random
16 | import socket
17 | import subprocess
18 | import time
19 | import unittest
20 | import warnings
21 | import logging
22 |
23 | from lib.bank_client import MtmClient
24 | from lib.failure_injector import *
25 | import lib.log_helper # configures loggers
26 | from lib.test_helper import *
27 |
28 | log = logging.getLogger('root')
29 |
class RecoveryTest(MMTestCase, TestHelper):
    """Randomized crash/partition/recovery scenarios for a 3-node cluster."""

    def test_normal_operations(self):
        log.info('### test_normal_operations ###')

        aggs_failure, aggs = self.performFailure(NoFailure())

        # No fault was injected, so both the "during failure" and the
        # "after failure" phases must commit and stay isolated.
        for phase in (aggs_failure, aggs):
            self.assertCommits(phase)
            self.assertIsolation(phase)

    # main random tests
    def test_random_disasters(self):
        log.info('### test_random_disasters ###')

        for i in range(1, 16):
            log.info(f'running round #{i} of test_random_disasters')
            node_number = random.choice(range(1, 4))
            port = 15431 + node_number
            victim = node_number - 1

            # every node except the victim must keep committing while it is out
            survivors = [n for n in range(3) if n != victim]
            aggs_failure, aggs = self.performRandomFailure(
                f'node{node_number}',
                nodes_wait_for_commit=list(range(3)),
                nodes_wait_for_online=[
                    f"dbname=regression user=postgres host={self.host_ip} port={port}"],
                stop_load=True,
                nodes_assert_commit_during_failure=survivors)

            for n in range(3):
                if n == victim:
                    self.assertNoCommits([aggs_failure[n]])
                else:
                    self.assertCommits([aggs_failure[n]])

            self.assertIsolation(aggs_failure)
            self.assertCommits(aggs)
            self.assertIsolation(aggs)
            self.assertDataSync()

            log.info(f'iteration #{i} is OK')

    # sausage topology test
    def test_edge_partition(self):
        log.info('### test_edge_partition ###')

        # Clique selection picks the minimal mask, so in the 1-2-3 sausage
        # nodes 1+2 should eventually stay live. There is, however, a small
        # chance of node 3 successfully voting for 2+3 before node 1 reacts;
        # node 1 is then put through recovery, which may not finish within
        # the 10s test window while the load keeps running (seen in CI).
        # Hence wait for both 1 and 3 to come online.
        aggs_failure, aggs = self.performFailure(
            EdgePartition('node1', 'node3'),
            nodes_wait_for_online=[
                f"dbname=regression user=postgres host={self.host_ip} port=15434",
                f"dbname=regression user=postgres host={self.host_ip} port=15432"],
            stop_load=True)

        self.assertTrue(('commit' in aggs_failure[0]['transfer']['finish']) or
                        ('commit' in aggs_failure[2]['transfer']['finish']))
        self.assertCommits(aggs_failure[1:2])  # second node
        self.assertIsolation(aggs_failure)

        self.assertCommits(aggs)
        self.assertIsolation(aggs)

    # can be used for manual running of some particular failure
    def _test_single_failure(self):
        log.info('### test_single_failure ###')

        aggs_failure, aggs = self.performFailure(
            CrashRecoverNode('node3'),
            nodes_wait_for_online=[
                "dbname=regression user=postgres host=127.0.0.1 port=15434"],
            stop_load=True)

        self.assertCommits(aggs_failure[:2])
        self.assertNoCommits(aggs_failure[2:])
        self.assertIsolation(aggs_failure)

        self.assertCommits(aggs)
        self.assertIsolation(aggs)
116 |
117 |
118 | # you can run single test with something like
119 | # python -u -m unittest test_recovery.RecoveryTest.test_single_failure
120 | if __name__ == '__main__':
121 | # run all tests
122 | unittest.main()
123 |
--------------------------------------------------------------------------------
/tests/test_regression.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import subprocess
3 | import time
4 |
5 | from lib.bank_client import MtmClient
6 | from lib.test_helper import *
7 |
class RecoveryTest(unittest.TestCase, TestHelper):
    """Bring up the three-node docker cluster and run the PostgreSQL
    regression suite inside node1 (see support/docker-regress.sh)."""

    @classmethod
    def setUpClass(cls):
        """Start the cluster, wait for every node to come online, and
        attach a bank client used by the helper methods."""
        cls.dsns = [
            "dbname=regression user=postgres host=127.0.0.1 port=15432",
            "dbname=regression user=postgres host=127.0.0.1 port=15433",
            "dbname=regression user=postgres host=127.0.0.1 port=15434"
        ]

        print('setUp')
        subprocess.check_call(['docker-compose', 'up',
                               '--force-recreate',
                               '--build',
                               '-d'])

        # Wait for all nodes to become online. Plain loop: the list
        # comprehension previously used here existed only for its side
        # effects and built a throwaway list of Nones.
        for dsn in cls.dsns:
            cls.awaitOnline(dsn)

        cls.client = MtmClient(cls.dsns, n_accounts=1000)

    @classmethod
    def tearDownClass(cls):
        print('tearDown')
        # Teardown is deliberately left commented out so that containers
        # survive for post-mortem inspection.
        # subprocess.check_call(['docker-compose','down'])

    def test_regression(self):
        # XXX: make smth clever here — for now just give the cluster time
        # to settle before launching the regression run inside node1.
        time.sleep(10)
        subprocess.check_call(['docker', 'exec',
                               'node1',
                               '/pg/mmts/tests/support/docker-regress.sh',
                               ])
41 |
42 | if __name__ == '__main__':
43 | unittest.main()
44 |
--------------------------------------------------------------------------------
/tests/test_syncpoint.py:
--------------------------------------------------------------------------------
1 | #
2 | # Basic syncpoint sanity check: ensure in normal mode (all nodes are up and
3 | # running) old wal files are erased once they are not needed anymore.
4 | # On the other hand we must ensure that if a node of the cluster is out of
5 | # order the older wal files needed for the node recovery are NOT erased.
6 | #
7 |
8 | import unittest
9 | import time
10 | import subprocess
11 | import datetime
12 | import docker
13 | import warnings
14 | import pprint
15 | import logging
16 |
17 | import lib.log_helper # configures loggers
18 | from lib.bank_client import MtmClient
19 | from lib.failure_injector import *
20 | from lib.test_helper import *
21 |
22 | log = logging.getLogger('root')
23 |
24 | class SyncpointTest(MMTestCase, TestHelper):
25 | # Returns the newest wal
26 | def _get_last_wal(self, dsn):
27 | return self.nodeSelect(dsn, "SELECT name FROM pg_ls_waldir() WHERE "
28 | "name ~ '^[0-9A-F]+$' ORDER BY "
29 | "name DESC LIMIT 1")[0][0]
30 |
31 | def _get_last_wals(self, dsns):
32 | return [self._get_last_wal(dsn) for dsn in dsns]
33 |
34 | # Returns the oldest existing wal
35 | def _get_first_wal(self, dsn):
36 | # recycle old segments
37 | self.nodeExecute(dsn, ["CHECKPOINT"])
38 | return self.nodeSelect(dsn, "SELECT name FROM pg_ls_waldir() WHERE "
39 | "name ~ '^[0-9A-F]+$' ORDER BY "
40 | "name LIMIT 1")[0][0]
41 |
42 | def _get_first_wals(self, dsns):
43 | return [self._get_first_wal(dsn) for dsn in dsns]
44 |
45 | # get restart_lsn segment of slot to the recipient node id.
46 | def _get_slot_wal(self, dsn, recipient):
47 | return self.nodeSelect(dsn, """
48 | SELECT pg_walfile_name(restart_lsn)
49 | FROM pg_replication_slots WHERE slot_name = 'mtm_slot_{}'
50 | """.format(recipient))[0][0]
51 |
52 | def _get_slot_wals(self, dsns, recipient):
53 | return [self._get_slot_wal(dsn, recipient) for dsn in dsns]
54 |
55 | # Waits (up to iterations * iteration_sleep seconds)
56 | # until at least wals_to_pass segments appear on each node
57 | def _wait_wal(self, dsns, wals_to_pass=5,
58 | iteration_sleep=20,
59 | iterations=1000):
60 | last_wals_initial = self._get_last_wals(dsns)
61 | log.debug("waiting for wal, last_wals_initial={}, first_wals={}"
62 | .format(last_wals_initial, self._get_first_wals(dsns)))
63 | for j in range(iterations):
64 | time.sleep(iteration_sleep)
65 | last_wals = self._get_last_wals(dsns)
66 | log.debug("waiting for wal, last_wals={}, first_wals={}"
67 | .format(last_wals, self._get_first_wals(dsns)))
68 | # xxx: this is only correct for first 4GB of WAL due to the hole in
69 | # WAL file naming
70 | if all(int(lw, 16) - int(lw_i, 16) >= wals_to_pass
71 | for (lw_i, lw) in zip(last_wals_initial, last_wals)):
72 | return
73 |
74 | raise AssertionError('timed out while waiting for wal')
75 |
76 | def _chk_rec_trim(self, dsn, other_dsns, iteration_sleep=2,
77 | iterations=1000):
78 | log.info('checking if wals were trimmed during recovery')
79 | dsns = other_dsns + [dsn]
80 | first_wals_before = self._get_first_wals(dsns)
81 | first_wals = []
82 | wals_trimmed = False
83 | status = ''
84 | for j in range(iterations):
85 | time.sleep(iteration_sleep)
86 | last_wals = self._get_last_wals(dsns)
87 | first_wals = self._get_first_wals(dsns)
88 | status = self.nodeSelect(dsn,
89 | 'SELECT status from mtm.status()')[0][0]
90 | log.debug("status: %s" % status)
91 | log.debug('first wals - %s, ' % first_wals)
92 | log.debug('last wals - %s' % last_wals)
93 | if status == 'online':
94 | break
95 | wals_trimmed = wals_trimmed or all(b= a for (b, a) in zip(slot_wals_before, first_wals_after)):
147 | raise AssertionError('segments on some nodes were trimmed in degraded mode: before={}, after={}'.format(slot_wals_before, first_wals_after))
148 |
149 | # re-run client in weak mode to allow node to recover
150 | # (but don't stop it completely to make test harder)
151 | self.client.stop()
152 | numworkers = {
153 | 'transfer': 1,
154 | 'sumtotal': 1,
155 | 'inserter': 1
156 | }
157 | self.client.bgrun(numworkers=numworkers)
158 | log.info('getting node 3 up')
159 | failure.stop()
160 | # This allows to connect to MM node during recovery
161 | recovery_dsn = self.dsns[2]+' application_name=mtm_admin'
162 | # Wait for node becomes accessible (in recovery mode)
163 | self.awaitOnline(recovery_dsn)
164 | self._chk_rec_trim(recovery_dsn, self.dsns[:2])
165 | self.awaitOnline(self.dsns[2])
166 | # Now stop client
167 | self.client.stop()
168 |
169 |
170 | if __name__ == '__main__':
171 | unittest.main()
172 |
--------------------------------------------------------------------------------
/tests/writer.pgb:
--------------------------------------------------------------------------------
1 | \set src random(0, 999)
2 | \set dst random(0, 999)
3 | \set amount random(1, 10)
4 | begin;
5 | update t set v = v - :amount where k=:src;
6 | update t set v = v + :amount where k=:dst;
7 | commit;
--------------------------------------------------------------------------------
/tests_testgres/.gitignore:
--------------------------------------------------------------------------------
1 | env
2 | venv
3 | __pycache__/
4 | *.pyc
5 |
--------------------------------------------------------------------------------
/tests_testgres/connect.jsh:
--------------------------------------------------------------------------------
1 |
// jshell scratchpad: open a failover JDBC connection listing all three
// local multimaster nodes and run a trivial query; leaves 'con' around
// for interactive use.

/env --class-path /usr/share/java/postgresql-jdbc/postgresql-jdbc4.jar

import java.sql.*;
Class.forName("org.postgresql.Driver");

// ports of the three locally running nodes
int port1 = 12928;
int port2 = 16682;
int port3 = 18521;
String connstring = String.format("jdbc:postgresql://localhost:%d,localhost:%d,localhost:%d/postgres", port1, port2, port3);

/* connect to DB */
Connection con = DriverManager.getConnection(connstring);

/* show help */
System.out.println("Use 'con' object!");

/* execute some commands */
System.out.println("Execute 'SELECT 1'");
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery("select 1");
rs.next();
String s = rs.getString(1);
System.out.println("result = " + s);
25 |
--------------------------------------------------------------------------------
/tests_testgres/ddl.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import subprocess
3 | import time
4 |
5 | from mm_cluster import Cluster
6 |
7 | NUM_NODES = 3
8 |
class TestDDL(unittest.TestCase):
    """Sanity checks that DDL replication survives node outages."""

    @classmethod
    def setUpClass(cls):
        cls.cluster = Cluster(NUM_NODES)
        cls.cluster.print_conninfo()
        cls.cluster.start().install().await_online((0,1,2))

    @classmethod
    def tearDownClass(cls):
        cls.cluster.stop()

    def test_dll_recovery(self):
        """DDL executed while a node is down must reach it after recovery."""
        # create table while the third node is stopped
        self.cluster.nodes[2].stop()
        self.cluster.await_online((0,1))
        self.cluster.nodes[0].safe_psql(query='create table t(id int primary key)')

        # If the second node failed to store the logical message carrying
        # the DDL and the third node recovers from it, the 'create table'
        # would never arrive (PGPRO-1699); the insert below then fails.
        self.cluster.nodes[2].start()
        self.cluster.await_online((0,1,2))
        self.cluster.nodes[2].safe_psql(query='insert into t values(42)')
35 |
36 |
# Allow running this module directly: `python3 ddl.py`.
if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/tests_testgres/run_tests.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Build the server tree plus the mmts extension into tmp_install, then run
# the testgres-based test suite against it.
set -e
ulimit -c unlimited  # keep core dumps for post-mortem debugging

# $(...) instead of legacy backticks; quote paths in case they contain spaces.
CURPATH=$(pwd)
BASEDIR=$CURPATH/../../..
export PATH=$BASEDIR/tmp_install/usr/local/pgsql/bin/:$PATH
export DYLD_LIBRARY_PATH=$BASEDIR/tmp_install/usr/local/pgsql/lib/:$DYLD_LIBRARY_PATH
export DESTDIR=$BASEDIR/tmp_install

make -C "$BASEDIR" install
make -C "$BASEDIR/contrib/mmts" install

# The python dependencies (testgres) are expected to live in a virtualenv;
# warn (on stderr) but carry on if we are not inside one.
if [ -z "$VIRTUAL_ENV" ]; then
    >&2 echo WARNING: not in virtualenv
fi

# python3 -m unittest discover --pattern=*.py
python3 ddl.py
20 |
--------------------------------------------------------------------------------
/tests_testgres/test_failover.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Bring up a 3-node multimaster cluster and emit a jshell connect script.

Prints each node's connection coordinates, writes connect.jsh with the
actual node ports filled in, then keeps the cluster alive until ctrl+C.
"""

# Hoisted to module scope: the original re-ran `import time` on every
# iteration of the keep-alive loop below.
import time

from mm_cluster import Cluster


with Cluster(3).start().install() as cluster:
    print("Cluster is working")

    # Report the connection coordinates of every node (1-based numbering).
    for node_id, node in enumerate(cluster.nodes, start=1):
        print("Node #{}".format(node_id))
        print("\t-> port: {}".format(node.port))
        print("\t-> arbiter port: {}".format(node.mm_port))
        print("\t-> dir: {}".format(node.base_dir))
        print()

    # jshell script template; the three {} placeholders receive node ports.
    # (The Java snippet itself contains no literal braces, so str.format is
    # safe here.)
    jshell = """
/env --class-path /usr/share/java/postgresql-jdbc/postgresql-jdbc4.jar

import java.sql.*;
Class.forName("org.postgresql.Driver");

int port1 = {};
int port2 = {};
int port3 = {};
String connstring = String.format("jdbc:postgresql://localhost:%d,localhost:%d,localhost:%d/postgres", port1, port2, port3);

/* connect to DB */
Connection con = DriverManager.getConnection(connstring);

/* show help */
System.out.println("Use 'con' object!");

/* execute some commands */
System.out.println("Execute 'SELECT 1'");
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery("select 1");
rs.next();
String s = rs.getString(1);
System.out.println("result = " + s);
""".format(cluster.nodes[0].port,
           cluster.nodes[1].port,
           cluster.nodes[2].port)

    with open('connect.jsh', 'w') as f:
        f.write(jshell)
    print("Now run jshell with connect.jsh")
    print()

    print("Press ctrl+C to exit")

    # Keep the cluster alive until the user interrupts us.
    while True:
        time.sleep(1)
--------------------------------------------------------------------------------
/tests_testgres/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/postgrespro/mmts/baab9238f784d428481ecfa1294e3f9a3910b2d2/tests_testgres/tests/__init__.py
--------------------------------------------------------------------------------
/tests_testgres/tests/bootstrap.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from mm_cluster import Cluster
3 |
4 |
class Bootstrap(unittest.TestCase):
    """Smoke test: a freshly installed cluster comes up and answers queries."""

    def test_bootstrap(self):
        # Loop-invariant query, hoisted out of the per-node loop.
        status_query = 'select status from mtm.get_cluster_state()'

        with Cluster(3).start().install() as cluster:
            for node in cluster.nodes:
                # Node process is alive, a trivial query works, and the
                # multimaster status function returns a non-empty result.
                self.assertTrue(node.status())
                self.assertTrue(node.execute('postgres', 'select true'))
                self.assertTrue(node.execute('postgres', status_query))
14 |
--------------------------------------------------------------------------------
/tests_testgres/tests/truncate.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import subprocess
3 | import time
4 |
5 | from mm_cluster import Cluster
6 |
7 |
8 | NUM_NODES = 2
9 | BENCH_SEC = 30
10 |
11 |
class TestTruncate(unittest.TestCase):
    """Run TRUNCATE + VACUUM FULL on every node while pgbench writes."""

    def test_truncate(self):
        with Cluster(NUM_NODES).start().install() as cluster:
            assert NUM_NODES >= 2

            for node in cluster.nodes:
                self.assertTrue(node.status())

            node_1 = cluster.nodes[0]
            node_1.pgbench_init(dbname=cluster.dbname)

            # Background write load on node 1 for BENCH_SEC seconds.
            pgbench = node_1.pgbench(dbname=cluster.dbname,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT,
                                     options=['-T%i' % BENCH_SEC])

            count = 0
            started = time.time()
            while time.time() - started < BENCH_SEC:
                for node in cluster.nodes:
                    node.safe_psql(dbname=cluster.dbname,
                                   username=cluster.username,
                                   query='truncate pgbench_history;')

                    node.safe_psql(dbname=cluster.dbname,
                                   username=cluster.username,
                                   query='vacuum full;')

                    count += 1

                    # check that pgbench has been running for at least 1 loop
                    # BUG FIX: the original wrote `pgbench.poll is not None`,
                    # which tests the bound method object (always true, so
                    # the assert was vacuous). Popen.poll() must be called;
                    # it returns None while the process is still running.
                    assert (count > 0 or pgbench.poll() is not None)

                    time.sleep(0.5)

            assert count > 0
            print("{}: executed truncate {} times"
                  .format(self.test_truncate.__name__, count))

            # Let the load generator finish cleanly.
            pgbench.wait()
52 |
--------------------------------------------------------------------------------