├── pg_rewrite.md ├── .gitignore ├── .dir-locals.el ├── pg_rewrite.control ├── typedefs.list ├── pg_rewrite--1.2--2.0.sql ├── pg_rewrite--1.0.sql ├── pg_rewrite--1.3--2.0.sql ├── Makefile ├── pg_rewrite--1.1--1.2.sql ├── LICENSE ├── .github └── workflows │ └── regression.yml ├── pg_rewrite--1.0--1.1.sql ├── NEWS ├── sql ├── generated.sql └── pg_rewrite.sql ├── expected ├── generated.out ├── generated_1.out ├── pg_rewrite_concurrent_toast.out ├── pg_rewrite_concurrent_partition.out ├── pg_rewrite_concurrent.out ├── pg_rewrite_1.out └── pg_rewrite.out ├── specs ├── pg_rewrite_concurrent_toast.spec ├── pg_rewrite_concurrent.spec └── pg_rewrite_concurrent_partition.spec ├── pg_rewrite.h ├── README.md └── concurrent.c /pg_rewrite.md: -------------------------------------------------------------------------------- 1 | README.md -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.so 4 | results/ 5 | GPATH 6 | GRTAGS 7 | GTAGS 8 | -------------------------------------------------------------------------------- /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ((c-mode . ((c-basic-offset . 4) 2 | (c-file-style . "bsd") 3 | (fill-column . 78) 4 | (indent-tabs-mode . t) 5 | (tab-width . 4)))) 6 | -------------------------------------------------------------------------------- /pg_rewrite.control: -------------------------------------------------------------------------------- 1 | # pg_rewrite extension 2 | comment = 'Tool for maintenance that requires table rewriting.' 
3 | default_version = '2.0' 4 | module_pathname = '$libdir/pg_rewrite' 5 | relocatable = true 6 | -------------------------------------------------------------------------------- /typedefs.list: -------------------------------------------------------------------------------- 1 | CatalogState 2 | ConcurrentChange 3 | ConcurrentChangeKind 4 | ConstraintInfo 5 | DecodingOutputState 6 | IndexCatInfo 7 | PartitionEntry 8 | PgClassCatInfo 9 | TypeCatInfo 10 | partitions_hash 11 | -------------------------------------------------------------------------------- /pg_rewrite--1.2--2.0.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.2--2.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '2.0'" to load this file. \quit 5 | -------------------------------------------------------------------------------- /pg_rewrite--1.0.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 4 | \echo Use "CREATE EXTENSION pg_rewrite" to load this file. \quit 5 | 6 | CREATE FUNCTION partition_table( 7 | src_table text, 8 | dst_table text, 9 | src_table_new text) 10 | RETURNS void 11 | AS 'MODULE_PATHNAME', 'partition_table' 12 | LANGUAGE C; 13 | -------------------------------------------------------------------------------- /pg_rewrite--1.3--2.0.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.3--2.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '2.0'" to load this file. 
\quit 5 | 6 | DROP FUNCTION IF EXISTS rewrite_table_nowait; 7 | CREATE FUNCTION rewrite_table_nowait( 8 | src_table text, 9 | dst_table text, 10 | src_table_new text) 11 | RETURNS void 12 | AS 'MODULE_PATHNAME', 'rewrite_table_nowait' 13 | LANGUAGE C 14 | STRICT; -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PG_CONFIG ?= pg_config 2 | MODULE_big = pg_rewrite 3 | OBJS = pg_rewrite.o concurrent.o $(WIN32RES) 4 | PGFILEDESC = "pg_rewrite - tools for maintenance that requires table rewriting." 5 | 6 | EXTENSION = pg_rewrite 7 | DATA = pg_rewrite--1.0.sql pg_rewrite--1.0--1.1.sql pg_rewrite--1.1--1.2.sql\ 8 | pg_rewrite--1.2--2.0.sql pg_rewrite--1.3--2.0.sql 9 | DOCS = pg_rewrite.md 10 | 11 | REGRESS = pg_rewrite generated 12 | #ISOLATION = pg_rewrite_concurrent pg_rewrite_concurrent_partition \ 13 | pg_rewrite_concurrent_toast 14 | 15 | PGXS := $(shell $(PG_CONFIG) --pgxs) 16 | include $(PGXS) 17 | 18 | -------------------------------------------------------------------------------- /pg_rewrite--1.1--1.2.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.1--1.2.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '1.2'" to load this file. 
\quit 5 | 6 | DROP FUNCTION partition_table(text, text, text); 7 | 8 | CREATE FUNCTION rewrite_table( 9 | src_table text, 10 | dst_table text, 11 | src_table_new text) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME', 'rewrite_table' 14 | LANGUAGE C 15 | STRICT; 16 | 17 | CREATE FUNCTION rewrite_table_nowait( 18 | src_table text, 19 | dst_table text, 20 | src_table_new text) 21 | RETURNS void 22 | AS 'MODULE_PATHNAME', 'rewrite_table_nowait' 23 | LANGUAGE C 24 | STRICT; 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021-2023, Cybertec PostgreSQL International GmbH 2 | 3 | Permission to use, copy, modify, and distribute this software and its 4 | documentation for any purpose, without fee, and without a written agreement is 5 | hereby granted, provided that the above copyright notice and this paragraph 6 | and the following two paragraphs appear in all copies. 7 | 8 | IN NO EVENT SHALL Cybertec PostgreSQL International GmbH BE LIABLE TO ANY 9 | PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, 10 | INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 11 | DOCUMENTATION, EVEN IF Cybertec PostgreSQL International GmbH HAS BEEN ADVISED 12 | OF THE POSSIBILITY OF SUCH DAMAGE. 13 | 14 | Cybertec PostgreSQL International GmbH SPECIFICALLY DISCLAIMS ANY WARRANTIES, 15 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 16 | FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS 17 | IS" BASIS, AND Cybertec PostgreSQL International GmbH HAS NO OBLIGATIONS TO 18 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
19 | -------------------------------------------------------------------------------- /.github/workflows/regression.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | defaults: 10 | run: 11 | shell: sh 12 | 13 | strategy: 14 | matrix: 15 | pgversion: 16 | - 17 17 | 18 | env: 19 | PGVERSION: ${{ matrix.pgversion }} 20 | 21 | steps: 22 | - name: checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: install pg 26 | run: | 27 | sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -v $PGVERSION -p -i 28 | sudo -u postgres createuser -s "$USER" 29 | 30 | - name: build 31 | run: | 32 | make PROFILE="-Werror" 33 | sudo -E make install 34 | 35 | - name: test 36 | run: | 37 | sudo pg_conftool set shared_preload_libraries pg_rewrite 38 | sudo pg_conftool set wal_level logical 39 | sudo pg_conftool set max_replication_slots 1 40 | sudo pg_ctlcluster $PGVERSION main restart 41 | make installcheck 42 | 43 | - name: show regression diffs 44 | if: ${{ failure() }} 45 | run: | 46 | cat /home/runner/work/pg_rewrite/pg_rewrite/output_iso/regression.diffs 47 | -------------------------------------------------------------------------------- /pg_rewrite--1.0--1.1.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.0--1.1.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '1.1'" to load this file. 
\quit 5 | 6 | DROP FUNCTION partition_table(text, text, text); 7 | CREATE FUNCTION partition_table( 8 | src_table text, 9 | dst_table text, 10 | src_table_new text) 11 | RETURNS void 12 | AS 'MODULE_PATHNAME', 'partition_table_new' 13 | LANGUAGE C 14 | STRICT; 15 | 16 | CREATE FUNCTION pg_rewrite_get_task_list() 17 | RETURNS TABLE ( 18 | tabschema_src name, 19 | tabname_src name, 20 | tabschema_dst name, 21 | tabname_dst name, 22 | tabname_src_new name, 23 | ins_initial bigint, 24 | ins bigint, 25 | upd bigint, 26 | del bigint) 27 | AS 'MODULE_PATHNAME', 'pg_rewrite_get_task_list' 28 | LANGUAGE C; 29 | 30 | -- The column names should match the arguments of the partition_table() 31 | -- function. 32 | CREATE VIEW pg_rewrite_progress AS 33 | SELECT COALESCE(tabschema_src || '.', '') || tabname_src AS src_table, 34 | COALESCE(tabschema_dst || '.', '') || tabname_dst AS dst_table, 35 | tabname_src_new AS src_table_new, 36 | ins_initial, ins, upd, del 37 | FROM pg_rewrite_get_task_list(); 38 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Release 2.0 2 | =========== 3 | 4 | 1. This release makes the extension useful in more use cases. 5 | 6 | Besides turning a non-partitioned table into a partitioned one, it can be 7 | used to change 1) data type of column(s), 2) order of columns, 3) 8 | tablespace. 9 | 10 | 2. A single function `rewrite_table()` is used now to handle all the use 11 | cases. 12 | 13 | 3. Constraints are handled in a more convenient way. 14 | 15 | The extension now takes care of creating the constraints on the target 16 | table according to the source table. The user only needs to validate the 17 | constraints after the rewriting has finished. 18 | 19 | Unlike the previous release, the rewritten table can be referenced by 20 | foreign key constraints. 
21 | 22 | Note: The `rewrite.check_constraints` configuration variable was 23 | removed. If there is a risk that other users could run `ALTER TABLE` on 24 | the table during rewriting, please revoke the corresponding privileges 25 | from them temporarily. 26 | 27 | 28 | Release 1.1.1 29 | ============= 30 | 31 | This release only adjusts the code so it is compatible with PostgreSQL server 32 | version 17. 33 | 34 | 35 | Release 1.1.0 36 | ============= 37 | 38 | New Features 39 | ------------ 40 | 41 | 1. Make the code compatible with PostgreSQL server version 16. 42 | 43 | 2. Added progress monitoring. 44 | -------------------------------------------------------------------------------- /sql/generated.sql: -------------------------------------------------------------------------------- 1 | -- Generated columns - some meaningful combinations of source and destination 2 | -- columns. 3 | CREATE TABLE tab7( 4 | i int primary key, 5 | j int, 6 | k int generated always as (i + 1) virtual, 7 | l int generated always AS (i + 1) stored, 8 | m int generated always AS (i + 1) virtual); 9 | CREATE TABLE tab7_new( 10 | i int primary key, 11 | -- Override the value copied from the source table. 12 | j int generated always AS (i - 1) stored, 13 | -- Check that the expression is evaluated correctly on the source 14 | -- table. 15 | k int, 16 | -- The same for stored expression. 17 | l int, 18 | -- Override the value computed on the source table. 19 | m int generated always as (i - 1) virtual); 20 | INSERT INTO tab7(i, j) VALUES (1, 1); 21 | SELECT rewrite_table('tab7', 'tab7_new', 'tab7_orig'); 22 | SELECT * FROM tab7; 23 | 24 | CREATE EXTENSION pageinspect; 25 | -- HEAP_HASNULL indicates that the value of 'm' hasn't been copied from the 26 | -- source table. 27 | SELECT raw_flags 28 | FROM heap_page_items(get_raw_page('tab7', 0)), 29 | LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2); 30 | 31 | -- For PG < 18, test without VIRTUAL columns. 
32 | CREATE TABLE tab8( 33 | i int primary key, 34 | j int, 35 | k int generated always AS (i + 1) stored); 36 | CREATE TABLE tab8_new( 37 | i int primary key, 38 | -- Override the value copied from the source table. 39 | j int generated always AS (i - 1) stored, 40 | -- Check that the expression is evaluated correctly on the source 41 | -- table. 42 | k int); 43 | INSERT INTO tab8(i, j) VALUES (1, 1); 44 | SELECT rewrite_table('tab8', 'tab8_new', 'tab8_orig'); 45 | SELECT * FROM tab8; 46 | -------------------------------------------------------------------------------- /expected/generated.out: -------------------------------------------------------------------------------- 1 | -- Generated columns - some meaningful combinations of source and destination 2 | -- columns. 3 | CREATE TABLE tab7( 4 | i int primary key, 5 | j int, 6 | k int generated always as (i + 1) virtual, 7 | l int generated always AS (i + 1) stored, 8 | m int generated always AS (i + 1) virtual); 9 | CREATE TABLE tab7_new( 10 | i int primary key, 11 | -- Override the value copied from the source table. 12 | j int generated always AS (i - 1) stored, 13 | -- Check that the expression is evaluated correctly on the source 14 | -- table. 15 | k int, 16 | -- The same for stored expression. 17 | l int, 18 | -- Override the value computed on the source table. 19 | m int generated always as (i - 1) virtual); 20 | INSERT INTO tab7(i, j) VALUES (1, 1); 21 | SELECT rewrite_table('tab7', 'tab7_new', 'tab7_orig'); 22 | rewrite_table 23 | --------------- 24 | 25 | (1 row) 26 | 27 | SELECT * FROM tab7; 28 | i | j | k | l | m 29 | ---+---+---+---+--- 30 | 1 | 0 | 2 | 2 | 0 31 | (1 row) 32 | 33 | CREATE EXTENSION pageinspect; 34 | -- HEAP_HASNULL indicates that the value of 'm' hasn't been copied from the 35 | -- source table. 
36 | SELECT raw_flags 37 | FROM heap_page_items(get_raw_page('tab7', 0)), 38 | LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2); 39 | raw_flags 40 | ------------------------------------------------------ 41 | {HEAP_HASNULL,HEAP_XMIN_COMMITTED,HEAP_XMAX_INVALID} 42 | (1 row) 43 | 44 | -- For PG < 18, test without VIRTUAL columns. 45 | CREATE TABLE tab8( 46 | i int primary key, 47 | j int, 48 | k int generated always AS (i + 1) stored); 49 | CREATE TABLE tab8_new( 50 | i int primary key, 51 | -- Override the value copied from the source table. 52 | j int generated always AS (i - 1) stored, 53 | -- Check that the expression is evaluated correctly on the source 54 | -- table. 55 | k int); 56 | INSERT INTO tab8(i, j) VALUES (1, 1); 57 | SELECT rewrite_table('tab8', 'tab8_new', 'tab8_orig'); 58 | rewrite_table 59 | --------------- 60 | 61 | (1 row) 62 | 63 | SELECT * FROM tab8; 64 | i | j | k 65 | ---+---+--- 66 | 1 | 0 | 2 67 | (1 row) 68 | 69 | -------------------------------------------------------------------------------- /expected/generated_1.out: -------------------------------------------------------------------------------- 1 | -- Generated columns - some meaningful combinations of source and destination 2 | -- columns. 3 | CREATE TABLE tab7( 4 | i int primary key, 5 | j int, 6 | k int generated always as (i + 1) virtual, 7 | l int generated always AS (i + 1) stored, 8 | m int generated always AS (i + 1) virtual); 9 | ERROR: syntax error at or near "virtual" 10 | LINE 4: k int generated always as (i + 1) virtual, 11 | ^ 12 | CREATE TABLE tab7_new( 13 | i int primary key, 14 | -- Override the value copied from the source table. 15 | j int generated always AS (i - 1) stored, 16 | -- Check that the expression is evaluated correctly on the source 17 | -- table. 18 | k int, 19 | -- The same for stored expression. 20 | l int, 21 | -- Override the value computed on the source table. 
22 | m int generated always as (i - 1) virtual); 23 | ERROR: syntax error at or near "virtual" 24 | LINE 11: m int generated always as (i - 1) virtual); 25 | ^ 26 | INSERT INTO tab7(i, j) VALUES (1, 1); 27 | ERROR: relation "tab7" does not exist 28 | LINE 1: INSERT INTO tab7(i, j) VALUES (1, 1); 29 | ^ 30 | SELECT rewrite_table('tab7', 'tab7_new', 'tab7_orig'); 31 | ERROR: relation "tab7" does not exist 32 | SELECT * FROM tab7; 33 | ERROR: relation "tab7" does not exist 34 | LINE 1: SELECT * FROM tab7; 35 | ^ 36 | CREATE EXTENSION pageinspect; 37 | -- HEAP_HASNULL indicates that the value of 'm' hasn't been copied from the 38 | -- source table. 39 | SELECT raw_flags 40 | FROM heap_page_items(get_raw_page('tab7', 0)), 41 | LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2); 42 | ERROR: relation "tab7" does not exist 43 | -- For PG < 18, test without VIRTUAL columns. 44 | CREATE TABLE tab8( 45 | i int primary key, 46 | j int, 47 | k int generated always AS (i + 1) stored); 48 | CREATE TABLE tab8_new( 49 | i int primary key, 50 | -- Override the value copied from the source table. 51 | j int generated always AS (i - 1) stored, 52 | -- Check that the expression is evaluated correctly on the source 53 | -- table. 
54 | k int); 55 | INSERT INTO tab8(i, j) VALUES (1, 1); 56 | SELECT rewrite_table('tab8', 'tab8_new', 'tab8_orig'); 57 | rewrite_table 58 | --------------- 59 | 60 | (1 row) 61 | 62 | SELECT * FROM tab8; 63 | i | j | k 64 | ---+---+--- 65 | 1 | 0 | 2 66 | (1 row) 67 | 68 | -------------------------------------------------------------------------------- /expected/pg_rewrite_concurrent_toast.out: -------------------------------------------------------------------------------- 1 | Parsed test spec with 2 sessions 2 | 3 | starting permutation: do_rewrite wait_for_before_lock_ip do_changes wakeup_before_lock_ip wait_for_after_commit_ip do_check wakeup_after_commit_ip 4 | injection_points_attach 5 | ----------------------- 6 | 7 | (1 row) 8 | 9 | step do_rewrite: 10 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 11 | 12 | rewrite_table_nowait 13 | -------------------- 14 | 15 | (1 row) 16 | 17 | step wait_for_before_lock_ip: 18 | DO $$ 19 | BEGIN 20 | LOOP 21 | PERFORM pg_stat_clear_snapshot(); 22 | 23 | PERFORM 24 | FROM pg_stat_activity 25 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 26 | 27 | IF FOUND THEN 28 | EXIT; 29 | END IF; 30 | 31 | PERFORM pg_sleep(.1); 32 | END LOOP; 33 | END; 34 | $$; 35 | 36 | step do_changes: 37 | INSERT INTO tbl_src(i, t) 38 | SELECT 5, string_agg(random()::text, '') 39 | FROM generate_series(1, 200) h(y); 40 | 41 | UPDATE tbl_src SET t = t || 'x' WHERE i = 1; 42 | 43 | step wakeup_before_lock_ip: 44 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 45 | 46 | injection_points_wakeup 47 | ----------------------- 48 | 49 | (1 row) 50 | 51 | step wait_for_after_commit_ip: 52 | DO $$ 53 | BEGIN 54 | LOOP 55 | PERFORM pg_stat_clear_snapshot(); 56 | 57 | PERFORM 58 | FROM pg_stat_activity 59 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 60 | 61 | IF FOUND THEN 62 | EXIT; 63 | END IF; 64 | 65 | PERFORM pg_sleep(.1); 66 | END LOOP; 67 | 
END; 68 | $$; 69 | 70 | step do_check: 71 | TABLE pg_rewrite_progress; 72 | 73 | -- Each row should contain TOASTed value. 74 | SELECT count(*) FROM tbl_src WHERE pg_column_toast_chunk_id(t) ISNULL; 75 | 76 | -- The contents of the new table should be identical to that of the old 77 | -- one. 78 | SELECT count(*) 79 | FROM tbl_src t1 JOIN tbl_src_old t2 ON t1.i = t2.i 80 | WHERE t1.t <> t2.t; 81 | 82 | src_table|dst_table|src_table_new|ins_initial|ins|upd|del 83 | ---------+---------+-------------+-----------+---+---+--- 84 | tbl_src |tbl_dst |tbl_src_old | 2| 1| 1| 0 85 | (1 row) 86 | 87 | count 88 | ----- 89 | 0 90 | (1 row) 91 | 92 | count 93 | ----- 94 | 0 95 | (1 row) 96 | 97 | step wakeup_after_commit_ip: 98 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 99 | 100 | injection_points_wakeup 101 | ----------------------- 102 | 103 | (1 row) 104 | 105 | injection_points_detach 106 | ----------------------- 107 | 108 | (1 row) 109 | 110 | -------------------------------------------------------------------------------- /expected/pg_rewrite_concurrent_partition.out: -------------------------------------------------------------------------------- 1 | Parsed test spec with 2 sessions 2 | 3 | starting permutation: do_rewrite wait_for_before_lock_ip do_changes wakeup_before_lock_ip wait_for_after_commit_ip do_check wakeup_after_commit_ip 4 | injection_points_attach 5 | ----------------------- 6 | 7 | (1 row) 8 | 9 | step do_rewrite: 10 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 11 | 12 | rewrite_table_nowait 13 | -------------------- 14 | 15 | (1 row) 16 | 17 | step wait_for_before_lock_ip: 18 | DO $$ 19 | BEGIN 20 | LOOP 21 | PERFORM pg_stat_clear_snapshot(); 22 | 23 | PERFORM 24 | FROM pg_stat_activity 25 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 26 | 27 | IF FOUND THEN 28 | EXIT; 29 | END IF; 30 | 31 | PERFORM pg_sleep(.1); 32 | END LOOP; 33 | END; 34 | $$; 35 | 36 | step do_changes: 37 
| -- Insert one row into each partition. 38 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 39 | 40 | -- Update with no identity change. 41 | UPDATE tbl_src SET j=0 WHERE i=1; 42 | 43 | -- Update with identity change but within the same partition. 44 | UPDATE tbl_src SET i=6 WHERE i=5; 45 | 46 | -- Cross-partition update. 47 | UPDATE tbl_src SET i=7 WHERE i=3; 48 | 49 | -- Update a row we inserted and updated, to check that it's visible. 50 | UPDATE tbl_src SET j=4 WHERE i=7; 51 | 52 | -- Delete. 53 | DELETE FROM tbl_src WHERE i=4; 54 | 55 | step wakeup_before_lock_ip: 56 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 57 | 58 | injection_points_wakeup 59 | ----------------------- 60 | 61 | (1 row) 62 | 63 | step wait_for_after_commit_ip: 64 | DO $$ 65 | BEGIN 66 | LOOP 67 | PERFORM pg_stat_clear_snapshot(); 68 | 69 | PERFORM 70 | FROM pg_stat_activity 71 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 72 | 73 | IF FOUND THEN 74 | EXIT; 75 | END IF; 76 | 77 | PERFORM pg_sleep(.1); 78 | END LOOP; 79 | END; 80 | $$ 81 | 82 | step do_check: 83 | TABLE pg_rewrite_progress; 84 | 85 | SELECT i, j FROM tbl_src ORDER BY i, j; 86 | 87 | src_table|dst_table|src_table_new|ins_initial|ins|upd|del 88 | ---------+---------+-------------+-----------+---+---+--- 89 | tbl_src |tbl_dst |tbl_src_old | 2| 4| 3| 2 90 | (1 row) 91 | 92 | i| j 93 | -+-- 94 | 1| 0 95 | 2|20 96 | 6|50 97 | 7| 4 98 | (4 rows) 99 | 100 | step wakeup_after_commit_ip: 101 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 102 | 103 | injection_points_wakeup 104 | ----------------------- 105 | 106 | (1 row) 107 | 108 | injection_points_detach 109 | ----------------------- 110 | 111 | (1 row) 112 | 113 | -------------------------------------------------------------------------------- /expected/pg_rewrite_concurrent.out: -------------------------------------------------------------------------------- 1 | Parsed test spec with 2 sessions 2 | 3 | 
starting permutation: do_rewrite wait_for_before_lock_ip do_changes wakeup_before_lock_ip wait_for_after_commit_ip do_check wakeup_after_commit_ip 4 | injection_points_attach 5 | ----------------------- 6 | 7 | (1 row) 8 | 9 | step do_rewrite: 10 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 11 | 12 | rewrite_table_nowait 13 | -------------------- 14 | 15 | (1 row) 16 | 17 | step wait_for_before_lock_ip: 18 | DO $$ 19 | BEGIN 20 | LOOP 21 | PERFORM pg_stat_clear_snapshot(); 22 | 23 | PERFORM 24 | FROM pg_stat_activity 25 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 26 | 27 | IF FOUND THEN 28 | EXIT; 29 | END IF; 30 | 31 | PERFORM pg_sleep(.1); 32 | END LOOP; 33 | END; 34 | $$; 35 | 36 | step do_changes: 37 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 38 | 39 | -- Update with no identity change. 40 | UPDATE tbl_src SET j=0 WHERE i=1; 41 | 42 | -- Update with identity change. 43 | UPDATE tbl_src SET i=6 WHERE i=4; 44 | 45 | -- Update a row we inserted, to check that the insertion is visible. 46 | UPDATE tbl_src SET j=7 WHERE i=2; 47 | -- ... and update it again, to check that the update is visible. 48 | UPDATE tbl_src SET j=8 WHERE j=7; 49 | 50 | -- Delete. 
51 | DELETE FROM tbl_src WHERE i=7; 52 | 53 | step wakeup_before_lock_ip: 54 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 55 | 56 | injection_points_wakeup 57 | ----------------------- 58 | 59 | (1 row) 60 | 61 | step wait_for_after_commit_ip: 62 | DO $$ 63 | BEGIN 64 | LOOP 65 | PERFORM pg_stat_clear_snapshot(); 66 | 67 | PERFORM 68 | FROM pg_stat_activity 69 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 70 | 71 | IF FOUND THEN 72 | EXIT; 73 | END IF; 74 | 75 | PERFORM pg_sleep(.1); 76 | END LOOP; 77 | END; 78 | $$; 79 | 80 | step do_check: 81 | TABLE pg_rewrite_progress; 82 | 83 | SELECT i, j, k, l FROM tbl_src ORDER BY i, j; 84 | 85 | src_table|dst_table|src_table_new|ins_initial|ins|upd|del 86 | ---------+---------+-------------+-----------+---+---+--- 87 | tbl_src |tbl_dst |tbl_src_old | 3| 3| 4| 1 88 | (1 row) 89 | 90 | i| j| k| l 91 | -+--+---+--- 92 | 1| 0| 0| 0 93 | 2| 8| -8| -8 94 | 3|30|-30|-30 95 | 5|50|-50|-50 96 | 6|40|-40|-40 97 | (5 rows) 98 | 99 | step wakeup_after_commit_ip: 100 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 101 | 102 | injection_points_wakeup 103 | ----------------------- 104 | 105 | (1 row) 106 | 107 | injection_points_detach 108 | ----------------------- 109 | 110 | (1 row) 111 | 112 | -------------------------------------------------------------------------------- /specs/pg_rewrite_concurrent_toast.spec: -------------------------------------------------------------------------------- 1 | setup 2 | { 3 | CREATE EXTENSION injection_points; 4 | CREATE EXTENSION pg_rewrite; 5 | 6 | CREATE TABLE tbl_src(i int primary key, t text); 7 | 8 | INSERT INTO tbl_src(i, t) 9 | SELECT x, string_agg(random()::text, '') 10 | FROM generate_series(1, 2) g(x), generate_series(1, 200) h(y) 11 | GROUP BY x; 12 | 13 | CREATE TABLE tbl_dst(i int primary key, t text); 14 | } 15 | 16 | teardown 17 | { 18 | DROP EXTENSION injection_points; 19 | DROP EXTENSION pg_rewrite; 20 | DROP TABLE 
tbl_src; 21 | DROP TABLE tbl_src_old; 22 | } 23 | 24 | session s1 25 | setup 26 | { 27 | SELECT injection_points_attach('pg_rewrite-before-lock', 'wait'); 28 | SELECT injection_points_attach('pg_rewrite-after-commit', 'wait'); 29 | } 30 | # Perform the initial load and wait for s2 to do some data changes. 31 | # 32 | # Since pg_rewrite uses background worker, the isolation tester does not 33 | # recognize that the session waits on an injection point (because the worker 34 | # is who waits). Therefore use rewrite_table_nowait(), which only launches the 35 | # worker and goes on. The 'wait_for_s1_sleep' step below then checks until the 36 | # waiting started. 37 | step do_rewrite 38 | { 39 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 40 | } 41 | # Check the data. 42 | step do_check 43 | { 44 | TABLE pg_rewrite_progress; 45 | 46 | -- Each row should contain TOASTed value. 47 | SELECT count(*) FROM tbl_src WHERE pg_column_toast_chunk_id(t) ISNULL; 48 | 49 | -- The contents of the new table should be identical to that of the old 50 | -- one. 51 | SELECT count(*) 52 | FROM tbl_src t1 JOIN tbl_src_old t2 ON t1.i = t2.i 53 | WHERE t1.t <> t2.t; 54 | } 55 | 56 | session s2 57 | # Since s1 uses background worker, the backend executing 'wait_before_lock' 58 | # does not appear to be waiting on the injection point. Instead we need to 59 | # check explicitly if the waiting on the injection point is in progress, and 60 | # wait if it's not. 
61 | step wait_for_before_lock_ip 62 | { 63 | DO $$ 64 | BEGIN 65 | LOOP 66 | PERFORM pg_stat_clear_snapshot(); 67 | 68 | PERFORM 69 | FROM pg_stat_activity 70 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 71 | 72 | IF FOUND THEN 73 | EXIT; 74 | END IF; 75 | 76 | PERFORM pg_sleep(.1); 77 | END LOOP; 78 | END; 79 | $$; 80 | } 81 | step do_changes 82 | { 83 | INSERT INTO tbl_src(i, t) 84 | SELECT 5, string_agg(random()::text, '') 85 | FROM generate_series(1, 200) h(y); 86 | 87 | UPDATE tbl_src SET t = t || 'x' WHERE i = 1; 88 | } 89 | step wakeup_before_lock_ip 90 | { 91 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 92 | } 93 | # Wait until the concurrent changes have been committed by the pg_rewrite 94 | # worker. 95 | step wait_for_after_commit_ip 96 | { 97 | DO $$ 98 | BEGIN 99 | LOOP 100 | PERFORM pg_stat_clear_snapshot(); 101 | 102 | PERFORM 103 | FROM pg_stat_activity 104 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 105 | 106 | IF FOUND THEN 107 | EXIT; 108 | END IF; 109 | 110 | PERFORM pg_sleep(.1); 111 | END LOOP; 112 | END; 113 | $$; 114 | } 115 | # Like wakeup_before_lock_ip above. 
116 | step wakeup_after_commit_ip 117 | { 118 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 119 | } 120 | teardown 121 | { 122 | SELECT injection_points_detach('pg_rewrite-before-lock'); 123 | SELECT injection_points_detach('pg_rewrite-after-commit'); 124 | } 125 | 126 | permutation 127 | do_rewrite 128 | wait_for_before_lock_ip 129 | do_changes 130 | wakeup_before_lock_ip 131 | wait_for_after_commit_ip 132 | do_check 133 | wakeup_after_commit_ip 134 | -------------------------------------------------------------------------------- /specs/pg_rewrite_concurrent.spec: -------------------------------------------------------------------------------- 1 | setup 2 | { 3 | CREATE EXTENSION injection_points; 4 | CREATE EXTENSION pg_rewrite; 5 | 6 | CREATE TABLE tbl_src(i int primary key, j int, 7 | k int generated always as (-j) virtual, 8 | l int generated always as (-j) stored); 9 | INSERT INTO tbl_src(i, j) VALUES (1, 10), (4, 40), (7, 70); 10 | 11 | -- Change of data type and column order. 12 | CREATE TABLE tbl_dst(j int, i bigint primary key, k int, l int); 13 | } 14 | 15 | teardown 16 | { 17 | DROP EXTENSION injection_points; 18 | DROP EXTENSION pg_rewrite; 19 | DROP TABLE tbl_src; 20 | DROP TABLE tbl_src_old; 21 | } 22 | 23 | session s1 24 | setup 25 | { 26 | SELECT injection_points_attach('pg_rewrite-before-lock', 'wait'); 27 | SELECT injection_points_attach('pg_rewrite-after-commit', 'wait'); 28 | } 29 | # Perform the initial load and wait for s2 to do some data changes. 30 | # 31 | # Since pg_rewrite uses background worker, the isolation tester does not 32 | # recognize that the session waits on an injection point (because the worker 33 | # is who waits). Therefore use rewrite_table_nowait(), which only launches the 34 | # worker and goes on. The 'wait_for_s1_sleep' step below then checks until the 35 | # waiting started. 36 | step do_rewrite 37 | { 38 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 39 | } 40 | # Check the data. 
41 | step do_check 42 | { 43 | TABLE pg_rewrite_progress; 44 | 45 | SELECT i, j, k, l FROM tbl_src ORDER BY i, j; 46 | } 47 | 48 | session s2 49 | # Since s1 uses background worker, the backend executing 'wait_before_lock' 50 | # does not appear to be waiting on the injection point. Instead we need to 51 | # check explicitly if the waiting on the injection point is in progress, and 52 | # wait if it's not. 53 | step wait_for_before_lock_ip 54 | { 55 | DO $$ 56 | BEGIN 57 | LOOP 58 | PERFORM pg_stat_clear_snapshot(); 59 | 60 | PERFORM 61 | FROM pg_stat_activity 62 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 63 | 64 | IF FOUND THEN 65 | EXIT; 66 | END IF; 67 | 68 | PERFORM pg_sleep(.1); 69 | END LOOP; 70 | END; 71 | $$; 72 | } 73 | step do_changes 74 | { 75 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 76 | 77 | -- Update with no identity change. 78 | UPDATE tbl_src SET j=0 WHERE i=1; 79 | 80 | -- Update with identity change. 81 | UPDATE tbl_src SET i=6 WHERE i=4; 82 | 83 | -- Update a row we inserted, to check that the insertion is visible. 84 | UPDATE tbl_src SET j=7 WHERE i=2; 85 | -- ... and update it again, to check that the update is visible. 86 | UPDATE tbl_src SET j=8 WHERE j=7; 87 | 88 | -- Delete. 89 | DELETE FROM tbl_src WHERE i=7; 90 | } 91 | step wakeup_before_lock_ip 92 | { 93 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 94 | } 95 | # Wait until the concurrent changes have been committed by the pg_rewrite 96 | # worker. 97 | step wait_for_after_commit_ip 98 | { 99 | DO $$ 100 | BEGIN 101 | LOOP 102 | PERFORM pg_stat_clear_snapshot(); 103 | 104 | PERFORM 105 | FROM pg_stat_activity 106 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 107 | 108 | IF FOUND THEN 109 | EXIT; 110 | END IF; 111 | 112 | PERFORM pg_sleep(.1); 113 | END LOOP; 114 | END; 115 | $$; 116 | } 117 | # Like wakeup_before_lock_ip above. 
118 | step wakeup_after_commit_ip 119 | { 120 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 121 | } 122 | teardown 123 | { 124 | SELECT injection_points_detach('pg_rewrite-before-lock'); 125 | SELECT injection_points_detach('pg_rewrite-after-commit'); 126 | } 127 | 128 | permutation 129 | do_rewrite 130 | wait_for_before_lock_ip 131 | do_changes 132 | wakeup_before_lock_ip 133 | wait_for_after_commit_ip 134 | do_check 135 | wakeup_after_commit_ip 136 | -------------------------------------------------------------------------------- /specs/pg_rewrite_concurrent_partition.spec: -------------------------------------------------------------------------------- 1 | setup 2 | { 3 | CREATE EXTENSION injection_points; 4 | CREATE EXTENSION pg_rewrite; 5 | 6 | CREATE TABLE tbl_src(i int primary key, j int); 7 | INSERT INTO tbl_src(i, j) VALUES (1, 10), (4, 40); 8 | 9 | -- Besides partitioning, also test change of column type (int -> bigint). 10 | CREATE TABLE tbl_dst(i bigint primary key, j int) PARTITION BY RANGE(i); 11 | CREATE TABLE tbl_dst_part_1 PARTITION OF tbl_dst FOR VALUES FROM (1) TO (4); 12 | 13 | -- Create a partition with different order of columns, to test that 14 | -- partition maps work. 15 | CREATE TABLE tbl_dst_part_2(j int, i bigint primary key); 16 | ALTER TABLE tbl_dst ATTACH PARTITION tbl_dst_part_2 FOR VALUES FROM (4) TO (8); 17 | } 18 | 19 | teardown 20 | { 21 | DROP EXTENSION injection_points; 22 | DROP EXTENSION pg_rewrite; 23 | DROP TABLE tbl_src; 24 | DROP TABLE tbl_src_old; 25 | } 26 | 27 | session s1 28 | setup 29 | { 30 | SELECT injection_points_attach('pg_rewrite-before-lock', 'wait'); 31 | SELECT injection_points_attach('pg_rewrite-after-commit', 'wait'); 32 | } 33 | # Perform the initial load and wait for s2 to do some data changes. 34 | # 35 | # Since pg_rewrite uses background worker, the isolation tester does not 36 | # recognize that the session waits on an injection point (because the worker 37 | # is who waits). 
Therefore use rewrite_table_nowait(), which only launches the 38 | # worker and goes on. The 'wait_for_s1_sleep' step below then checks until the 39 | # waiting started. 40 | step do_rewrite 41 | { 42 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 43 | } 44 | # Check the data. 45 | step do_check 46 | { 47 | TABLE pg_rewrite_progress; 48 | 49 | SELECT i, j FROM tbl_src ORDER BY i, j; 50 | } 51 | 52 | session s2 53 | # Since s1 uses background worker, the backend executing 'wait_before_lock' 54 | # does not appear to be waiting on the injection point. Instead we need to 55 | # check explicitly if the waiting on the injection point is in progress, and 56 | # wait if it's not. 57 | step wait_for_before_lock_ip 58 | { 59 | DO $$ 60 | BEGIN 61 | LOOP 62 | PERFORM pg_stat_clear_snapshot(); 63 | 64 | PERFORM 65 | FROM pg_stat_activity 66 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 67 | 68 | IF FOUND THEN 69 | EXIT; 70 | END IF; 71 | 72 | PERFORM pg_sleep(.1); 73 | END LOOP; 74 | END; 75 | $$; 76 | } 77 | step do_changes 78 | { 79 | -- Insert one row into each partition. 80 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 81 | 82 | -- Update with no identity change. 83 | UPDATE tbl_src SET j=0 WHERE i=1; 84 | 85 | -- Update with identity change but within the same partition. 86 | UPDATE tbl_src SET i=6 WHERE i=5; 87 | 88 | -- Cross-partition update. 89 | UPDATE tbl_src SET i=7 WHERE i=3; 90 | 91 | -- Update a row we inserted and updated, to check that it's visible. 92 | UPDATE tbl_src SET j=4 WHERE i=7; 93 | 94 | -- Delete. 95 | DELETE FROM tbl_src WHERE i=4; 96 | } 97 | step wakeup_before_lock_ip 98 | { 99 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 100 | } 101 | # Wait until the concurrent changes have been committed by the pg_rewrite 102 | # worker. 
103 | step wait_for_after_commit_ip 104 | { 105 | DO $$ 106 | BEGIN 107 | LOOP 108 | PERFORM pg_stat_clear_snapshot(); 109 | 110 | PERFORM 111 | FROM pg_stat_activity 112 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 113 | 114 | IF FOUND THEN 115 | EXIT; 116 | END IF; 117 | 118 | PERFORM pg_sleep(.1); 119 | END LOOP; 120 | END; 121 | $$ 122 | } 123 | # Like wakeup_before_lock_ip above. 124 | step wakeup_after_commit_ip 125 | { 126 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 127 | } 128 | teardown 129 | { 130 | SELECT injection_points_detach('pg_rewrite-before-lock'); 131 | SELECT injection_points_detach('pg_rewrite-after-commit'); 132 | } 133 | 134 | permutation 135 | do_rewrite 136 | wait_for_before_lock_ip 137 | do_changes 138 | wakeup_before_lock_ip 139 | wait_for_after_commit_ip 140 | do_check 141 | wakeup_after_commit_ip 142 | -------------------------------------------------------------------------------- /sql/pg_rewrite.sql: -------------------------------------------------------------------------------- 1 | DROP EXTENSION IF EXISTS pg_rewrite; 2 | CREATE EXTENSION pg_rewrite; 3 | 4 | CREATE TABLE tab1(i int PRIMARY KEY, j int, k int); 5 | -- If a dropped column is encountered, the source tuple should be converted 6 | -- so it matches the destination table. 
7 | ALTER TABLE tab1 DROP COLUMN k; 8 | ALTER TABLE tab1 ADD COLUMN k int; 9 | INSERT INTO tab1(i, j, k) 10 | SELECT i, i / 2, i 11 | FROM generate_series(0, 1023) g(i); 12 | 13 | CREATE TABLE tab1_new(i int PRIMARY KEY, j int, k int) PARTITION BY RANGE(i); 14 | CREATE TABLE tab1_new_part_1 PARTITION OF tab1_new FOR VALUES FROM (0) TO (256); 15 | CREATE TABLE tab1_new_part_2 PARTITION OF tab1_new FOR VALUES FROM (256) TO (512); 16 | CREATE TABLE tab1_new_part_3 PARTITION OF tab1_new FOR VALUES FROM (512) TO (768); 17 | CREATE TABLE tab1_new_part_4 PARTITION OF tab1_new FOR VALUES FROM (768) TO (1024); 18 | 19 | -- Also test handling of constraints that require "manual" validation. 20 | ALTER TABLE tab1 ADD CHECK (k >= 0); 21 | 22 | CREATE TABLE tab1_fk(i int REFERENCES tab1); 23 | INSERT INTO tab1_fk(i) VALUES (1); 24 | \d tab1 25 | 26 | -- Process the table. 27 | SELECT rewrite_table('tab1', 'tab1_new', 'tab1_orig'); 28 | 29 | -- tab1 should now be partitioned. 30 | \d tab1 31 | 32 | -- Validate the constraints. 33 | ALTER TABLE tab1 VALIDATE CONSTRAINT tab1_k_check2; 34 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 35 | 36 | \d tab1 37 | 38 | EXPLAIN (COSTS off) SELECT * FROM tab1; 39 | 40 | -- Check that the contents has not changed. 
41 | SELECT count(*) FROM tab1; 42 | 43 | SELECT * 44 | FROM tab1 t FULL JOIN tab1_orig o ON t.i = o.i 45 | WHERE t.i ISNULL OR o.i ISNULL; 46 | 47 | -- List partitioning 48 | CREATE TABLE tab2(i int, j int, PRIMARY KEY (i, j)); 49 | INSERT INTO tab2(i, j) 50 | SELECT i, j 51 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 52 | 53 | CREATE TABLE tab2_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY LIST(i); 54 | CREATE TABLE tab2_new_part_1 PARTITION OF tab2_new FOR VALUES IN (1); 55 | CREATE TABLE tab2_new_part_2 PARTITION OF tab2_new FOR VALUES IN (2); 56 | CREATE TABLE tab2_new_part_3 PARTITION OF tab2_new FOR VALUES IN (3); 57 | CREATE TABLE tab2_new_part_4 PARTITION OF tab2_new FOR VALUES IN (4); 58 | 59 | SELECT rewrite_table('tab2', 'tab2_new', 'tab2_orig'); 60 | 61 | TABLE tab2_new_part_1; 62 | TABLE tab2_new_part_2; 63 | TABLE tab2_new_part_3; 64 | TABLE tab2_new_part_4; 65 | 66 | -- Hash partitioning 67 | CREATE TABLE tab3(i int, j int, PRIMARY KEY (i, j)); 68 | INSERT INTO tab3(i, j) 69 | SELECT i, j 70 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 71 | 72 | CREATE TABLE tab3_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY HASH(i); 73 | CREATE TABLE tab3_new_part_1 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 0); 74 | CREATE TABLE tab3_new_part_2 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 1); 75 | CREATE TABLE tab3_new_part_3 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 2); 76 | CREATE TABLE tab3_new_part_4 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 3); 77 | 78 | SELECT rewrite_table('tab3', 'tab3_new', 'tab3_orig'); 79 | 80 | TABLE tab3_new_part_1; 81 | TABLE tab3_new_part_2; 82 | TABLE tab3_new_part_3; 83 | TABLE tab3_new_part_4; 84 | 85 | -- Change of precision and scale of a numeric data type. 
86 | CREATE TABLE tab4(i int PRIMARY KEY, j numeric(3, 1)); 87 | INSERT INTO tab4(i, j) VALUES (1, 0.1); 88 | CREATE TABLE tab4_new(i int PRIMARY KEY, j numeric(4, 2)); 89 | TABLE tab4; 90 | SELECT rewrite_table('tab4', 'tab4_new', 'tab4_orig'); 91 | TABLE tab4; 92 | 93 | -- One more test for "manual" validation of FKs, this time we rewrite the PK 94 | -- table. The NOT VALID constraint cannot be used if the FK table is 95 | -- partitioned and if PG version is < 18, so we need a separate test. 96 | CREATE TABLE tab1_pk(i int primary key); 97 | INSERT INTO tab1_pk(i) VALUES (1); 98 | CREATE TABLE tab1_pk_new(i bigint primary key); 99 | 100 | DROP TABLE tab1_fk; 101 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk); 102 | INSERT INTO tab1_fk(i) VALUES (1); 103 | 104 | \d tab1_pk 105 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 106 | \d tab1_pk 107 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 108 | \d tab1_pk 109 | 110 | -- For the partitioned FK table, test at least that the FK creation is skipped 111 | -- (i.e. ERROR saying that NOT VALID is not supported is no raised) 112 | DROP TABLE tab1_fk; 113 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk) PARTITION BY RANGE (i); 114 | CREATE TABLE tab1_fk_1 PARTITION OF tab1_fk DEFAULT; 115 | INSERT INTO tab1_fk(i) VALUES (1); 116 | 117 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk_new; 118 | TRUNCATE TABLE tab1_pk_new; 119 | 120 | \d tab1_fk 121 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 122 | -- Note that tab1_fk still references tab1_pk_orig - that's expected. 123 | \d tab1_fk 124 | 125 | -- The same once again, but now rewrite the FK table. 
126 | DROP TABLE tab1_fk; 127 | DROP TABLE tab1_pk; 128 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk; 129 | CREATE TABLE tab1_fk(i int PRIMARY KEY REFERENCES tab1_pk); 130 | INSERT INTO tab1_fk(i) VALUES (1); 131 | CREATE TABLE tab1_fk_new(i int PRIMARY KEY) PARTITION BY RANGE (i); 132 | CREATE TABLE tab1_fk_new_1 PARTITION OF tab1_fk_new DEFAULT; 133 | \d tab1_fk 134 | SELECT rewrite_table('tab1_fk', 'tab1_fk_new', 'tab1_fk_orig'); 135 | \d tab1_fk 136 | 137 | -- Check if sequence on the target table is synchronized with that of the 138 | -- source table. 139 | CREATE TABLE tab5(i int primary key generated always as identity); 140 | CREATE TABLE tab5_new(i int primary key generated always as identity); 141 | INSERT INTO tab5(i) VALUES (DEFAULT); 142 | SELECT rewrite_table('tab5', 'tab5_new', 'tab5_orig'); 143 | INSERT INTO tab5(i) VALUES (DEFAULT); 144 | SELECT i FROM tab5 ORDER BY i; 145 | 146 | -- The same with serial column. 147 | CREATE TABLE tab6(i serial primary key); 148 | CREATE TABLE tab6_new(i serial primary key); 149 | INSERT INTO tab6(i) VALUES (DEFAULT); 150 | SELECT rewrite_table('tab6', 'tab6_new', 'tab6_orig'); 151 | INSERT INTO tab6(i) VALUES (DEFAULT); 152 | SELECT i FROM tab6 ORDER BY i; 153 | -------------------------------------------------------------------------------- /pg_rewrite.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------- 2 | * 3 | * pg_rewrite.h 4 | * Tools for maintenance that requires table rewriting. 
5 | * 6 | * Copyright (c) 2021-2025, Cybertec PostgreSQL International GmbH 7 | * 8 | *---------------------------------------------------------------- 9 | */ 10 | 11 | #include 12 | 13 | #include "c.h" 14 | #include "postgres.h" 15 | #include "fmgr.h" 16 | #include "miscadmin.h" 17 | 18 | #include "access/genam.h" 19 | #include "access/heapam.h" 20 | #include "access/relscan.h" 21 | #include "access/xlog_internal.h" 22 | #include "access/xact.h" 23 | #include "catalog/pg_class.h" 24 | #include "nodes/execnodes.h" 25 | #include "postmaster/bgworker.h" 26 | #include "replication/logical.h" 27 | #include "replication/origin.h" 28 | #include "utils/inval.h" 29 | #include "utils/resowner.h" 30 | #include "utils/snapmgr.h" 31 | 32 | typedef struct DecodingOutputState 33 | { 34 | /* The relation whose changes we're decoding. */ 35 | Oid relid; 36 | 37 | /* 38 | * Decoded changes are stored here. Although we try to avoid excessive 39 | * batches, it can happen that the changes need to be stored to disk. The 40 | * tuplestore does this transparently. 41 | */ 42 | Tuplestorestate *tstore; 43 | 44 | /* The current number of changes in tstore. */ 45 | double nchanges; 46 | 47 | /* 48 | * Descriptor to store the ConcurrentChange structure serialized (bytea). 49 | * We can't store the tuple directly because tuplestore only supports 50 | * minimum tuple and we may need to transfer OID system column from the 51 | * output plugin. Also we need to transfer the change kind, so it's better 52 | * to put everything in the structure than to use 2 tuplestores "in 53 | * parallel". 54 | */ 55 | TupleDesc tupdesc_change; 56 | 57 | /* 58 | * Tuple descriptor needed process the concurrent data changes. 59 | */ 60 | TupleDesc tupdesc_src; 61 | 62 | /* Slot to retrieve data from tstore. */ 63 | TupleTableSlot *tsslot; 64 | 65 | /* 66 | * WAL records having this origin have been created by the initial load 67 | * and should not be decoded. 
68 | */ 69 | RepOriginId rorigin; 70 | 71 | ResourceOwner resowner; 72 | } DecodingOutputState; 73 | 74 | /* The WAL segment being decoded. */ 75 | extern XLogSegNo rewrite_current_segment; 76 | 77 | extern void _PG_init(void); 78 | 79 | /* Progress tracking. */ 80 | typedef struct TaskProgress 81 | { 82 | /* Tuples inserted during the initial load. */ 83 | int64 ins_initial; 84 | 85 | /* 86 | * Tuples inserted, updated and deleted after the initial load (i.e. 87 | * during the catch-up phase). 88 | */ 89 | int64 ins; 90 | int64 upd; 91 | int64 del; 92 | } TaskProgress; 93 | 94 | /* 95 | * The new implementation, which delegates the execution to a background 96 | * worker (as opposed to the PG executor). 97 | * 98 | * Arguments are passed to the worker via this structure, located in the 99 | * shared memory. 100 | */ 101 | typedef struct WorkerTask 102 | { 103 | /* Connection info. */ 104 | Oid dbid; 105 | Oid roleid; 106 | 107 | /* Worker that performs the task both sets and clears this field. */ 108 | pid_t pid; 109 | 110 | /* See the comments of pg_rewrite_exit_if_requested(). */ 111 | bool exit_requested; 112 | 113 | /* The progress is only valid if the dbid is valid. */ 114 | TaskProgress progress; 115 | 116 | /* 117 | * Use this when setting / clearing the fields above. Once dbid is set, 118 | * the task belongs to the backend that set it, so the other fields may be 119 | * assigned w/o the lock. 120 | */ 121 | slock_t mutex; 122 | 123 | /* The tables to work on. */ 124 | NameData relschema; 125 | NameData relname; 126 | NameData relname_new; 127 | NameData relschema_dst; 128 | NameData relname_dst; 129 | 130 | /* 131 | * Space for the worker to send an error message to the backend. 132 | * 133 | * XXX Note that later messages overwrite the earlier ones, so only the 134 | * last message is received. Is it worth using a queue instead? 135 | */ 136 | #define MAX_ERR_MSG_LEN 1024 137 | char msg[MAX_ERR_MSG_LEN]; 138 | 139 | /* Detailed error message. 
*/ 140 | char msg_detail[MAX_ERR_MSG_LEN]; 141 | 142 | int elevel; 143 | 144 | /* 145 | * Should rewrite_table() return w/o waiting for the worker's exit? If 146 | * this flag is set, the worker is responsible for releasing the 147 | * task. Otherwise the worker must not release the task because the 148 | * backend might be interested in 'msg' and 'msg_detail'. 149 | */ 150 | bool nowait; 151 | 152 | int max_xlock_time; 153 | } WorkerTask; 154 | 155 | #define MAX_TASKS 8 156 | 157 | /* Each backend stores here the pointer to its task in the shared memory. */ 158 | extern WorkerTask *MyWorkerTask; 159 | 160 | /* 161 | * Like AttrMap in PG core, but here we add an array of expressions to coerce 162 | * the input values to output ones. (A new name is needed as it's hard to 163 | * avoid inclusion of the in-core structure.) 164 | */ 165 | typedef struct AttrMapExt 166 | { 167 | AttrNumber *attnums; 168 | int maplen; 169 | bool dropped_attr; /* Has outer or inner descriptor a dropped 170 | * attribute? */ 171 | Node **exprsIn; /* Non-NULL field tells how to convert the input 172 | * value to the output data type and/or to 173 | * evaluate the column expression. NULL indicates 174 | * that no conversion is needed and that there is 175 | * no expression for given column. */ 176 | Node **exprsOut; /* 177 | * Likewise, expression to compute the value of an 178 | * output column. 179 | */ 180 | } AttrMapExt; 181 | 182 | /* 183 | * Like TupleConversionMap in PG core, but here we add an array of expressions 184 | * to coerce the input values to output ones. (A new name is needed as it's 185 | * hard to avoid inclusion of the in-core structure.) 
186 | */ 187 | typedef struct TupleConversionMapExt 188 | { 189 | TupleDesc indesc; /* tupdesc for source rowtype */ 190 | TupleDesc outdesc; /* tupdesc for result rowtype */ 191 | AttrMapExt *attrMap; /* indexes of input fields, or 0 for null */ 192 | Datum *invalues; /* workspace for deconstructing source */ 193 | bool *inisnull; 194 | ExprState **exprsIn; /* See AttrMapExt */ 195 | ExprState **exprsOut; /* See AttrMapExt */ 196 | EState *estate; /* Executor state used to evaluate 197 | * coerceExprs. */ 198 | TupleTableSlot *in_slot; /* Slot to store the input tuple for 199 | * coercion. */ 200 | TupleTableSlot *out_slot; /* Slot to construct the output tuple. */ 201 | } TupleConversionMapExt; 202 | 203 | /* 204 | * Hash table to cache partition-specific information. 205 | */ 206 | typedef struct PartitionEntry 207 | { 208 | Oid part_oid; /* key */ 209 | Relation ident_index; 210 | 211 | /* 212 | * Slot (TTSOpsHeapTuple) to apply data changes to the partition. 213 | */ 214 | TupleTableSlot *slot; 215 | 216 | /* 217 | * Slot to retrieve tuples from the partition. Separate from 'slot' 218 | * because it has to be TTSOpsBufferHeapTuple. 219 | */ 220 | TupleTableSlot *slot_ind; 221 | 222 | /* This should make insertions into partitions more efficient. */ 223 | BulkInsertState bistate; 224 | 225 | /* 226 | * Map to convert tuples that match the partitioned table so they match 227 | * this partition.
228 | */ 229 | TupleConversionMapExt *conv_map; 230 | 231 | char status; /* used by simplehash */ 232 | } PartitionEntry; 233 | 234 | #define SH_PREFIX partitions 235 | #define SH_ELEMENT_TYPE PartitionEntry 236 | #define SH_KEY_TYPE Oid 237 | #define SH_KEY part_oid 238 | #define SH_HASH_KEY(tb, key) (key) 239 | #define SH_EQUAL(tb, a, b) ((a) == (b)) 240 | #define SH_SCOPE static inline 241 | #define SH_DECLARE 242 | #define SH_DEFINE 243 | #include "lib/simplehash.h" 244 | 245 | extern PGDLLEXPORT void rewrite_worker_main(Datum main_arg); 246 | 247 | extern void pg_rewrite_exit_if_requested(void); 248 | 249 | /* 250 | * Use function names distinct from those in pg_squeeze, in case both 251 | * extensions are installed. 252 | */ 253 | extern bool pg_rewrite_process_concurrent_changes(EState *estate, 254 | ModifyTableState *mtstate, 255 | struct PartitionTupleRouting *proute, 256 | LogicalDecodingContext *ctx, 257 | XLogRecPtr end_of_wal, 258 | ScanKey ident_key, 259 | int ident_key_nentries, 260 | Relation ident_index, 261 | TupleTableSlot *slot_dst_ind, 262 | LOCKMODE lock_held, 263 | partitions_hash *partitions, 264 | TupleConversionMapExt *conv_map, 265 | struct timeval *must_complete); 266 | extern bool pg_rewrite_decode_concurrent_changes(LogicalDecodingContext *ctx, 267 | XLogRecPtr end_of_wal, 268 | struct timeval *must_complete); 269 | extern HeapTuple convert_tuple_for_dest_table(HeapTuple tuple, 270 | TupleConversionMapExt *conv_map); 271 | extern void _PG_output_plugin_init(OutputPluginCallbacks *cb); 272 | extern PartitionEntry *get_partition_entry(partitions_hash *partitions, 273 | Oid part_oid); 274 | extern HeapTuple pg_rewrite_execute_attr_map_tuple(HeapTuple tuple, 275 | TupleConversionMapExt *map); 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_rewrite 2 | 3 | `pg_rewrite` is a tool to rewrite a table (i.e.
to copy its data to a new 4 | file). It allows both read and write access to the table during the rewriting. 5 | 6 | Following are the most common reasons to rewrite a table: 7 | 8 | 1. Change data type of column(s) 9 | 10 | Typically this is needed if the existing data type is running out of 11 | values. For example, you may need to change `integer` type to 12 | `bigint`. `ALTER TABLE` command can do that too, but it allows neither 13 | write nor read access to the table during the rewriting. 14 | 15 | 2. Partition the table 16 | 17 | If you realize that your table is getting much bigger than expected and 18 | that partitioning would make your life easier, the next question may be 19 | how to copy the existing data to the new, partitioned table without 20 | stopping all the applications that run DML commands on the table. (When 21 | you decide to use partitioning, the amount of data to copy might already 22 | be significant, so the copying might need a while.) 23 | 24 | 3. Change order of columns 25 | 26 | If you conclude that a different order of columns would save significant 27 | disk space (due to reduced padding), the problem boils down to copying 28 | data to a new table like in 2). Again, you may need `pg_rewrite` to make 29 | the change smooth. 30 | 31 | 4. Move table into another tablespace. 32 | 33 | `ALTER TABLE` command can do that, but it allows neither write nor read 34 | access to the table during the rewriting. With `pg_rewrite`, you only need 35 | to create the new table in the desired tablespace. The rest is identical 36 | to the other use cases. 37 | 38 | Note that the above use cases can be combined in a single rewrite. 39 | 40 | 41 | # INSTALLATION 42 | 43 | Install PostgreSQL before proceeding. Make sure to have `pg_config` binary, 44 | these are typically included in `-dev` and `-devel` packages. PostgreSQL server 45 | version 13 or later is required.
46 | 47 | ```bash 48 | git clone https://github.com/cybertec-postgresql/pg_rewrite.git 49 | cd pg_rewrite 50 | git checkout 51 | make 52 | make install 53 | ``` 54 | 55 | Add these to `postgresql.conf`: 56 | 57 | ``` 58 | wal_level = logical 59 | max_replication_slots = 1 # ... or add 1 to the current value. 60 | shared_preload_libraries = 'pg_rewrite' # ... or add the library to the existing ones. 61 | ``` 62 | 63 | Restart the cluster, and invoke: 64 | 65 | ``` 66 | CREATE EXTENSION pg_rewrite; 67 | ``` 68 | 69 | # USAGE 70 | 71 | Assume you have a table defined like this 72 | 73 | ``` 74 | CREATE TABLE measurement ( 75 | id int, 76 | city_id int not null, 77 | logdate date not null, 78 | peaktemp int, 79 | PRIMARY KEY(id, logdate) 80 | ); 81 | ``` 82 | 83 | and you need to replace it with a partitioned table. At the same time, you 84 | want to change the data type of the `id` column to `bigint`. 85 | 86 | 87 | ``` 88 | CREATE TABLE measurement_aux ( 89 | id bigint, 90 | city_id int not null, 91 | logdate date not null, 92 | peaktemp int, 93 | PRIMARY KEY(id, logdate) 94 | ) PARTITION BY RANGE (logdate); 95 | ``` 96 | 97 | Then create partitions for all the rows currently present in the `measurement` 98 | table, and also for the data that might be inserted during processing: 99 | 100 | ``` 101 | CREATE TABLE measurement_y2006m02 PARTITION OF measurement_aux 102 | FOR VALUES FROM ('2006-02-01') TO ('2006-03-01'); 103 | 104 | CREATE TABLE measurement_y2006m03 PARTITION OF measurement_aux 105 | FOR VALUES FROM ('2006-03-01') TO ('2006-04-01'); 106 | 107 | -- ... 108 | ``` 109 | 110 | *It's essential that both the source (`measurement`) and target 111 | (`measurement_aux`) table have an identity index. It is needed to process data 112 | changes that applications make while data is being copied from the source to 113 | the target table. If the replica identity of the table is DEFAULT or FULL, 114 | primary key constraint provides the identity index. 
If your table has no 115 | primary key, you need to set the identity index explicitly using the [ALTER 116 | TABLE ... REPLICA IDENTITY USING INDEX ...][1] command. 117 | 118 | Also note that the key (i.e. column list) of the identity index of the source 119 | and target table must be identical.* 120 | 121 | Then, in order to copy the data into the target table, run the 122 | `rewrite_table()` function and pass it both the source and target table, as 123 | well as a new table name for the source table. For example: 124 | 125 | ``` 126 | SELECT rewrite_table('measurement', 'measurement_aux', 'measurement_old'); 127 | ``` 128 | 129 | The call will first copy all rows from `measurement` to `measurement_aux`. Then 130 | it will apply to `measurement_aux` all the data changes (INSERT, UPDATE, 131 | DELETE) that took place in `measurement` during the copying. Next, it will 132 | lock `measurement` so that neither read nor write access is possible. Finally 133 | it will rename `measurement` to `measurement_old` and `measurement_aux` to 134 | `measurement`. Thus `measurement` ends up being the partitioned table, while 135 | `measurement_old` is the original, non-partitioned table. 136 | 137 | If a column of the target table has a different data type from the 138 | corresponding column of the source table, an implicit or assignment cast must 139 | exist between the two types. 140 | 141 | # Constraints 142 | 143 | The target table should obviously end up with the same constraints as the 144 | source table. It's recommended to handle constraints creation this way: 145 | 146 | 1. Add PRIMARY KEY, UNIQUE and EXCLUDE constraints of the source table to the 147 | target table before you call `rewrite_table()`. These are enforced during 148 | the rewriting, so any violation would make `rewrite_table()` fail 149 | (ROLLBACK).
(The constraints must have been enforced in the source table, 150 | but it does not hurt to check them in the target table, especially if the 151 | column data type is being changed.) 152 | 153 | 2. If the version of PostgreSQL server is 17 or lower, add NOT NULL 154 | constraints of the source table to the target table. `rewrite_table()` 155 | by-passes validation of these, but all the rows it inserts into the target 156 | table must have been validated in the source table. Even if the column 157 | data type is different in the target table, the data type conversion 158 | should not turn a non-NULL value to NULL or vice versa. 159 | 160 | 3. CHECK constraints are created automatically by `rewrite_table()` 161 | (according to the source table) when all the data changes have been 162 | applied to the target table. However, these constraints are created as NOT 163 | VALID, so you need to use the `ALTER TABLE ... VALIDATE CONSTRAINT ...` 164 | command to validate them. 165 | 166 | (The function does not create these constraints immediately as valid, 167 | because that could imply blocking access to the table for significant 168 | time.) 169 | 170 | 4. If the version of PostgreSQL server is 18 or higher, NOT NULL constraints 171 | are also created automatically and need to be validated using the `ALTER 172 | TABLE ... VALIDATE CONSTRAINT ...` command. 173 | 174 | 5. FOREIGN KEY constraints are also created automatically (according to the 175 | source table) and need to be validated using the `ALTER TABLE ... VALIDATE 176 | CONSTRAINT ...` command, unless the referencing table is partitioned and 177 | the version of PostgreSQL server is 17 or lower: those versions do not 178 | support the NOT VALID option for partitioned tables. 179 | 180 | Therefore, if the referencing table is partitioned and if the server 181 | version is 17 or lower, you need to use the `ALTER TABLE ... ADD 182 | CONSTRAINT ... FOREIGN KEY ...` command after `rewrite_table()` has 183 | finished.
Please run the command as soon as possible to minimize the risk 184 | that applications modify the database in a way that violates the 185 | constraints. 186 | 187 | 6. Drop all foreign keys involving the source table. 188 | 189 | You probably want to drop the source table anyway, but if you don't, you 190 | should at least drop its FOREIGN KEY constraints. As the table was 191 | renamed, applications will no longer update it. Therefore, attempts to 192 | update the other tables involved in its foreign keys may cause errors. 193 | 194 | # Sequences 195 | 196 | If a sequence is used to generate column value in the source table (typically 197 | the column data type is `serial` or the column is declared `GENERATED ... AS 198 | IDENTITY`), and if `rewrite_table()` finds the corresponding sequence for the 199 | target table, it sets its value according to the sequence for the source 200 | table. If it cannot identify the sequence for the target table, a log message 201 | is printed out. 202 | 203 | # Progress monitoring 204 | 205 | If `rewrite_table()` takes a long time to finish, you might be interested in the 206 | progress. The `pg_rewrite_progress` view shows all the pending calls of the 207 | function in the current database. The `src_table`, `dst_table` and 208 | `src_table_new` columns contain the arguments of the `rewrite_table()` 209 | function. `ins_initial` is the number of tuples inserted into the new table 210 | storage during the "initial load stage", i.e. the number of tuples present in 211 | the table before the processing started. On the other hand, `ins`, `upd` and 212 | `del` are the numbers of tuples inserted, updated and deleted by applications 213 | during the table processing. (These "concurrent data changes" must also be 214 | incorporated into the partitioned table, otherwise they'd get lost.) 215 | 216 | # Limitations 217 | 218 | 1. If the target table is partitioned, it's not allowed to have foreign 219 | tables as partitions. 220 | 221 | 2.
Indexes are not renamed. 222 | 223 | While the target table (`measurement_aux` above) is renamed to the source 224 | table (`measurement`), its indexes are not renamed to match the source 225 | table. If you consider it a problem, please use the `ALTER INDEX` command 226 | to rename them. This operation blocks neither reads nor writes. 227 | 228 | # Configuration 229 | 230 | Following is the description of the configuration variables that affect 231 | behavior of the functions of this extension. 232 | 233 | * `rewrite.max_xlock_time` 234 | 235 | Although the table being processed is available for both read and write 236 | operations by other transactions most of the time, an exclusive lock is needed 237 | to finalize the processing (i.e. to do the table renaming), which blocks both 238 | read and write access. This should take a very short time that users should 239 | hardly notice. 240 | 241 | However, if a significant amount of changes took place in the source table 242 | while the extension was waiting for the (exclusive) lock, the outage might 243 | take proportionally longer time. The point is that those changes need to be 244 | propagated to the target table before the exclusive lock can be released. 245 | 246 | If the extension function seems to block access to tables too much, consider 247 | setting `rewrite.max_xlock_time` GUC parameter. For example: 248 | 249 | ``` 250 | SET rewrite.max_xlock_time TO 100; 251 | ``` 252 | 253 | This tells that the exclusive lock shouldn't be held for more than 0.1 second (100 254 | milliseconds). If more time is needed for the final stage, the particular 255 | function releases the exclusive lock, processes the changes committed by the 256 | other transactions in between and tries the final stage again. Error is 257 | reported if the lock duration is exceeded a few more times. If that happens, 258 | you should either increase the setting or try to process the problematic table 259 | later, when the write activity is lower.
260 | 261 | The default value is `0`, meaning that the final stage can take as much time as 262 | it needs. 263 | 264 | # Concurrency 265 | 266 | 1. While the rewrite_table() function is executing, `ALTER TABLE` command on 267 | the same table should be blocked until the rewriting is done. However, in 268 | some cases the `ALTER TABLE` command and the rewrite_table() function might 269 | end up in a deadlock. Therefore it's recommended not to run ALTER TABLE on 270 | a table which is being rewritten. 271 | 272 | 2. The `rewrite_table()` function allows for MVCC-unsafe behavior described in 273 | the first paragraph of [mvcc-caveats][2]. 274 | 275 | 276 | [1] https://www.postgresql.org/docs/17/sql-altertable.html 277 | [2] https://www.postgresql.org/docs/current/mvcc-caveats.html 278 | -------------------------------------------------------------------------------- /expected/pg_rewrite_1.out: -------------------------------------------------------------------------------- 1 | DROP EXTENSION IF EXISTS pg_rewrite; 2 | NOTICE: extension "pg_rewrite" does not exist, skipping 3 | CREATE EXTENSION pg_rewrite; 4 | CREATE TABLE tab1(i int PRIMARY KEY, j int, k int); 5 | -- If a dropped column is encountered, the source tuple should be converted 6 | -- so it matches the destination table. 7 | ALTER TABLE tab1 DROP COLUMN k; 8 | ALTER TABLE tab1 ADD COLUMN k int; 9 | INSERT INTO tab1(i, j, k) 10 | SELECT i, i / 2, i 11 | FROM generate_series(0, 1023) g(i); 12 | CREATE TABLE tab1_new(i int PRIMARY KEY, j int, k int) PARTITION BY RANGE(i); 13 | CREATE TABLE tab1_new_part_1 PARTITION OF tab1_new FOR VALUES FROM (0) TO (256); 14 | CREATE TABLE tab1_new_part_2 PARTITION OF tab1_new FOR VALUES FROM (256) TO (512); 15 | CREATE TABLE tab1_new_part_3 PARTITION OF tab1_new FOR VALUES FROM (512) TO (768); 16 | CREATE TABLE tab1_new_part_4 PARTITION OF tab1_new FOR VALUES FROM (768) TO (1024); 17 | -- Also test handling of constraints that require "manual" validation. 
18 | ALTER TABLE tab1 ADD CHECK (k >= 0); 19 | CREATE TABLE tab1_fk(i int REFERENCES tab1); 20 | INSERT INTO tab1_fk(i) VALUES (1); 21 | \d tab1 22 | Table "public.tab1" 23 | Column | Type | Collation | Nullable | Default 24 | --------+---------+-----------+----------+--------- 25 | i | integer | | not null | 26 | j | integer | | | 27 | k | integer | | | 28 | Indexes: 29 | "tab1_pkey" PRIMARY KEY, btree (i) 30 | Check constraints: 31 | "tab1_k_check" CHECK (k >= 0) 32 | Referenced by: 33 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1(i) 34 | 35 | -- Process the table. 36 | SELECT rewrite_table('tab1', 'tab1_new', 'tab1_orig'); 37 | rewrite_table 38 | --------------- 39 | 40 | (1 row) 41 | 42 | -- tab1 should now be partitioned. 43 | \d tab1 44 | Partitioned table "public.tab1" 45 | Column | Type | Collation | Nullable | Default 46 | --------+---------+-----------+----------+--------- 47 | i | integer | | not null | 48 | j | integer | | | 49 | k | integer | | | 50 | Partition key: RANGE (i) 51 | Indexes: 52 | "tab1_new_pkey" PRIMARY KEY, btree (i) 53 | Check constraints: 54 | "tab1_k_check2" CHECK (k >= 0) NOT VALID 55 | Referenced by: 56 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) NOT VALID 57 | Number of partitions: 4 (Use \d+ to list them.) 58 | 59 | -- Validate the constraints. 
60 | ALTER TABLE tab1 VALIDATE CONSTRAINT tab1_k_check2; 61 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 62 | \d tab1 63 | Partitioned table "public.tab1" 64 | Column | Type | Collation | Nullable | Default 65 | --------+---------+-----------+----------+--------- 66 | i | integer | | not null | 67 | j | integer | | | 68 | k | integer | | | 69 | Partition key: RANGE (i) 70 | Indexes: 71 | "tab1_new_pkey" PRIMARY KEY, btree (i) 72 | Check constraints: 73 | "tab1_k_check2" CHECK (k >= 0) 74 | Referenced by: 75 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) 76 | Number of partitions: 4 (Use \d+ to list them.) 77 | 78 | EXPLAIN (COSTS off) SELECT * FROM tab1; 79 | QUERY PLAN 80 | ------------------------------------------ 81 | Append 82 | -> Seq Scan on tab1_new_part_1 tab1_1 83 | -> Seq Scan on tab1_new_part_2 tab1_2 84 | -> Seq Scan on tab1_new_part_3 tab1_3 85 | -> Seq Scan on tab1_new_part_4 tab1_4 86 | (5 rows) 87 | 88 | -- Check that the contents has not changed. 
89 | SELECT count(*) FROM tab1; 90 | count 91 | ------- 92 | 1024 93 | (1 row) 94 | 95 | SELECT * 96 | FROM tab1 t FULL JOIN tab1_orig o ON t.i = o.i 97 | WHERE t.i ISNULL OR o.i ISNULL; 98 | i | j | k | i | j | k 99 | ---+---+---+---+---+--- 100 | (0 rows) 101 | 102 | -- List partitioning 103 | CREATE TABLE tab2(i int, j int, PRIMARY KEY (i, j)); 104 | INSERT INTO tab2(i, j) 105 | SELECT i, j 106 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 107 | CREATE TABLE tab2_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY LIST(i); 108 | CREATE TABLE tab2_new_part_1 PARTITION OF tab2_new FOR VALUES IN (1); 109 | CREATE TABLE tab2_new_part_2 PARTITION OF tab2_new FOR VALUES IN (2); 110 | CREATE TABLE tab2_new_part_3 PARTITION OF tab2_new FOR VALUES IN (3); 111 | CREATE TABLE tab2_new_part_4 PARTITION OF tab2_new FOR VALUES IN (4); 112 | SELECT rewrite_table('tab2', 'tab2_new', 'tab2_orig'); 113 | rewrite_table 114 | --------------- 115 | 116 | (1 row) 117 | 118 | TABLE tab2_new_part_1; 119 | i | j 120 | ---+--- 121 | 1 | 1 122 | 1 | 2 123 | 1 | 3 124 | 1 | 4 125 | (4 rows) 126 | 127 | TABLE tab2_new_part_2; 128 | i | j 129 | ---+--- 130 | 2 | 1 131 | 2 | 2 132 | 2 | 3 133 | 2 | 4 134 | (4 rows) 135 | 136 | TABLE tab2_new_part_3; 137 | i | j 138 | ---+--- 139 | 3 | 1 140 | 3 | 2 141 | 3 | 3 142 | 3 | 4 143 | (4 rows) 144 | 145 | TABLE tab2_new_part_4; 146 | i | j 147 | ---+--- 148 | 4 | 1 149 | 4 | 2 150 | 4 | 3 151 | 4 | 4 152 | (4 rows) 153 | 154 | -- Hash partitioning 155 | CREATE TABLE tab3(i int, j int, PRIMARY KEY (i, j)); 156 | INSERT INTO tab3(i, j) 157 | SELECT i, j 158 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 159 | CREATE TABLE tab3_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY HASH(i); 160 | CREATE TABLE tab3_new_part_1 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 0); 161 | CREATE TABLE tab3_new_part_2 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 1); 162 | CREATE TABLE tab3_new_part_3 
PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 2); 163 | CREATE TABLE tab3_new_part_4 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 3); 164 | SELECT rewrite_table('tab3', 'tab3_new', 'tab3_orig'); 165 | rewrite_table 166 | --------------- 167 | 168 | (1 row) 169 | 170 | TABLE tab3_new_part_1; 171 | i | j 172 | ---+--- 173 | 1 | 1 174 | 1 | 2 175 | 1 | 3 176 | 1 | 4 177 | (4 rows) 178 | 179 | TABLE tab3_new_part_2; 180 | i | j 181 | ---+--- 182 | 3 | 1 183 | 3 | 2 184 | 3 | 3 185 | 3 | 4 186 | (4 rows) 187 | 188 | TABLE tab3_new_part_3; 189 | i | j 190 | ---+--- 191 | 2 | 1 192 | 2 | 2 193 | 2 | 3 194 | 2 | 4 195 | (4 rows) 196 | 197 | TABLE tab3_new_part_4; 198 | i | j 199 | ---+--- 200 | 4 | 1 201 | 4 | 2 202 | 4 | 3 203 | 4 | 4 204 | (4 rows) 205 | 206 | -- Change of precision and scale of a numeric data type. 207 | CREATE TABLE tab4(i int PRIMARY KEY, j numeric(3, 1)); 208 | INSERT INTO tab4(i, j) VALUES (1, 0.1); 209 | CREATE TABLE tab4_new(i int PRIMARY KEY, j numeric(4, 2)); 210 | TABLE tab4; 211 | i | j 212 | ---+----- 213 | 1 | 0.1 214 | (1 row) 215 | 216 | SELECT rewrite_table('tab4', 'tab4_new', 'tab4_orig'); 217 | rewrite_table 218 | --------------- 219 | 220 | (1 row) 221 | 222 | TABLE tab4; 223 | i | j 224 | ---+------ 225 | 1 | 0.10 226 | (1 row) 227 | 228 | -- One more test for "manual" validation of FKs, this time we rewrite the PK 229 | -- table. The NOT VALID constraint cannot be used if the FK table is 230 | -- partitioned and if PG version is < 18, so we need a separate test. 
231 | CREATE TABLE tab1_pk(i int primary key); 232 | INSERT INTO tab1_pk(i) VALUES (1); 233 | CREATE TABLE tab1_pk_new(i bigint primary key); 234 | DROP TABLE tab1_fk; 235 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk); 236 | INSERT INTO tab1_fk(i) VALUES (1); 237 | \d tab1_pk 238 | Table "public.tab1_pk" 239 | Column | Type | Collation | Nullable | Default 240 | --------+---------+-----------+----------+--------- 241 | i | integer | | not null | 242 | Indexes: 243 | "tab1_pk_pkey" PRIMARY KEY, btree (i) 244 | Referenced by: 245 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 246 | 247 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 248 | rewrite_table 249 | --------------- 250 | 251 | (1 row) 252 | 253 | \d tab1_pk 254 | Table "public.tab1_pk" 255 | Column | Type | Collation | Nullable | Default 256 | --------+--------+-----------+----------+--------- 257 | i | bigint | | not null | 258 | Indexes: 259 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 260 | Referenced by: 261 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) NOT VALID 262 | 263 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 264 | \d tab1_pk 265 | Table "public.tab1_pk" 266 | Column | Type | Collation | Nullable | Default 267 | --------+--------+-----------+----------+--------- 268 | i | bigint | | not null | 269 | Indexes: 270 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 271 | Referenced by: 272 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) 273 | 274 | -- For the partitioned FK table, test at least that the FK creation is skipped 275 | -- (i.e. 
ERROR saying that NOT VALID is not supported is no raised) 276 | DROP TABLE tab1_fk; 277 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk) PARTITION BY RANGE (i); 278 | CREATE TABLE tab1_fk_1 PARTITION OF tab1_fk DEFAULT; 279 | INSERT INTO tab1_fk(i) VALUES (1); 280 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk_new; 281 | TRUNCATE TABLE tab1_pk_new; 282 | \d tab1_fk 283 | Partitioned table "public.tab1_fk" 284 | Column | Type | Collation | Nullable | Default 285 | --------+---------+-----------+----------+--------- 286 | i | integer | | | 287 | Partition key: RANGE (i) 288 | Foreign-key constraints: 289 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 290 | Number of partitions: 1 (Use \d+ to list them.) 291 | 292 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 293 | rewrite_table 294 | --------------- 295 | 296 | (1 row) 297 | 298 | -- Note that tab1_fk still references tab1_pk_orig - that's expected. 299 | \d tab1_fk 300 | Partitioned table "public.tab1_fk" 301 | Column | Type | Collation | Nullable | Default 302 | --------+---------+-----------+----------+--------- 303 | i | integer | | | 304 | Partition key: RANGE (i) 305 | Foreign-key constraints: 306 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk_orig(i) 307 | Number of partitions: 1 (Use \d+ to list them.) 308 | 309 | -- The same once again, but now rewrite the FK table. 
310 | DROP TABLE tab1_fk; 311 | DROP TABLE tab1_pk; 312 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk; 313 | CREATE TABLE tab1_fk(i int PRIMARY KEY REFERENCES tab1_pk); 314 | INSERT INTO tab1_fk(i) VALUES (1); 315 | CREATE TABLE tab1_fk_new(i int PRIMARY KEY) PARTITION BY RANGE (i); 316 | CREATE TABLE tab1_fk_new_1 PARTITION OF tab1_fk_new DEFAULT; 317 | \d tab1_fk 318 | Table "public.tab1_fk" 319 | Column | Type | Collation | Nullable | Default 320 | --------+---------+-----------+----------+--------- 321 | i | integer | | not null | 322 | Indexes: 323 | "tab1_fk_pkey" PRIMARY KEY, btree (i) 324 | Foreign-key constraints: 325 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 326 | 327 | SELECT rewrite_table('tab1_fk', 'tab1_fk_new', 'tab1_fk_orig'); 328 | NOTICE: FOREIGN KEY with NOT VALID option cannot be added to partitioned table 329 | rewrite_table 330 | --------------- 331 | 332 | (1 row) 333 | 334 | \d tab1_fk 335 | Partitioned table "public.tab1_fk" 336 | Column | Type | Collation | Nullable | Default 337 | --------+---------+-----------+----------+--------- 338 | i | integer | | not null | 339 | Partition key: RANGE (i) 340 | Indexes: 341 | "tab1_fk_new_pkey" PRIMARY KEY, btree (i) 342 | Number of partitions: 1 (Use \d+ to list them.) 343 | 344 | -- Check if sequence on the target table is synchronized with that of the 345 | -- source table. 346 | CREATE TABLE tab5(i int primary key generated always as identity); 347 | CREATE TABLE tab5_new(i int primary key generated always as identity); 348 | INSERT INTO tab5(i) VALUES (DEFAULT); 349 | SELECT rewrite_table('tab5', 'tab5_new', 'tab5_orig'); 350 | rewrite_table 351 | --------------- 352 | 353 | (1 row) 354 | 355 | INSERT INTO tab5(i) VALUES (DEFAULT); 356 | SELECT i FROM tab5 ORDER BY i; 357 | i 358 | --- 359 | 1 360 | 2 361 | (2 rows) 362 | 363 | -- The same with serial column. 
364 | CREATE TABLE tab6(i serial primary key); 365 | CREATE TABLE tab6_new(i serial primary key); 366 | INSERT INTO tab6(i) VALUES (DEFAULT); 367 | SELECT rewrite_table('tab6', 'tab6_new', 'tab6_orig'); 368 | rewrite_table 369 | --------------- 370 | 371 | (1 row) 372 | 373 | INSERT INTO tab6(i) VALUES (DEFAULT); 374 | SELECT i FROM tab6 ORDER BY i; 375 | i 376 | --- 377 | 1 378 | 2 379 | (2 rows) 380 | 381 | -------------------------------------------------------------------------------- /expected/pg_rewrite.out: -------------------------------------------------------------------------------- 1 | DROP EXTENSION IF EXISTS pg_rewrite; 2 | NOTICE: extension "pg_rewrite" does not exist, skipping 3 | CREATE EXTENSION pg_rewrite; 4 | CREATE TABLE tab1(i int PRIMARY KEY, j int, k int); 5 | -- If a dropped column is encountered, the source tuple should be converted 6 | -- so it matches the destination table. 7 | ALTER TABLE tab1 DROP COLUMN k; 8 | ALTER TABLE tab1 ADD COLUMN k int; 9 | INSERT INTO tab1(i, j, k) 10 | SELECT i, i / 2, i 11 | FROM generate_series(0, 1023) g(i); 12 | CREATE TABLE tab1_new(i int PRIMARY KEY, j int, k int) PARTITION BY RANGE(i); 13 | CREATE TABLE tab1_new_part_1 PARTITION OF tab1_new FOR VALUES FROM (0) TO (256); 14 | CREATE TABLE tab1_new_part_2 PARTITION OF tab1_new FOR VALUES FROM (256) TO (512); 15 | CREATE TABLE tab1_new_part_3 PARTITION OF tab1_new FOR VALUES FROM (512) TO (768); 16 | CREATE TABLE tab1_new_part_4 PARTITION OF tab1_new FOR VALUES FROM (768) TO (1024); 17 | -- Also test handling of constraints that require "manual" validation. 
18 | ALTER TABLE tab1 ADD CHECK (k >= 0); 19 | CREATE TABLE tab1_fk(i int REFERENCES tab1); 20 | INSERT INTO tab1_fk(i) VALUES (1); 21 | \d tab1 22 | Table "public.tab1" 23 | Column | Type | Collation | Nullable | Default 24 | --------+---------+-----------+----------+--------- 25 | i | integer | | not null | 26 | j | integer | | | 27 | k | integer | | | 28 | Indexes: 29 | "tab1_pkey" PRIMARY KEY, btree (i) 30 | Check constraints: 31 | "tab1_k_check" CHECK (k >= 0) 32 | Referenced by: 33 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1(i) 34 | 35 | -- Process the table. 36 | SELECT rewrite_table('tab1', 'tab1_new', 'tab1_orig'); 37 | rewrite_table 38 | --------------- 39 | 40 | (1 row) 41 | 42 | -- tab1 should now be partitioned. 43 | \d tab1 44 | Partitioned table "public.tab1" 45 | Column | Type | Collation | Nullable | Default 46 | --------+---------+-----------+----------+--------- 47 | i | integer | | not null | 48 | j | integer | | | 49 | k | integer | | | 50 | Partition key: RANGE (i) 51 | Indexes: 52 | "tab1_new_pkey" PRIMARY KEY, btree (i) 53 | Check constraints: 54 | "tab1_k_check2" CHECK (k >= 0) NOT VALID 55 | Referenced by: 56 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) NOT VALID 57 | Number of partitions: 4 (Use \d+ to list them.) 58 | 59 | -- Validate the constraints. 
60 | ALTER TABLE tab1 VALIDATE CONSTRAINT tab1_k_check2; 61 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 62 | \d tab1 63 | Partitioned table "public.tab1" 64 | Column | Type | Collation | Nullable | Default 65 | --------+---------+-----------+----------+--------- 66 | i | integer | | not null | 67 | j | integer | | | 68 | k | integer | | | 69 | Partition key: RANGE (i) 70 | Indexes: 71 | "tab1_new_pkey" PRIMARY KEY, btree (i) 72 | Check constraints: 73 | "tab1_k_check2" CHECK (k >= 0) 74 | Referenced by: 75 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) 76 | Number of partitions: 4 (Use \d+ to list them.) 77 | 78 | EXPLAIN (COSTS off) SELECT * FROM tab1; 79 | QUERY PLAN 80 | ------------------------------------------ 81 | Append 82 | -> Seq Scan on tab1_new_part_1 tab1_1 83 | -> Seq Scan on tab1_new_part_2 tab1_2 84 | -> Seq Scan on tab1_new_part_3 tab1_3 85 | -> Seq Scan on tab1_new_part_4 tab1_4 86 | (5 rows) 87 | 88 | -- Check that the contents has not changed. 
89 | SELECT count(*) FROM tab1; 90 | count 91 | ------- 92 | 1024 93 | (1 row) 94 | 95 | SELECT * 96 | FROM tab1 t FULL JOIN tab1_orig o ON t.i = o.i 97 | WHERE t.i ISNULL OR o.i ISNULL; 98 | i | j | k | i | j | k 99 | ---+---+---+---+---+--- 100 | (0 rows) 101 | 102 | -- List partitioning 103 | CREATE TABLE tab2(i int, j int, PRIMARY KEY (i, j)); 104 | INSERT INTO tab2(i, j) 105 | SELECT i, j 106 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 107 | CREATE TABLE tab2_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY LIST(i); 108 | CREATE TABLE tab2_new_part_1 PARTITION OF tab2_new FOR VALUES IN (1); 109 | CREATE TABLE tab2_new_part_2 PARTITION OF tab2_new FOR VALUES IN (2); 110 | CREATE TABLE tab2_new_part_3 PARTITION OF tab2_new FOR VALUES IN (3); 111 | CREATE TABLE tab2_new_part_4 PARTITION OF tab2_new FOR VALUES IN (4); 112 | SELECT rewrite_table('tab2', 'tab2_new', 'tab2_orig'); 113 | rewrite_table 114 | --------------- 115 | 116 | (1 row) 117 | 118 | TABLE tab2_new_part_1; 119 | i | j 120 | ---+--- 121 | 1 | 1 122 | 1 | 2 123 | 1 | 3 124 | 1 | 4 125 | (4 rows) 126 | 127 | TABLE tab2_new_part_2; 128 | i | j 129 | ---+--- 130 | 2 | 1 131 | 2 | 2 132 | 2 | 3 133 | 2 | 4 134 | (4 rows) 135 | 136 | TABLE tab2_new_part_3; 137 | i | j 138 | ---+--- 139 | 3 | 1 140 | 3 | 2 141 | 3 | 3 142 | 3 | 4 143 | (4 rows) 144 | 145 | TABLE tab2_new_part_4; 146 | i | j 147 | ---+--- 148 | 4 | 1 149 | 4 | 2 150 | 4 | 3 151 | 4 | 4 152 | (4 rows) 153 | 154 | -- Hash partitioning 155 | CREATE TABLE tab3(i int, j int, PRIMARY KEY (i, j)); 156 | INSERT INTO tab3(i, j) 157 | SELECT i, j 158 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 159 | CREATE TABLE tab3_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY HASH(i); 160 | CREATE TABLE tab3_new_part_1 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 0); 161 | CREATE TABLE tab3_new_part_2 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 1); 162 | CREATE TABLE tab3_new_part_3 
PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 2); 163 | CREATE TABLE tab3_new_part_4 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 3); 164 | SELECT rewrite_table('tab3', 'tab3_new', 'tab3_orig'); 165 | rewrite_table 166 | --------------- 167 | 168 | (1 row) 169 | 170 | TABLE tab3_new_part_1; 171 | i | j 172 | ---+--- 173 | 1 | 1 174 | 1 | 2 175 | 1 | 3 176 | 1 | 4 177 | (4 rows) 178 | 179 | TABLE tab3_new_part_2; 180 | i | j 181 | ---+--- 182 | 3 | 1 183 | 3 | 2 184 | 3 | 3 185 | 3 | 4 186 | (4 rows) 187 | 188 | TABLE tab3_new_part_3; 189 | i | j 190 | ---+--- 191 | 2 | 1 192 | 2 | 2 193 | 2 | 3 194 | 2 | 4 195 | (4 rows) 196 | 197 | TABLE tab3_new_part_4; 198 | i | j 199 | ---+--- 200 | 4 | 1 201 | 4 | 2 202 | 4 | 3 203 | 4 | 4 204 | (4 rows) 205 | 206 | -- Change of precision and scale of a numeric data type. 207 | CREATE TABLE tab4(i int PRIMARY KEY, j numeric(3, 1)); 208 | INSERT INTO tab4(i, j) VALUES (1, 0.1); 209 | CREATE TABLE tab4_new(i int PRIMARY KEY, j numeric(4, 2)); 210 | TABLE tab4; 211 | i | j 212 | ---+----- 213 | 1 | 0.1 214 | (1 row) 215 | 216 | SELECT rewrite_table('tab4', 'tab4_new', 'tab4_orig'); 217 | rewrite_table 218 | --------------- 219 | 220 | (1 row) 221 | 222 | TABLE tab4; 223 | i | j 224 | ---+------ 225 | 1 | 0.10 226 | (1 row) 227 | 228 | -- One more test for "manual" validation of FKs, this time we rewrite the PK 229 | -- table. The NOT VALID constraint cannot be used if the FK table is 230 | -- partitioned and if PG version is < 18, so we need a separate test. 
231 | CREATE TABLE tab1_pk(i int primary key); 232 | INSERT INTO tab1_pk(i) VALUES (1); 233 | CREATE TABLE tab1_pk_new(i bigint primary key); 234 | DROP TABLE tab1_fk; 235 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk); 236 | INSERT INTO tab1_fk(i) VALUES (1); 237 | \d tab1_pk 238 | Table "public.tab1_pk" 239 | Column | Type | Collation | Nullable | Default 240 | --------+---------+-----------+----------+--------- 241 | i | integer | | not null | 242 | Indexes: 243 | "tab1_pk_pkey" PRIMARY KEY, btree (i) 244 | Referenced by: 245 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 246 | 247 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 248 | rewrite_table 249 | --------------- 250 | 251 | (1 row) 252 | 253 | \d tab1_pk 254 | Table "public.tab1_pk" 255 | Column | Type | Collation | Nullable | Default 256 | --------+--------+-----------+----------+--------- 257 | i | bigint | | not null | 258 | Indexes: 259 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 260 | Referenced by: 261 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) NOT VALID 262 | 263 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 264 | \d tab1_pk 265 | Table "public.tab1_pk" 266 | Column | Type | Collation | Nullable | Default 267 | --------+--------+-----------+----------+--------- 268 | i | bigint | | not null | 269 | Indexes: 270 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 271 | Referenced by: 272 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) 273 | 274 | -- For the partitioned FK table, test at least that the FK creation is skipped 275 | -- (i.e. 
ERROR saying that NOT VALID is not supported is no raised) 276 | DROP TABLE tab1_fk; 277 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk) PARTITION BY RANGE (i); 278 | CREATE TABLE tab1_fk_1 PARTITION OF tab1_fk DEFAULT; 279 | INSERT INTO tab1_fk(i) VALUES (1); 280 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk_new; 281 | TRUNCATE TABLE tab1_pk_new; 282 | \d tab1_fk 283 | Partitioned table "public.tab1_fk" 284 | Column | Type | Collation | Nullable | Default 285 | --------+---------+-----------+----------+--------- 286 | i | integer | | | 287 | Partition key: RANGE (i) 288 | Foreign-key constraints: 289 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 290 | Number of partitions: 1 (Use \d+ to list them.) 291 | 292 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 293 | rewrite_table 294 | --------------- 295 | 296 | (1 row) 297 | 298 | -- Note that tab1_fk still references tab1_pk_orig - that's expected. 299 | \d tab1_fk 300 | Partitioned table "public.tab1_fk" 301 | Column | Type | Collation | Nullable | Default 302 | --------+---------+-----------+----------+--------- 303 | i | integer | | | 304 | Partition key: RANGE (i) 305 | Foreign-key constraints: 306 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk_orig(i) 307 | Number of partitions: 1 (Use \d+ to list them.) 308 | 309 | -- The same once again, but now rewrite the FK table. 
310 | DROP TABLE tab1_fk; 311 | DROP TABLE tab1_pk; 312 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk; 313 | CREATE TABLE tab1_fk(i int PRIMARY KEY REFERENCES tab1_pk); 314 | INSERT INTO tab1_fk(i) VALUES (1); 315 | CREATE TABLE tab1_fk_new(i int PRIMARY KEY) PARTITION BY RANGE (i); 316 | CREATE TABLE tab1_fk_new_1 PARTITION OF tab1_fk_new DEFAULT; 317 | \d tab1_fk 318 | Table "public.tab1_fk" 319 | Column | Type | Collation | Nullable | Default 320 | --------+---------+-----------+----------+--------- 321 | i | integer | | not null | 322 | Indexes: 323 | "tab1_fk_pkey" PRIMARY KEY, btree (i) 324 | Foreign-key constraints: 325 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 326 | 327 | SELECT rewrite_table('tab1_fk', 'tab1_fk_new', 'tab1_fk_orig'); 328 | rewrite_table 329 | --------------- 330 | 331 | (1 row) 332 | 333 | \d tab1_fk 334 | Partitioned table "public.tab1_fk" 335 | Column | Type | Collation | Nullable | Default 336 | --------+---------+-----------+----------+--------- 337 | i | integer | | not null | 338 | Partition key: RANGE (i) 339 | Indexes: 340 | "tab1_fk_new_pkey" PRIMARY KEY, btree (i) 341 | Foreign-key constraints: 342 | "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) NOT VALID 343 | Number of partitions: 1 (Use \d+ to list them.) 344 | 345 | -- Check if sequence on the target table is synchronized with that of the 346 | -- source table. 347 | CREATE TABLE tab5(i int primary key generated always as identity); 348 | CREATE TABLE tab5_new(i int primary key generated always as identity); 349 | INSERT INTO tab5(i) VALUES (DEFAULT); 350 | SELECT rewrite_table('tab5', 'tab5_new', 'tab5_orig'); 351 | rewrite_table 352 | --------------- 353 | 354 | (1 row) 355 | 356 | INSERT INTO tab5(i) VALUES (DEFAULT); 357 | SELECT i FROM tab5 ORDER BY i; 358 | i 359 | --- 360 | 1 361 | 2 362 | (2 rows) 363 | 364 | -- The same with serial column. 
365 | CREATE TABLE tab6(i serial primary key); 366 | CREATE TABLE tab6_new(i serial primary key); 367 | INSERT INTO tab6(i) VALUES (DEFAULT); 368 | SELECT rewrite_table('tab6', 'tab6_new', 'tab6_orig'); 369 | rewrite_table 370 | --------------- 371 | 372 | (1 row) 373 | 374 | INSERT INTO tab6(i) VALUES (DEFAULT); 375 | SELECT i FROM tab6 ORDER BY i; 376 | i 377 | --- 378 | 1 379 | 2 380 | (2 rows) 381 | 382 | -------------------------------------------------------------------------------- /concurrent.c: -------------------------------------------------------------------------------- 1 | /*----------------------------------------------------------------------------------- 2 | * 3 | * concurrent.c 4 | * Tools for maintenance that requires table rewriting. 5 | * 6 | * This file handles changes that took place while the data is being 7 | * copied from one table to another one. 8 | * 9 | * Copyright (c) 2021-2025, Cybertec PostgreSQL International GmbH 10 | * 11 | *----------------------------------------------------------------------------------- 12 | */ 13 | 14 | 15 | #include "pg_rewrite.h" 16 | 17 | #include "access/heaptoast.h" 18 | #include "executor/execPartition.h" 19 | #include "executor/executor.h" 20 | #include "replication/decode.h" 21 | #include "utils/rel.h" 22 | 23 | typedef enum 24 | { 25 | CHANGE_INSERT, 26 | CHANGE_UPDATE_OLD, 27 | CHANGE_UPDATE_NEW, 28 | CHANGE_DELETE 29 | } ConcurrentChangeKind; 30 | 31 | typedef struct ConcurrentChange 32 | { 33 | /* See the enum above. */ 34 | ConcurrentChangeKind kind; 35 | 36 | /* 37 | * The actual tuple. 38 | * 39 | * The tuple data follows the ConcurrentChange structure. Before use make 40 | * sure the tuple is correctly aligned (ConcurrentChange can be stored as 41 | * bytea) and that tuple->t_data is fixed. 
42 | */ 43 | HeapTupleData tup_data; 44 | } ConcurrentChange; 45 | 46 | static void apply_concurrent_changes(EState *estate, ModifyTableState *mtstate, 47 | struct PartitionTupleRouting *proute, 48 | DecodingOutputState *dstate, 49 | ScanKey key, int nkeys, 50 | Relation ident_index, 51 | TupleTableSlot *slot_dst_ind, 52 | partitions_hash *partitions, 53 | TupleConversionMapExt *conv_map, 54 | struct timeval *must_complete); 55 | static void apply_insert(HeapTuple tup, TupleTableSlot *slot, 56 | EState *estate, ModifyTableState *mtstate, 57 | struct PartitionTupleRouting *proute, 58 | partitions_hash *partitions, 59 | TupleConversionMapExt *conv_map, 60 | BulkInsertState bistate); 61 | static void apply_update_or_delete(HeapTuple tup, 62 | HeapTuple tup_old, 63 | ConcurrentChangeKind change_kind, 64 | EState *estate, 65 | ScanKey key, int nkeys, Relation ident_index, 66 | TupleTableSlot *slot_dst, 67 | TupleTableSlot *slot_dst_ind, 68 | ModifyTableState *mtstate, 69 | struct PartitionTupleRouting *proute, 70 | partitions_hash *partitions, 71 | TupleConversionMapExt *conv_map); 72 | static void find_tuple_in_partition(HeapTuple tup, Relation partition, 73 | partitions_hash *partitions, 74 | ScanKey key, int nkeys, ItemPointer ctid); 75 | static void find_tuple(HeapTuple tup, Relation rel, Relation ident_index, 76 | ScanKey key, int nkeys, ItemPointer ctid, 77 | TupleTableSlot *slot_dst_ind); 78 | static bool processing_time_elapsed(struct timeval *utmost); 79 | 80 | static void plugin_startup(LogicalDecodingContext *ctx, 81 | OutputPluginOptions *opt, bool is_init); 82 | static void plugin_shutdown(LogicalDecodingContext *ctx); 83 | static void plugin_begin_txn(LogicalDecodingContext *ctx, 84 | ReorderBufferTXN *txn); 85 | static void plugin_commit_txn(LogicalDecodingContext *ctx, 86 | ReorderBufferTXN *txn, XLogRecPtr commit_lsn); 87 | static void plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, 88 | Relation rel, ReorderBufferChange *change); 89 | 
static void store_change(LogicalDecodingContext *ctx,
						 ConcurrentChangeKind kind, HeapTuple tuple);
static HeapTuple get_changed_tuple(ConcurrentChange *change);
static bool plugin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id);

/*
 * Decode and apply concurrent changes. If there are too many of them, split
 * the processing into multiple iterations so that the intermediate storage
 * (tuplestore) is not likely to be written to disk.
 *
 * See check_catalog_changes() for explanation of lock_held argument.
 *
 * Returns true if must_complete is NULL or if managed to complete by the time
 * *must_complete indicates.
 */
bool
pg_rewrite_process_concurrent_changes(EState *estate,
									  ModifyTableState *mtstate,
									  struct PartitionTupleRouting *proute,
									  LogicalDecodingContext *ctx,
									  XLogRecPtr end_of_wal,
									  ScanKey ident_key,
									  int ident_key_nentries,
									  Relation ident_index,
									  TupleTableSlot *slot_dst_ind,
									  LOCKMODE lock_held,
									  partitions_hash *partitions,
									  TupleConversionMapExt *conv_map,
									  struct timeval *must_complete)
{
	DecodingOutputState *dstate;
	bool		done;

	/*
	 * Some arguments are specific to partitioned table, some to
	 * non-partitioned one. XXX Is some refactoring needed here, such as using
	 * an union?
	 */
	Assert((ident_index && slot_dst_ind && partitions == NULL
			&& proute == NULL) ||
		   (ident_index == NULL && slot_dst_ind == NULL &&
			partitions && proute));

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/*
	 * If some changes could not be applied due to time constraint, make sure
	 * the tuplestore is empty before we insert new tuples into it.
	 */
	if (dstate->nchanges > 0)
		apply_concurrent_changes(estate, mtstate, proute,
								 dstate, ident_key, ident_key_nentries,
								 ident_index, slot_dst_ind,
								 partitions, conv_map, must_complete);

	/* Ran out of time? */
	if (dstate->nchanges > 0)
		return false;

	done = false;
	while (!done)
	{
		pg_rewrite_exit_if_requested();

		/* Decode the next batch of WAL into the tuplestore. */
		done = pg_rewrite_decode_concurrent_changes(ctx, end_of_wal,
													must_complete);

		if (processing_time_elapsed(must_complete))
			/* Caller is responsible for applying the changes. */
			return false;

		/* Nothing decoded in this round? Move on to the next batch. */
		if (dstate->nchanges == 0)
			continue;

		/*
		 * XXX Consider if it's possible to check *must_complete and stop
		 * processing partway through. Partial cleanup of the tuplestore seems
		 * non-trivial.
		 */
		apply_concurrent_changes(estate, mtstate, proute,
								 dstate, ident_key, ident_key_nentries,
								 ident_index, slot_dst_ind,
								 partitions, conv_map, must_complete);

		/* Ran out of time? */
		if (dstate->nchanges > 0)
			return false;
	}

	return true;
}

/*
 * Decode logical changes from the XLOG sequence up to end_of_wal.
 *
 * Returns true iff done (for now), i.e. no more changes below the end_of_wal
 * can be decoded.
 */
bool
pg_rewrite_decode_concurrent_changes(LogicalDecodingContext *ctx,
									 XLogRecPtr end_of_wal,
									 struct timeval *must_complete)
{
	DecodingOutputState *dstate;
	ResourceOwner resowner_old;

	/*
	 * Invalidate the "present" cache before moving to "(recent) history".
	 *
	 * Note: The cache entry of the transient relation is not affected
	 * (because it was created by the current transaction), but the tuple
	 * descriptor shouldn't change anyway (as opposed to index info, which we
	 * change at some point). Moreover, tuples of the transient relation
	 * should not actually be deconstructed: reorderbuffer.c records the
	 * tuples, but - as it never receives the corresponding commit record -
	 * does not examine them in detail.
	 */
	InvalidateSystemCaches();

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/* Decode under the decoding state's resource owner. */
	resowner_old = CurrentResourceOwner;
	CurrentResourceOwner = dstate->resowner;

	PG_TRY();
	{
		while (ctx->reader->EndRecPtr < end_of_wal)
		{
			XLogRecord *record;
			XLogSegNo	segno_new;
			char	   *errm = NULL;
			XLogRecPtr	end_lsn;

			record = XLogReadRecord(ctx->reader, &errm);
			if (errm)
				elog(ERROR, "%s", errm);

			if (record != NULL)
				LogicalDecodingProcessRecord(ctx, ctx->reader);

			/* Respect the deadline, if one was given. */
			if (processing_time_elapsed(must_complete))
				break;

			/*
			 * If WAL segment boundary has been crossed, inform PG core that
			 * we no longer need the previous segment.
			 */
			end_lsn = ctx->reader->EndRecPtr;
			XLByteToSeg(end_lsn, segno_new, wal_segment_size);
			if (segno_new != rewrite_current_segment)
			{
				LogicalConfirmReceivedLocation(end_lsn);
				elog(DEBUG1, "pg_rewrite: confirmed receive location %X/%X",
					 (uint32) (end_lsn >> 32), (uint32) end_lsn);
				rewrite_current_segment = segno_new;
			}

			pg_rewrite_exit_if_requested();
		}
		InvalidateSystemCaches();
		CurrentResourceOwner = resowner_old;
	}
	PG_CATCH();
	{
		/* Restore the caller's state before re-throwing. */
		InvalidateSystemCaches();
		CurrentResourceOwner = resowner_old;
		PG_RE_THROW();
	}
	PG_END_TRY();

	elog(DEBUG1, "pg_rewrite: %.0f changes decoded but not applied yet",
		 dstate->nchanges);

	return ctx->reader->EndRecPtr >= end_of_wal;
}

/*
 * Apply changes that happened during the initial load.
 *
 * Scan key is passed by caller, so it does not have to be constructed
 * multiple times. Key entries have all fields initialized, except for
 * sk_argument.
 */
static void
apply_concurrent_changes(EState *estate, ModifyTableState *mtstate,
						 struct PartitionTupleRouting *proute,
						 DecodingOutputState *dstate,
						 ScanKey key, int nkeys,
						 Relation ident_index,
						 TupleTableSlot *slot_dst_ind,
						 partitions_hash *partitions,
						 TupleConversionMapExt *conv_map,
						 struct timeval *must_complete)
{
	BulkInsertState bistate = NULL;
	HeapTuple	tup_old = NULL;
	Relation	rel_dst;
	TupleTableSlot *slot_dst;

	if (dstate->nchanges == 0)
		return;

	/* See perform_initial_load() */
	if (proute == NULL)
		bistate = GetBulkInsertState();

	/*
	 * Slot for the destination relation is needed even in the partitioned
	 * case, to route changes to partitions.
	 */
	rel_dst = mtstate->resultRelInfo->ri_RelationDesc;
	slot_dst = MakeSingleTupleTableSlot(RelationGetDescr(rel_dst),
										&TTSOpsHeapTuple);

	/*
	 * In case functions in the index need the active snapshot and caller
	 * hasn't set one.
	 */
	PushActiveSnapshot(GetTransactionSnapshot());

	while (tuplestore_gettupleslot(dstate->tstore, true, false,
								   dstate->tsslot))
	{
		bool		shouldFree;
		HeapTuple	tup_change,
					tup;
		char	   *change_raw;
		ConcurrentChange *change;
		bool		isnull[1];
		Datum		values[1];

		Assert(dstate->nchanges > 0);
		dstate->nchanges--;

		/* Get the change from the single-column tuple. */
		tup_change = ExecFetchSlotHeapTuple(dstate->tsslot, false, &shouldFree);
		heap_deform_tuple(tup_change, dstate->tupdesc_change, values, isnull);
		Assert(!isnull[0]);

		/* This is bytea, but char* is easier to work with. */
		change_raw = (char *) DatumGetByteaP(values[0]);

		change = (ConcurrentChange *) VARDATA(change_raw);

		/* Copy the tuple out of the change (also fixes alignment). */
		tup = get_changed_tuple(change);

		if (change->kind == CHANGE_UPDATE_OLD)
		{
			/* Remember the old version; CHANGE_UPDATE_NEW follows. */
			Assert(tup_old == NULL);
			tup_old = tup;
		}
		else if (change->kind == CHANGE_INSERT)
		{
			Assert(tup_old == NULL);
			apply_insert(tup, slot_dst, estate, mtstate, proute,
						 partitions, conv_map, bistate);
		}
		else if (change->kind == CHANGE_UPDATE_NEW ||
				 change->kind == CHANGE_DELETE)
		{
			apply_update_or_delete(tup, tup_old, change->kind,
								   estate, key, nkeys, ident_index,
								   slot_dst, slot_dst_ind, mtstate, proute,
								   partitions, conv_map);

			/* The function is responsible for freeing. */
			if (tup_old != NULL)
				tup_old = NULL;
		}
		else
			elog(ERROR, "Unrecognized kind of change: %d", change->kind);

		/* If there's any change, make it visible to the next iteration. */
		if (change->kind != CHANGE_UPDATE_OLD)
		{
			CommandCounterIncrement();
			UpdateActiveSnapshotCommandId();
		}

		/* TTSOpsMinimalTuple has .get_heap_tuple==NULL. */
		Assert(shouldFree);
		pfree(tup_change);

		/*
		 * If there is a limit on the time of completion, check it
		 * now. However, make sure the loop does not break if tup_old was set
		 * in the previous iteration. In such a case we could not resume the
		 * processing in the next call.
		 */
		if (must_complete && tup_old == NULL &&
			processing_time_elapsed(must_complete))
			/* The next call will process the remaining changes. */
			break;
	}

	/* If we could not apply all the changes, the next call will do. */
	if (dstate->nchanges == 0)
		tuplestore_clear(dstate->tstore);

	PopActiveSnapshot();

	/* Cleanup. */
	if (bistate)
		FreeBulkInsertState(bistate);

	ExecDropSingleTupleTableSlot(slot_dst);
}

/*
 * Apply a single decoded INSERT to the destination table, routing it to the
 * appropriate partition when the destination is partitioned.
 *
 * 'bistate' is only used for the non-partitioned case; in the partitioned
 * case the per-partition bulk-insert state is looked up instead.
 */
static void
apply_insert(HeapTuple tup, TupleTableSlot *slot,
			 EState *estate, ModifyTableState *mtstate,
			 struct PartitionTupleRouting *proute,
			 partitions_hash *partitions, TupleConversionMapExt *conv_map,
			 BulkInsertState bistate)
{
	List	   *recheck;
	Relation	rel_ins;
	ResultRelInfo *rri = NULL;

	/* Convert the tuple to match the destination table if needed. */
	if (conv_map)
		tup = convert_tuple_for_dest_table(tup, conv_map);
	ExecStoreHeapTuple(tup, slot, false);
	if (proute)
	{
		PartitionEntry *entry;

		/* Which partition does the tuple belong to? */
		rri = ExecFindPartition(mtstate, mtstate->rootResultRelInfo,
								proute, slot, estate);
		rel_ins = rri->ri_RelationDesc;

		entry = get_partition_entry(partitions,
									RelationGetRelid(rel_ins));
		bistate = entry->bistate;

		/*
		 * Make sure the tuple matches the partition. The typical problem we
		 * address here is that a partition was attached that has a different
		 * order of columns.
		 */
		if (entry->conv_map)
		{
			tup = convert_tuple_for_dest_table(tup, entry->conv_map);
			ExecClearTuple(slot);
			ExecStoreHeapTuple(tup, slot, false);
		}
	}
	else
	{
		/* Non-partitioned table. */
		rri = mtstate->resultRelInfo;
		rel_ins = rri->ri_RelationDesc;
		/* Use bistate passed by the caller. */
	}
	Assert(bistate != NULL);
	table_tuple_insert(rel_ins, slot, GetCurrentCommandId(true), 0,
					   bistate);

#if PG_VERSION_NUM < 140000
	estate->es_result_relation_info = rri;
#endif
	/* Update indexes. */
	recheck = ExecInsertIndexTuples(
#if PG_VERSION_NUM >= 140000
									rri,
#endif
									slot,
									estate,
#if PG_VERSION_NUM >= 140000
									false,	/* update */
#endif
									false,	/* noDupErr */
									NULL,	/* specConflict */
									NIL		/* arbiterIndexes */
#if PG_VERSION_NUM >= 160000
									, false /* onlySummarizing */
#endif
		);
	ExecClearTuple(slot);

	pfree(tup);

	/*
	 * If recheck is required, it must have been performed on the source
	 * relation by now. (All the logical changes we process here are already
	 * committed.)
	 */
	list_free(recheck);

	/* Update the progress information. */
	SpinLockAcquire(&MyWorkerTask->mutex);
	MyWorkerTask->progress.ins++;
	SpinLockRelease(&MyWorkerTask->mutex);
}

/*
 * Apply a single decoded UPDATE (tup is the new version, tup_old the old one
 * if the identity key changed) or DELETE. Handles cross-partition updates by
 * deleting from the old partition and inserting into the new one.
 */
static void
apply_update_or_delete(HeapTuple tup, HeapTuple tup_old,
					   ConcurrentChangeKind change_kind,
					   EState *estate,
					   ScanKey key, int nkeys, Relation ident_index,
					   TupleTableSlot *slot_dst,
					   TupleTableSlot *slot_dst_ind,
					   ModifyTableState *mtstate,
					   struct PartitionTupleRouting *proute,
					   partitions_hash *partitions,
					   TupleConversionMapExt *conv_map)
{
	ResultRelInfo *rri,
			   *rri_old = NULL;

	/*
	 * Convert the tuple(s) to match the destination table.
	 */
	if (conv_map)
	{
		tup = convert_tuple_for_dest_table(tup, conv_map);

		if (tup_old)
		{
			Assert(change_kind == CHANGE_UPDATE_NEW);

			tup_old = convert_tuple_for_dest_table(tup_old, conv_map);
		}
	}

	/* Is the destination table partitioned? */
	if (proute)
	{
		/* Which partition does the tuple belong to?
		 */
		ExecStoreHeapTuple(tup, slot_dst, false);
		rri = ExecFindPartition(mtstate, mtstate->rootResultRelInfo,
								proute, slot_dst, estate);
		ExecClearTuple(slot_dst);

		/* ... and which partition does the old tuple version belong to? */
		if (change_kind == CHANGE_UPDATE_NEW && tup_old)
		{
			ExecStoreHeapTuple(tup_old, slot_dst, false);
			rri_old = ExecFindPartition(mtstate, mtstate->rootResultRelInfo,
										proute, slot_dst, estate);
			ExecClearTuple(slot_dst);
		}
	}
	else
		rri = mtstate->resultRelInfo;

	/* Is this a cross-partition update? */
	if (rri_old &&
		RelationGetRelid(rri_old->ri_RelationDesc) !=
		RelationGetRelid(rri->ri_RelationDesc))
	{
		ItemPointerData ctid;
		List	   *recheck;
		PartitionEntry *entry;

		/*
		 * Cross-partition update. Delete the old tuple from its partition.
		 */
		find_tuple_in_partition(tup_old, rri_old->ri_RelationDesc,
								partitions, key, nkeys, &ctid);
		simple_heap_delete(rri_old->ri_RelationDesc, &ctid);

		/* Update the progress information. */
		SpinLockAcquire(&MyWorkerTask->mutex);
		MyWorkerTask->progress.del++;
		SpinLockRelease(&MyWorkerTask->mutex);

		/*
		 * Insert the new tuple into its partition. This might include
		 * conversion to match the partition, see above.
		 */
		entry = get_partition_entry(partitions,
									RelationGetRelid(rri->ri_RelationDesc));
		if (entry->conv_map)
			tup = convert_tuple_for_dest_table(tup, entry->conv_map);
		ExecStoreHeapTuple(tup, entry->slot, false);
		table_tuple_insert(rri->ri_RelationDesc, entry->slot,
						   GetCurrentCommandId(true), 0, NULL);

#if PG_VERSION_NUM < 140000
		estate->es_result_relation_info = rri;
#endif
		/* Update indexes. */
		recheck = ExecInsertIndexTuples(
#if PG_VERSION_NUM >= 140000
										rri,
#endif
										entry->slot,
										estate,
#if PG_VERSION_NUM >= 140000
										false,	/* update */
#endif
										false,	/* noDupErr */
										NULL,	/* specConflict */
										NIL		/* arbiterIndexes */
#if PG_VERSION_NUM >= 160000
										, false /* onlySummarizing */
#endif
			);
		ExecClearTuple(entry->slot);

		/* Update the progress information. */
		SpinLockAcquire(&MyWorkerTask->mutex);
		MyWorkerTask->progress.ins++;
		SpinLockRelease(&MyWorkerTask->mutex);

		list_free(recheck);
	}
	else
	{
		HeapTuple	tup_key;
		ItemPointerData ctid;

		/*
		 * Both old and new tuple are in the same partition, or the target
		 * table is not partitioned. Find the tuple to be updated or deleted.
		 */
		if (change_kind == CHANGE_UPDATE_NEW)
			tup_key = tup_old != NULL ? tup_old : tup;
		else
		{
			Assert(change_kind == CHANGE_DELETE);
			Assert(tup_old == NULL);
			tup_key = tup;
		}

		if (partitions)
			find_tuple_in_partition(tup_key, rri->ri_RelationDesc,
									partitions, key, nkeys, &ctid);
		else
			find_tuple(tup_key, rri->ri_RelationDesc, ident_index, key, nkeys,
					   &ctid, slot_dst_ind);

		if (change_kind == CHANGE_UPDATE_NEW)
		{
			PartitionEntry *entry = NULL;

#if PG_VERSION_NUM >= 160000
			TU_UpdateIndexes update_indexes;
#endif

			if (partitions)
			{
				/*
				 * Make sure the tuple matches the partition.
				 */
				entry = get_partition_entry(partitions,
											RelationGetRelid(rri->ri_RelationDesc));
				if (entry->conv_map)
					tup = convert_tuple_for_dest_table(tup,
													   entry->conv_map);
			}

			simple_heap_update(rri->ri_RelationDesc, &ctid, tup
#if PG_VERSION_NUM >= 160000
							   , &update_indexes
#endif
				);

			/* A HOT update does not require index maintenance. */
			if (!HeapTupleIsHeapOnly(tup))
			{
				TupleTableSlot *slot;
				List	   *recheck;

				slot = entry ? entry->slot : slot_dst;

				ExecStoreHeapTuple(tup, slot, false);

				/*
				 * XXX Consider passing update=true, however it requires
				 * es_range_table to be initialized. Is it worth the
				 * complexity?
				 */
				recheck = ExecInsertIndexTuples(
#if PG_VERSION_NUM >= 140000
												rri,
#endif
												slot,
												estate,
#if PG_VERSION_NUM >= 140000
												false,	/* update */
#endif
												false,	/* noDupErr */
												NULL,	/* specConflict */
												NIL		/* arbiterIndexes */
#if PG_VERSION_NUM >= 160000
												/* onlySummarizing */
												, update_indexes == TU_Summarizing
#endif
					);
				ExecClearTuple(slot);
				list_free(recheck);
			}

			/* Update the progress information. */
			SpinLockAcquire(&MyWorkerTask->mutex);
			MyWorkerTask->progress.upd++;
			SpinLockRelease(&MyWorkerTask->mutex);
		}
		else
		{
			Assert(change_kind == CHANGE_DELETE);

			simple_heap_delete(rri->ri_RelationDesc, &ctid);

			/* Update the progress information. */
			SpinLockAcquire(&MyWorkerTask->mutex);
			MyWorkerTask->progress.del++;
			SpinLockRelease(&MyWorkerTask->mutex);
		}
	}

	/* Both tuples were (possibly converted) copies owned by us. */
	pfree(tup);
	if (tup_old)
		pfree(tup_old);
}

/*
 * Find tuple whose identity key is passed as 'tup' in relation 'rel' and put
 * its location into 'ctid'.
 */
static void
find_tuple_in_partition(HeapTuple tup, Relation partition,
						partitions_hash *partitions,
						ScanKey key, int nkeys, ItemPointer ctid)
{
	Oid			part_oid = RelationGetRelid(partition);
	HeapTuple	tup_mapped = NULL;
	PartitionEntry *entry;

	entry = partitions_lookup(partitions, part_oid);
	if (entry == NULL)
		elog(ERROR, "identity index not found for partition %u", part_oid);
	Assert(entry->part_oid == part_oid);

	/*
	 * Make sure the tuple matches the partition.
	 */
	if (entry->conv_map)
	{
		/*
		 * convert_tuple_for_dest_table() is not suitable here because we need
		 * to keep the original tuple. XXX Should we add a boolean argument to
		 * the function that indicates whether it should free the original
		 * tuple?
		 */
		tup_mapped = pg_rewrite_execute_attr_map_tuple(tup,
													   entry->conv_map);
		tup = tup_mapped;
	}
	find_tuple(tup, partition, entry->ident_index, key, nkeys, ctid,
			   entry->slot_ind);
	if (tup_mapped)
		pfree(tup_mapped);
}

/*
 * Find tuple whose identity key is passed as 'tup' in relation 'rel' and put
 * its location into 'ctid'. Raises ERROR if the tuple cannot be found via
 * the identity index.
 */
static void
find_tuple(HeapTuple tup, Relation rel, Relation ident_index, ScanKey key,
		   int nkeys, ItemPointer ctid, TupleTableSlot *slot_dst_ind)
{
	Form_pg_index ident_form;
	int2vector *ident_indkey;
	IndexScanDesc scan;
	int			i;
	HeapTuple	tup_exist;

	ident_form = ident_index->rd_index;
	ident_indkey = &ident_form->indkey;
	scan = index_beginscan(rel, ident_index, GetActiveSnapshot(),
#if PG_VERSION_NUM >= 180000
						   NULL,	/* instrument */
#endif
						   nkeys, 0);
	index_rescan(scan, key, nkeys, NULL, 0);

	/* Use the incoming tuple to finalize the scan key. */
	for (i = 0; i < scan->numberOfKeys; i++)
	{
		ScanKey		entry;
		bool		isnull;
		int16		attno_heap;

		entry = &scan->keyData[i];
		/* Map the index column to the underlying heap attribute. */
		attno_heap = ident_indkey->values[i];
		entry->sk_argument = heap_getattr(tup,
										  attno_heap,
										  rel->rd_att,
										  &isnull);
		/* Identity key columns must not be NULL. */
		Assert(!isnull);
	}
	if (index_getnext_slot(scan, ForwardScanDirection, slot_dst_ind))
	{
		bool		shouldFreeInd;

		tup_exist = ExecFetchSlotHeapTuple(slot_dst_ind, false,
										   &shouldFreeInd);
		/* TTSOpsBufferHeapTuple has .get_heap_tuple != NULL. */
		Assert(!shouldFreeInd);
	}
	else
		tup_exist = NULL;
	if (tup_exist == NULL)
		elog(ERROR, "Failed to find target tuple");
	ItemPointerCopy(&tup_exist->t_self, ctid);
	index_endscan(scan);
}

/*
 * Has the current time reached or passed *utmost?
 *
 * A NULL argument means there is no deadline, i.e. never elapsed.
 */
static bool
processing_time_elapsed(struct timeval *utmost)
{
	struct timeval now;

	if (utmost == NULL)
		return false;

	gettimeofday(&now, NULL);

	if (now.tv_sec < utmost->tv_sec)
		return false;

	if (now.tv_sec > utmost->tv_sec)
		return true;

	/* Seconds are equal; compare the sub-second part. */
	return now.tv_usec >= utmost->tv_usec;
}

/*
 * Convert tuple according to the map and free the original one.
 */
HeapTuple
convert_tuple_for_dest_table(HeapTuple tuple,
							 TupleConversionMapExt *conv_map)
{
	HeapTuple	orig = tuple;

	tuple = pg_rewrite_execute_attr_map_tuple(tuple, conv_map);
	pfree(orig);

	return tuple;
}

/* Register the callbacks of this logical decoding output plugin. */
void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
	AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit);

	cb->startup_cb = plugin_startup;
	cb->begin_cb = plugin_begin_txn;
	cb->change_cb = plugin_change;
	cb->commit_cb = plugin_commit_txn;
	cb->filter_by_origin_cb = plugin_filter;
	cb->shutdown_cb = plugin_shutdown;
}


/* initialize this plugin */
static void
plugin_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
			   bool is_init)
{
	ctx->output_plugin_private = NULL;

	/* Probably unnecessary, as we don't use the SQL interface ... */
	opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;

	if (ctx->output_plugin_options != NIL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("This plugin does not expect any options")));
	}
}

/* Shutdown callback - nothing to clean up. */
static void
plugin_shutdown(LogicalDecodingContext *ctx)
{
}

/*
 * As we don't release the slot during processing of particular table, there's
 * no room for SQL interface, even for debugging purposes. Therefore we need
 * neither OutputPluginPrepareWrite() nor OutputPluginWrite() in the plugin
 * callbacks. (Although we might want to write custom callbacks, this API
 * seems to be unnecessarily generic for our purposes.)
 */

/* BEGIN callback */
static void
plugin_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
}

/* COMMIT callback */
static void
plugin_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
				  XLogRecPtr commit_lsn)
{
}

/*
 * Callback for individual changed tuples
 */
static void
plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
			  Relation relation, ReorderBufferChange *change)
{
	DecodingOutputState *dstate;

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/* Only interested in one particular relation. */
	if (relation->rd_id != dstate->relid)
		return;

	/*
	 * Decode entry depending on its type.
	 *
	 * Note: since PG 17, ReorderBufferChange stores HeapTuple directly;
	 * older versions wrap it in ReorderBufferTupleBuf - hence the #if blocks
	 * inside the ternary expressions below.
	 */
	switch (change->action)
	{
		case REORDER_BUFFER_CHANGE_INSERT:
			{
				HeapTuple	newtuple;

				newtuple = change->data.tp.newtuple != NULL ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.newtuple : NULL;
#else
					&change->data.tp.newtuple->tuple : NULL;
#endif

				/*
				 * Identity checks in the main function should have made this
				 * impossible.
				 */
				if (newtuple == NULL)
					elog(ERROR, "Incomplete insert info.");

				store_change(ctx, CHANGE_INSERT, newtuple);
			}
			break;
		case REORDER_BUFFER_CHANGE_UPDATE:
			{
				HeapTuple	oldtuple,
							newtuple;

				oldtuple = change->data.tp.oldtuple != NULL ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.oldtuple : NULL;
#else
					&change->data.tp.oldtuple->tuple : NULL;
#endif
				newtuple = change->data.tp.newtuple != NULL ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.newtuple : NULL;
#else
					&change->data.tp.newtuple->tuple : NULL;
#endif

				if (newtuple == NULL)
					elog(ERROR, "Incomplete update info.");

				/* The old version is only decoded if the identity changed. */
				if (oldtuple != NULL)
					store_change(ctx, CHANGE_UPDATE_OLD, oldtuple);

				store_change(ctx, CHANGE_UPDATE_NEW, newtuple);
			}
			break;
		case REORDER_BUFFER_CHANGE_DELETE:
			{
				HeapTuple	oldtuple;

				oldtuple = change->data.tp.oldtuple ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.oldtuple : NULL;
#else
					&change->data.tp.oldtuple->tuple : NULL;
#endif

				if (oldtuple == NULL)
					elog(ERROR, "Incomplete delete info.");

				store_change(ctx, CHANGE_DELETE, oldtuple);
			}
			break;
		default:
			/* Should not come here */
			Assert(0);
			break;
	}
}

/*
 * Store concurrent data change: serialize the tuple together with the change
 * kind into a single-column bytea tuple and append it to the tuplestore.
 */
static void
store_change(LogicalDecodingContext *ctx, ConcurrentChangeKind kind,
			 HeapTuple tuple)
{
	DecodingOutputState *dstate;
	char	   *change_raw;
	ConcurrentChange *change;
	MemoryContext oldcontext;
	bool		flattened = false;
	Size		size;
	Datum		values[1];
	bool		isnull[1];
	char	   *dst;

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/*
	 * ReorderBufferCommit() stores the TOAST chunks in its private memory
	 * context and frees them after having called apply_change(). Therefore we
	 * need flat copy (including TOAST) that we eventually copy into the
	 * memory context which is available to
	 * pg_rewrite_decode_concurrent_changes().
	 */
	if (HeapTupleHasExternal(tuple))
	{
		/*
		 * toast_flatten_tuple_to_datum() might be more convenient but we
		 * don't want the decompression it does.
		 */
		tuple = toast_flatten_tuple(tuple, dstate->tupdesc_src);
		flattened = true;
	}

	size = MAXALIGN(VARHDRSZ) + sizeof(ConcurrentChange) + tuple->t_len;
	/* XXX Isn't there any function / macro to do this? */
	if (size >= 0x3FFFFFFF)
		elog(ERROR, "Change is too big.");

	/* Allocate in the decoding context so the change survives this call. */
	oldcontext = MemoryContextSwitchTo(ctx->context);
	change_raw = (char *) palloc(size);
	MemoryContextSwitchTo(oldcontext);

	SET_VARSIZE(change_raw, size);
	change = (ConcurrentChange *) VARDATA(change_raw);

	/*
	 * Copy the tuple.
	 *
	 * CAUTION: change->tup_data.t_data must be fixed on retrieval!
	 */
	memcpy(&change->tup_data, tuple, sizeof(HeapTupleData));
	dst = (char *) change + sizeof(ConcurrentChange);
	memcpy(dst, tuple->t_data, tuple->t_len);

	/* The other field. */
	change->kind = kind;

	/* The data has been copied. */
	if (flattened)
		pfree(tuple);

	/* Store as tuple of 1 bytea column. */
	values[0] = PointerGetDatum(change_raw);
	isnull[0] = false;
	tuplestore_putvalues(dstate->tstore, dstate->tupdesc_change,
						 values, isnull);

	/* Accounting. */
	dstate->nchanges++;

	/* Cleanup - the tuplestore made its own copy. */
	pfree(change_raw);
}

/*
 * Retrieve tuple from a change structure. As for the change, no alignment is
 * assumed.
 */
static HeapTuple
get_changed_tuple(ConcurrentChange *change)
{
	HeapTupleData tup_data;
	HeapTuple	result;
	char	   *src;

	/*
	 * Ensure alignment before accessing the fields. (This is why we can't use
	 * heap_copytuple() instead of this function.)
	 */
	memcpy(&tup_data, &change->tup_data, sizeof(HeapTupleData));

	/* Rebuild the tuple with header and data in one palloc'd chunk. */
	result = (HeapTuple) palloc(HEAPTUPLESIZE + tup_data.t_len);
	memcpy(result, &tup_data, sizeof(HeapTupleData));
	result->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE);
	src = (char *) change + sizeof(ConcurrentChange);
	memcpy(result->t_data, src, result->t_len);

	return result;
}

/*
 * A filter that recognizes changes produced by the initial load.
 */
static bool
plugin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id)
{
	DecodingOutputState *dstate;

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/* dstate is not initialized during decoding setup - should it be? */
	if (dstate && dstate->rorigin != InvalidRepOriginId &&
		origin_id == dstate->rorigin)
		return true;

	return false;
}