├── pg_rewrite.md ├── .gitignore ├── .dir-locals.el ├── pg_rewrite.control ├── typedefs.list ├── pg_rewrite--1.2--2.0.sql ├── pg_rewrite--1.0.sql ├── pg_rewrite--1.3--2.0.sql ├── Makefile ├── pg_rewrite--1.1--1.2.sql ├── LICENSE ├── .github └── workflows │ └── regression.yml ├── pg_rewrite--1.0--1.1.sql ├── NEWS ├── sql ├── generated.sql └── pg_rewrite.sql ├── expected ├── generated.out ├── generated_1.out ├── pg_rewrite_concurrent_toast.out ├── pg_rewrite_concurrent_partition.out ├── pg_rewrite_concurrent.out ├── pg_rewrite_1.out └── pg_rewrite.out ├── specs ├── pg_rewrite_concurrent_toast.spec ├── pg_rewrite_concurrent.spec └── pg_rewrite_concurrent_partition.spec ├── pg_rewrite.h ├── README.md └── concurrent.c /pg_rewrite.md: -------------------------------------------------------------------------------- 1 | README.md -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.so 4 | results/ 5 | GPATH 6 | GRTAGS 7 | GTAGS 8 | -------------------------------------------------------------------------------- /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ((c-mode . ((c-basic-offset . 4) 2 | (c-file-style . "bsd") 3 | (fill-column . 78) 4 | (indent-tabs-mode . t) 5 | (tab-width . 4)))) 6 | -------------------------------------------------------------------------------- /pg_rewrite.control: -------------------------------------------------------------------------------- 1 | # pg_rewrite extension 2 | comment = 'Tool for maintenance that requires table rewriting.' 
3 | default_version = '2.0' 4 | module_pathname = '$libdir/pg_rewrite' 5 | relocatable = true 6 | -------------------------------------------------------------------------------- /typedefs.list: -------------------------------------------------------------------------------- 1 | CatalogState 2 | ConcurrentChange 3 | ConcurrentChangeKind 4 | ConstraintInfo 5 | DecodingOutputState 6 | IndexCatInfo 7 | PartitionEntry 8 | PgClassCatInfo 9 | TypeCatInfo 10 | partitions_hash 11 | -------------------------------------------------------------------------------- /pg_rewrite--1.2--2.0.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.2--2.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '2.0'" to load this file. \quit 5 | -------------------------------------------------------------------------------- /pg_rewrite--1.0.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 4 | \echo Use "CREATE EXTENSION pg_rewrite" to load this file. \quit 5 | 6 | CREATE FUNCTION partition_table( 7 | src_table text, 8 | dst_table text, 9 | src_table_new text) 10 | RETURNS void 11 | AS 'MODULE_PATHNAME', 'partition_table' 12 | LANGUAGE C; 13 | -------------------------------------------------------------------------------- /pg_rewrite--1.3--2.0.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.3--2.0.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '2.0'" to load this file. 
\quit 5 | 6 | DROP FUNCTION IF EXISTS rewrite_table_nowait; 7 | CREATE FUNCTION rewrite_table_nowait( 8 | src_table text, 9 | dst_table text, 10 | src_table_new text) 11 | RETURNS void 12 | AS 'MODULE_PATHNAME', 'rewrite_table_nowait' 13 | LANGUAGE C 14 | STRICT; -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PG_CONFIG ?= pg_config 2 | MODULE_big = pg_rewrite 3 | OBJS = pg_rewrite.o concurrent.o $(WIN32RES) 4 | PGFILEDESC = "pg_rewrite - tools for maintenance that requires table rewriting." 5 | 6 | EXTENSION = pg_rewrite 7 | DATA = pg_rewrite--1.0.sql pg_rewrite--1.0--1.1.sql pg_rewrite--1.1--1.2.sql\ 8 | pg_rewrite--1.2--2.0.sql pg_rewrite--1.3--2.0.sql 9 | DOCS = pg_rewrite.md 10 | 11 | REGRESS = pg_rewrite generated 12 | #ISOLATION = pg_rewrite_concurrent pg_rewrite_concurrent_partition \ 13 | pg_rewrite_concurrent_toast 14 | 15 | PGXS := $(shell $(PG_CONFIG) --pgxs) 16 | include $(PGXS) 17 | 18 | -------------------------------------------------------------------------------- /pg_rewrite--1.1--1.2.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.1--1.2.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '1.2'" to load this file. 
\quit 5 | 6 | DROP FUNCTION partition_table(text, text, text); 7 | 8 | CREATE FUNCTION rewrite_table( 9 | src_table text, 10 | dst_table text, 11 | src_table_new text) 12 | RETURNS void 13 | AS 'MODULE_PATHNAME', 'rewrite_table' 14 | LANGUAGE C 15 | STRICT; 16 | 17 | CREATE FUNCTION rewrite_table_nowait( 18 | src_table text, 19 | dst_table text, 20 | src_table_new text) 21 | RETURNS void 22 | AS 'MODULE_PATHNAME', 'rewrite_table_nowait' 23 | LANGUAGE C 24 | STRICT; 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021-2023, Cybertec PostgreSQL International GmbH 2 | 3 | Permission to use, copy, modify, and distribute this software and its 4 | documentation for any purpose, without fee, and without a written agreement is 5 | hereby granted, provided that the above copyright notice and this paragraph 6 | and the following two paragraphs appear in all copies. 7 | 8 | IN NO EVENT SHALL Cybertec PostgreSQL International GmbH BE LIABLE TO ANY 9 | PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, 10 | INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 11 | DOCUMENTATION, EVEN IF Cybertec PostgreSQL International GmbH HAS BEEN ADVISED 12 | OF THE POSSIBILITY OF SUCH DAMAGE. 13 | 14 | Cybertec PostgreSQL International GmbH SPECIFICALLY DISCLAIMS ANY WARRANTIES, 15 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 16 | FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS 17 | IS" BASIS, AND Cybertec PostgreSQL International GmbH HAS NO OBLIGATIONS TO 18 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
19 | -------------------------------------------------------------------------------- /.github/workflows/regression.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | defaults: 10 | run: 11 | shell: sh 12 | 13 | strategy: 14 | matrix: 15 | pgversion: 16 | - 17 17 | 18 | env: 19 | PGVERSION: ${{ matrix.pgversion }} 20 | 21 | steps: 22 | - name: checkout 23 | uses: actions/checkout@v3 24 | 25 | - name: install pg 26 | run: | 27 | sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -v $PGVERSION -p -i 28 | sudo -u postgres createuser -s "$USER" 29 | 30 | - name: build 31 | run: | 32 | make PROFILE="-Werror" 33 | sudo -E make install 34 | 35 | - name: test 36 | run: | 37 | sudo pg_conftool set shared_preload_libraries pg_rewrite 38 | sudo pg_conftool set wal_level logical 39 | sudo pg_conftool set max_replication_slots 1 40 | sudo pg_ctlcluster $PGVERSION main restart 41 | make installcheck 42 | 43 | - name: show regression diffs 44 | if: ${{ failure() }} 45 | run: | 46 | cat /home/runner/work/pg_rewrite/pg_rewrite/output_iso/regression.diffs 47 | -------------------------------------------------------------------------------- /pg_rewrite--1.0--1.1.sql: -------------------------------------------------------------------------------- 1 | /* pg_rewrite--1.0--1.1.sql */ 2 | 3 | -- complain if script is sourced in psql, rather than via ALTER EXTENSION 4 | \echo Use "ALTER EXTENSION pg_rewrite UPDATE TO '1.1'" to load this file. 
\quit 5 | 6 | DROP FUNCTION partition_table(text, text, text); 7 | CREATE FUNCTION partition_table( 8 | src_table text, 9 | dst_table text, 10 | src_table_new text) 11 | RETURNS void 12 | AS 'MODULE_PATHNAME', 'partition_table_new' 13 | LANGUAGE C 14 | STRICT; 15 | 16 | CREATE FUNCTION pg_rewrite_get_task_list() 17 | RETURNS TABLE ( 18 | tabschema_src name, 19 | tabname_src name, 20 | tabschema_dst name, 21 | tabname_dst name, 22 | tabname_src_new name, 23 | ins_initial bigint, 24 | ins bigint, 25 | upd bigint, 26 | del bigint) 27 | AS 'MODULE_PATHNAME', 'pg_rewrite_get_task_list' 28 | LANGUAGE C; 29 | 30 | -- The column names should match the arguments of the partition_table() 31 | -- function. 32 | CREATE VIEW pg_rewrite_progress AS 33 | SELECT COALESCE(tabschema_src || '.', '') || tabname_src AS src_table, 34 | COALESCE(tabschema_dst || '.', '') || tabname_dst AS dst_table, 35 | tabname_src_new AS src_table_new, 36 | ins_initial, ins, upd, del 37 | FROM pg_rewrite_get_task_list(); 38 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Release 2.0 2 | =========== 3 | 4 | 1. This release makes the extension useful in more use cases. 5 | 6 | Besides turning a non-partitioned table into a partitioned one, it can be 7 | used to change 1) data type of column(s), 2) order of columns, 3) 8 | tablespace. 9 | 10 | 2. A single function `rewrite_table()` is used now to handle all the use 11 | cases. 12 | 13 | 3. Constraints are handled in a more convenient way. 14 | 15 | The extension now takes care of creating the constraints on the target 16 | table according to the source table. The user only needs to validate the 17 | constraints after the rewriting has finished. 18 | 19 | Unlike the previous release, the rewritten table can be referenced by 20 | foreign key constraints. 
21 | 22 | Note: The `rewrite.check_constraints` configuration variable was 23 | removed. If there is a risk that other users could run `ALTER TABLE` on 24 | the table during rewriting, please revoke the corresponding privileges 25 | from them temporarily. 26 | 27 | 28 | Release 1.1.1 29 | ============= 30 | 31 | This release only adjusts the code so it is compatible with PostgreSQL server 32 | version 17. 33 | 34 | 35 | Release 1.1.0 36 | ============= 37 | 38 | New Features 39 | ------------ 40 | 41 | 1. Make the code compatible with PostgreSQL server version 16. 42 | 43 | 2. Added progress monitoring. 44 | -------------------------------------------------------------------------------- /sql/generated.sql: -------------------------------------------------------------------------------- 1 | -- Generated columns - some meaningful combinations of source and destination 2 | -- columns. 3 | CREATE TABLE tab7( 4 | i int primary key, 5 | j int, 6 | k int generated always as (i + 1) virtual, 7 | l int generated always AS (i + 1) stored, 8 | m int generated always AS (i + 1) virtual); 9 | CREATE TABLE tab7_new( 10 | i int primary key, 11 | -- Override the value copied from the source table. 12 | j int generated always AS (i - 1) stored, 13 | -- Check that the expression is evaluated correctly on the source 14 | -- table. 15 | k int, 16 | -- The same for stored expression. 17 | l int, 18 | -- Override the value computed on the source table. 19 | m int generated always as (i - 1) virtual); 20 | INSERT INTO tab7(i, j) VALUES (1, 1); 21 | SELECT rewrite_table('tab7', 'tab7_new', 'tab7_orig'); 22 | SELECT * FROM tab7; 23 | 24 | CREATE EXTENSION pageinspect; 25 | -- HEAP_HASNULL indicates that the value of 'm' hasn't been copied from the 26 | -- source table. 27 | SELECT raw_flags 28 | FROM heap_page_items(get_raw_page('tab7', 0)), 29 | LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2); 30 | 31 | -- For PG < 18, test without VIRTUAL columns. 
32 | CREATE TABLE tab8( 33 | i int primary key, 34 | j int, 35 | k int generated always AS (i + 1) stored); 36 | CREATE TABLE tab8_new( 37 | i int primary key, 38 | -- Override the value copied from the source table. 39 | j int generated always AS (i - 1) stored, 40 | -- Check that the expression is evaluated correctly on the source 41 | -- table. 42 | k int); 43 | INSERT INTO tab8(i, j) VALUES (1, 1); 44 | SELECT rewrite_table('tab8', 'tab8_new', 'tab8_orig'); 45 | SELECT * FROM tab8; 46 | -------------------------------------------------------------------------------- /expected/generated.out: -------------------------------------------------------------------------------- 1 | -- Generated columns - some meaningful combinations of source and destination 2 | -- columns. 3 | CREATE TABLE tab7( 4 | i int primary key, 5 | j int, 6 | k int generated always as (i + 1) virtual, 7 | l int generated always AS (i + 1) stored, 8 | m int generated always AS (i + 1) virtual); 9 | CREATE TABLE tab7_new( 10 | i int primary key, 11 | -- Override the value copied from the source table. 12 | j int generated always AS (i - 1) stored, 13 | -- Check that the expression is evaluated correctly on the source 14 | -- table. 15 | k int, 16 | -- The same for stored expression. 17 | l int, 18 | -- Override the value computed on the source table. 19 | m int generated always as (i - 1) virtual); 20 | INSERT INTO tab7(i, j) VALUES (1, 1); 21 | SELECT rewrite_table('tab7', 'tab7_new', 'tab7_orig'); 22 | rewrite_table 23 | --------------- 24 | 25 | (1 row) 26 | 27 | SELECT * FROM tab7; 28 | i | j | k | l | m 29 | ---+---+---+---+--- 30 | 1 | 0 | 2 | 2 | 0 31 | (1 row) 32 | 33 | CREATE EXTENSION pageinspect; 34 | -- HEAP_HASNULL indicates that the value of 'm' hasn't been copied from the 35 | -- source table. 
36 | SELECT raw_flags 37 | FROM heap_page_items(get_raw_page('tab7', 0)), 38 | LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2); 39 | raw_flags 40 | ------------------------------------------------------ 41 | {HEAP_HASNULL,HEAP_XMIN_COMMITTED,HEAP_XMAX_INVALID} 42 | (1 row) 43 | 44 | -- For PG < 18, test without VIRTUAL columns. 45 | CREATE TABLE tab8( 46 | i int primary key, 47 | j int, 48 | k int generated always AS (i + 1) stored); 49 | CREATE TABLE tab8_new( 50 | i int primary key, 51 | -- Override the value copied from the source table. 52 | j int generated always AS (i - 1) stored, 53 | -- Check that the expression is evaluated correctly on the source 54 | -- table. 55 | k int); 56 | INSERT INTO tab8(i, j) VALUES (1, 1); 57 | SELECT rewrite_table('tab8', 'tab8_new', 'tab8_orig'); 58 | rewrite_table 59 | --------------- 60 | 61 | (1 row) 62 | 63 | SELECT * FROM tab8; 64 | i | j | k 65 | ---+---+--- 66 | 1 | 0 | 2 67 | (1 row) 68 | 69 | -------------------------------------------------------------------------------- /expected/generated_1.out: -------------------------------------------------------------------------------- 1 | -- Generated columns - some meaningful combinations of source and destination 2 | -- columns. 3 | CREATE TABLE tab7( 4 | i int primary key, 5 | j int, 6 | k int generated always as (i + 1) virtual, 7 | l int generated always AS (i + 1) stored, 8 | m int generated always AS (i + 1) virtual); 9 | ERROR: syntax error at or near "virtual" 10 | LINE 4: k int generated always as (i + 1) virtual, 11 | ^ 12 | CREATE TABLE tab7_new( 13 | i int primary key, 14 | -- Override the value copied from the source table. 15 | j int generated always AS (i - 1) stored, 16 | -- Check that the expression is evaluated correctly on the source 17 | -- table. 18 | k int, 19 | -- The same for stored expression. 20 | l int, 21 | -- Override the value computed on the source table. 
22 | m int generated always as (i - 1) virtual); 23 | ERROR: syntax error at or near "virtual" 24 | LINE 11: m int generated always as (i - 1) virtual); 25 | ^ 26 | INSERT INTO tab7(i, j) VALUES (1, 1); 27 | ERROR: relation "tab7" does not exist 28 | LINE 1: INSERT INTO tab7(i, j) VALUES (1, 1); 29 | ^ 30 | SELECT rewrite_table('tab7', 'tab7_new', 'tab7_orig'); 31 | ERROR: relation "tab7" does not exist 32 | SELECT * FROM tab7; 33 | ERROR: relation "tab7" does not exist 34 | LINE 1: SELECT * FROM tab7; 35 | ^ 36 | CREATE EXTENSION pageinspect; 37 | -- HEAP_HASNULL indicates that the value of 'm' hasn't been copied from the 38 | -- source table. 39 | SELECT raw_flags 40 | FROM heap_page_items(get_raw_page('tab7', 0)), 41 | LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2); 42 | ERROR: relation "tab7" does not exist 43 | -- For PG < 18, test without VIRTUAL columns. 44 | CREATE TABLE tab8( 45 | i int primary key, 46 | j int, 47 | k int generated always AS (i + 1) stored); 48 | CREATE TABLE tab8_new( 49 | i int primary key, 50 | -- Override the value copied from the source table. 51 | j int generated always AS (i - 1) stored, 52 | -- Check that the expression is evaluated correctly on the source 53 | -- table. 
54 | k int); 55 | INSERT INTO tab8(i, j) VALUES (1, 1); 56 | SELECT rewrite_table('tab8', 'tab8_new', 'tab8_orig'); 57 | rewrite_table 58 | --------------- 59 | 60 | (1 row) 61 | 62 | SELECT * FROM tab8; 63 | i | j | k 64 | ---+---+--- 65 | 1 | 0 | 2 66 | (1 row) 67 | 68 | -------------------------------------------------------------------------------- /expected/pg_rewrite_concurrent_toast.out: -------------------------------------------------------------------------------- 1 | Parsed test spec with 2 sessions 2 | 3 | starting permutation: do_rewrite wait_for_before_lock_ip do_changes wakeup_before_lock_ip wait_for_after_commit_ip do_check wakeup_after_commit_ip 4 | injection_points_attach 5 | ----------------------- 6 | 7 | (1 row) 8 | 9 | step do_rewrite: 10 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 11 | 12 | rewrite_table_nowait 13 | -------------------- 14 | 15 | (1 row) 16 | 17 | step wait_for_before_lock_ip: 18 | DO $$ 19 | BEGIN 20 | LOOP 21 | PERFORM pg_stat_clear_snapshot(); 22 | 23 | PERFORM 24 | FROM pg_stat_activity 25 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 26 | 27 | IF FOUND THEN 28 | EXIT; 29 | END IF; 30 | 31 | PERFORM pg_sleep(.1); 32 | END LOOP; 33 | END; 34 | $$; 35 | 36 | step do_changes: 37 | INSERT INTO tbl_src(i, t) 38 | SELECT 5, string_agg(random()::text, '') 39 | FROM generate_series(1, 200) h(y); 40 | 41 | UPDATE tbl_src SET t = t || 'x' WHERE i = 1; 42 | 43 | step wakeup_before_lock_ip: 44 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 45 | 46 | injection_points_wakeup 47 | ----------------------- 48 | 49 | (1 row) 50 | 51 | step wait_for_after_commit_ip: 52 | DO $$ 53 | BEGIN 54 | LOOP 55 | PERFORM pg_stat_clear_snapshot(); 56 | 57 | PERFORM 58 | FROM pg_stat_activity 59 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 60 | 61 | IF FOUND THEN 62 | EXIT; 63 | END IF; 64 | 65 | PERFORM pg_sleep(.1); 66 | END LOOP; 67 | 
END; 68 | $$; 69 | 70 | step do_check: 71 | TABLE pg_rewrite_progress; 72 | 73 | -- Each row should contain TOASTed value. 74 | SELECT count(*) FROM tbl_src WHERE pg_column_toast_chunk_id(t) ISNULL; 75 | 76 | -- The contents of the new table should be identical to that of the old 77 | -- one. 78 | SELECT count(*) 79 | FROM tbl_src t1 JOIN tbl_src_old t2 ON t1.i = t2.i 80 | WHERE t1.t <> t2.t; 81 | 82 | src_table|dst_table|src_table_new|ins_initial|ins|upd|del 83 | ---------+---------+-------------+-----------+---+---+--- 84 | tbl_src |tbl_dst |tbl_src_old | 2| 1| 1| 0 85 | (1 row) 86 | 87 | count 88 | ----- 89 | 0 90 | (1 row) 91 | 92 | count 93 | ----- 94 | 0 95 | (1 row) 96 | 97 | step wakeup_after_commit_ip: 98 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 99 | 100 | injection_points_wakeup 101 | ----------------------- 102 | 103 | (1 row) 104 | 105 | injection_points_detach 106 | ----------------------- 107 | 108 | (1 row) 109 | 110 | -------------------------------------------------------------------------------- /expected/pg_rewrite_concurrent_partition.out: -------------------------------------------------------------------------------- 1 | Parsed test spec with 2 sessions 2 | 3 | starting permutation: do_rewrite wait_for_before_lock_ip do_changes wakeup_before_lock_ip wait_for_after_commit_ip do_check wakeup_after_commit_ip 4 | injection_points_attach 5 | ----------------------- 6 | 7 | (1 row) 8 | 9 | step do_rewrite: 10 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 11 | 12 | rewrite_table_nowait 13 | -------------------- 14 | 15 | (1 row) 16 | 17 | step wait_for_before_lock_ip: 18 | DO $$ 19 | BEGIN 20 | LOOP 21 | PERFORM pg_stat_clear_snapshot(); 22 | 23 | PERFORM 24 | FROM pg_stat_activity 25 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 26 | 27 | IF FOUND THEN 28 | EXIT; 29 | END IF; 30 | 31 | PERFORM pg_sleep(.1); 32 | END LOOP; 33 | END; 34 | $$; 35 | 36 | step do_changes: 37 
| -- Insert one row into each partition. 38 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 39 | 40 | -- Update with no identity change. 41 | UPDATE tbl_src SET j=0 WHERE i=1; 42 | 43 | -- Update with identity change but within the same partition. 44 | UPDATE tbl_src SET i=6 WHERE i=5; 45 | 46 | -- Cross-partition update. 47 | UPDATE tbl_src SET i=7 WHERE i=3; 48 | 49 | -- Update a row we inserted and updated, to check that it's visible. 50 | UPDATE tbl_src SET j=4 WHERE i=7; 51 | 52 | -- Delete. 53 | DELETE FROM tbl_src WHERE i=4; 54 | 55 | step wakeup_before_lock_ip: 56 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 57 | 58 | injection_points_wakeup 59 | ----------------------- 60 | 61 | (1 row) 62 | 63 | step wait_for_after_commit_ip: 64 | DO $$ 65 | BEGIN 66 | LOOP 67 | PERFORM pg_stat_clear_snapshot(); 68 | 69 | PERFORM 70 | FROM pg_stat_activity 71 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 72 | 73 | IF FOUND THEN 74 | EXIT; 75 | END IF; 76 | 77 | PERFORM pg_sleep(.1); 78 | END LOOP; 79 | END; 80 | $$ 81 | 82 | step do_check: 83 | TABLE pg_rewrite_progress; 84 | 85 | SELECT i, j FROM tbl_src ORDER BY i, j; 86 | 87 | src_table|dst_table|src_table_new|ins_initial|ins|upd|del 88 | ---------+---------+-------------+-----------+---+---+--- 89 | tbl_src |tbl_dst |tbl_src_old | 2| 4| 3| 2 90 | (1 row) 91 | 92 | i| j 93 | -+-- 94 | 1| 0 95 | 2|20 96 | 6|50 97 | 7| 4 98 | (4 rows) 99 | 100 | step wakeup_after_commit_ip: 101 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 102 | 103 | injection_points_wakeup 104 | ----------------------- 105 | 106 | (1 row) 107 | 108 | injection_points_detach 109 | ----------------------- 110 | 111 | (1 row) 112 | 113 | -------------------------------------------------------------------------------- /expected/pg_rewrite_concurrent.out: -------------------------------------------------------------------------------- 1 | Parsed test spec with 2 sessions 2 | 3 | 
starting permutation: do_rewrite wait_for_before_lock_ip do_changes wakeup_before_lock_ip wait_for_after_commit_ip do_check wakeup_after_commit_ip 4 | injection_points_attach 5 | ----------------------- 6 | 7 | (1 row) 8 | 9 | step do_rewrite: 10 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 11 | 12 | rewrite_table_nowait 13 | -------------------- 14 | 15 | (1 row) 16 | 17 | step wait_for_before_lock_ip: 18 | DO $$ 19 | BEGIN 20 | LOOP 21 | PERFORM pg_stat_clear_snapshot(); 22 | 23 | PERFORM 24 | FROM pg_stat_activity 25 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 26 | 27 | IF FOUND THEN 28 | EXIT; 29 | END IF; 30 | 31 | PERFORM pg_sleep(.1); 32 | END LOOP; 33 | END; 34 | $$; 35 | 36 | step do_changes: 37 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 38 | 39 | -- Update with no identity change. 40 | UPDATE tbl_src SET j=0 WHERE i=1; 41 | 42 | -- Update with identity change. 43 | UPDATE tbl_src SET i=6 WHERE i=4; 44 | 45 | -- Update a row we inserted, to check that the insertion is visible. 46 | UPDATE tbl_src SET j=7 WHERE i=2; 47 | -- ... and update it again, to check that the update is visible. 48 | UPDATE tbl_src SET j=8 WHERE j=7; 49 | 50 | -- Delete. 
51 | DELETE FROM tbl_src WHERE i=7; 52 | 53 | step wakeup_before_lock_ip: 54 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 55 | 56 | injection_points_wakeup 57 | ----------------------- 58 | 59 | (1 row) 60 | 61 | step wait_for_after_commit_ip: 62 | DO $$ 63 | BEGIN 64 | LOOP 65 | PERFORM pg_stat_clear_snapshot(); 66 | 67 | PERFORM 68 | FROM pg_stat_activity 69 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 70 | 71 | IF FOUND THEN 72 | EXIT; 73 | END IF; 74 | 75 | PERFORM pg_sleep(.1); 76 | END LOOP; 77 | END; 78 | $$; 79 | 80 | step do_check: 81 | TABLE pg_rewrite_progress; 82 | 83 | SELECT i, j, k, l FROM tbl_src ORDER BY i, j; 84 | 85 | src_table|dst_table|src_table_new|ins_initial|ins|upd|del 86 | ---------+---------+-------------+-----------+---+---+--- 87 | tbl_src |tbl_dst |tbl_src_old | 3| 3| 4| 1 88 | (1 row) 89 | 90 | i| j| k| l 91 | -+--+---+--- 92 | 1| 0| 0| 0 93 | 2| 8| -8| -8 94 | 3|30|-30|-30 95 | 5|50|-50|-50 96 | 6|40|-40|-40 97 | (5 rows) 98 | 99 | step wakeup_after_commit_ip: 100 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 101 | 102 | injection_points_wakeup 103 | ----------------------- 104 | 105 | (1 row) 106 | 107 | injection_points_detach 108 | ----------------------- 109 | 110 | (1 row) 111 | 112 | -------------------------------------------------------------------------------- /specs/pg_rewrite_concurrent_toast.spec: -------------------------------------------------------------------------------- 1 | setup 2 | { 3 | CREATE EXTENSION injection_points; 4 | CREATE EXTENSION pg_rewrite; 5 | 6 | CREATE TABLE tbl_src(i int primary key, t text); 7 | 8 | INSERT INTO tbl_src(i, t) 9 | SELECT x, string_agg(random()::text, '') 10 | FROM generate_series(1, 2) g(x), generate_series(1, 200) h(y) 11 | GROUP BY x; 12 | 13 | CREATE TABLE tbl_dst(i int primary key, t text); 14 | } 15 | 16 | teardown 17 | { 18 | DROP EXTENSION injection_points; 19 | DROP EXTENSION pg_rewrite; 20 | DROP TABLE 
tbl_src; 21 | DROP TABLE tbl_src_old; 22 | } 23 | 24 | session s1 25 | setup 26 | { 27 | SELECT injection_points_attach('pg_rewrite-before-lock', 'wait'); 28 | SELECT injection_points_attach('pg_rewrite-after-commit', 'wait'); 29 | } 30 | # Perform the initial load and wait for s2 to do some data changes. 31 | # 32 | # Since pg_rewrite uses background worker, the isolation tester does not 33 | # recognize that the session waits on an injection point (because the worker 34 | # is who waits). Therefore use rewrite_table_nowait(), which only launches the 35 | # worker and goes on. The 'wait_for_s1_sleep' step below then checks until the 36 | # waiting started. 37 | step do_rewrite 38 | { 39 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 40 | } 41 | # Check the data. 42 | step do_check 43 | { 44 | TABLE pg_rewrite_progress; 45 | 46 | -- Each row should contain TOASTed value. 47 | SELECT count(*) FROM tbl_src WHERE pg_column_toast_chunk_id(t) ISNULL; 48 | 49 | -- The contents of the new table should be identical to that of the old 50 | -- one. 51 | SELECT count(*) 52 | FROM tbl_src t1 JOIN tbl_src_old t2 ON t1.i = t2.i 53 | WHERE t1.t <> t2.t; 54 | } 55 | 56 | session s2 57 | # Since s1 uses background worker, the backend executing 'wait_before_lock' 58 | # does not appear to be waiting on the injection point. Instead we need to 59 | # check explicitly if the waiting on the injection point is in progress, and 60 | # wait if it's not. 
61 | step wait_for_before_lock_ip 62 | { 63 | DO $$ 64 | BEGIN 65 | LOOP 66 | PERFORM pg_stat_clear_snapshot(); 67 | 68 | PERFORM 69 | FROM pg_stat_activity 70 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 71 | 72 | IF FOUND THEN 73 | EXIT; 74 | END IF; 75 | 76 | PERFORM pg_sleep(.1); 77 | END LOOP; 78 | END; 79 | $$; 80 | } 81 | step do_changes 82 | { 83 | INSERT INTO tbl_src(i, t) 84 | SELECT 5, string_agg(random()::text, '') 85 | FROM generate_series(1, 200) h(y); 86 | 87 | UPDATE tbl_src SET t = t || 'x' WHERE i = 1; 88 | } 89 | step wakeup_before_lock_ip 90 | { 91 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 92 | } 93 | # Wait until the concurrent changes have been committed by the pg_rewrite 94 | # worker. 95 | step wait_for_after_commit_ip 96 | { 97 | DO $$ 98 | BEGIN 99 | LOOP 100 | PERFORM pg_stat_clear_snapshot(); 101 | 102 | PERFORM 103 | FROM pg_stat_activity 104 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 105 | 106 | IF FOUND THEN 107 | EXIT; 108 | END IF; 109 | 110 | PERFORM pg_sleep(.1); 111 | END LOOP; 112 | END; 113 | $$; 114 | } 115 | # Like wakeup_before_lock_ip above. 
116 | step wakeup_after_commit_ip 117 | { 118 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 119 | } 120 | teardown 121 | { 122 | SELECT injection_points_detach('pg_rewrite-before-lock'); 123 | SELECT injection_points_detach('pg_rewrite-after-commit'); 124 | } 125 | 126 | permutation 127 | do_rewrite 128 | wait_for_before_lock_ip 129 | do_changes 130 | wakeup_before_lock_ip 131 | wait_for_after_commit_ip 132 | do_check 133 | wakeup_after_commit_ip 134 | -------------------------------------------------------------------------------- /specs/pg_rewrite_concurrent.spec: -------------------------------------------------------------------------------- 1 | setup 2 | { 3 | CREATE EXTENSION injection_points; 4 | CREATE EXTENSION pg_rewrite; 5 | 6 | CREATE TABLE tbl_src(i int primary key, j int, 7 | k int generated always as (-j) virtual, 8 | l int generated always as (-j) stored); 9 | INSERT INTO tbl_src(i, j) VALUES (1, 10), (4, 40), (7, 70); 10 | 11 | -- Change of data type and column order. 12 | CREATE TABLE tbl_dst(j int, i bigint primary key, k int, l int); 13 | } 14 | 15 | teardown 16 | { 17 | DROP EXTENSION injection_points; 18 | DROP EXTENSION pg_rewrite; 19 | DROP TABLE tbl_src; 20 | DROP TABLE tbl_src_old; 21 | } 22 | 23 | session s1 24 | setup 25 | { 26 | SELECT injection_points_attach('pg_rewrite-before-lock', 'wait'); 27 | SELECT injection_points_attach('pg_rewrite-after-commit', 'wait'); 28 | } 29 | # Perform the initial load and wait for s2 to do some data changes. 30 | # 31 | # Since pg_rewrite uses background worker, the isolation tester does not 32 | # recognize that the session waits on an injection point (because the worker 33 | # is who waits). Therefore use rewrite_table_nowait(), which only launches the 34 | # worker and goes on. The 'wait_for_s1_sleep' step below then checks until the 35 | # waiting started. 36 | step do_rewrite 37 | { 38 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 39 | } 40 | # Check the data. 
41 | step do_check 42 | { 43 | TABLE pg_rewrite_progress; 44 | 45 | SELECT i, j, k, l FROM tbl_src ORDER BY i, j; 46 | } 47 | 48 | session s2 49 | # Since s1 uses background worker, the backend executing 'wait_before_lock' 50 | # does not appear to be waiting on the injection point. Instead we need to 51 | # check explicitly if the waiting on the injection point is in progress, and 52 | # wait if it's not. 53 | step wait_for_before_lock_ip 54 | { 55 | DO $$ 56 | BEGIN 57 | LOOP 58 | PERFORM pg_stat_clear_snapshot(); 59 | 60 | PERFORM 61 | FROM pg_stat_activity 62 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 63 | 64 | IF FOUND THEN 65 | EXIT; 66 | END IF; 67 | 68 | PERFORM pg_sleep(.1); 69 | END LOOP; 70 | END; 71 | $$; 72 | } 73 | step do_changes 74 | { 75 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 76 | 77 | -- Update with no identity change. 78 | UPDATE tbl_src SET j=0 WHERE i=1; 79 | 80 | -- Update with identity change. 81 | UPDATE tbl_src SET i=6 WHERE i=4; 82 | 83 | -- Update a row we inserted, to check that the insertion is visible. 84 | UPDATE tbl_src SET j=7 WHERE i=2; 85 | -- ... and update it again, to check that the update is visible. 86 | UPDATE tbl_src SET j=8 WHERE j=7; 87 | 88 | -- Delete. 89 | DELETE FROM tbl_src WHERE i=7; 90 | } 91 | step wakeup_before_lock_ip 92 | { 93 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 94 | } 95 | # Wait until the concurrent changes have been committed by the pg_rewrite 96 | # worker. 97 | step wait_for_after_commit_ip 98 | { 99 | DO $$ 100 | BEGIN 101 | LOOP 102 | PERFORM pg_stat_clear_snapshot(); 103 | 104 | PERFORM 105 | FROM pg_stat_activity 106 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 107 | 108 | IF FOUND THEN 109 | EXIT; 110 | END IF; 111 | 112 | PERFORM pg_sleep(.1); 113 | END LOOP; 114 | END; 115 | $$; 116 | } 117 | # Like wakeup_before_lock_ip above. 
118 | step wakeup_after_commit_ip 119 | { 120 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 121 | } 122 | teardown 123 | { 124 | SELECT injection_points_detach('pg_rewrite-before-lock'); 125 | SELECT injection_points_detach('pg_rewrite-after-commit'); 126 | } 127 | 128 | permutation 129 | do_rewrite 130 | wait_for_before_lock_ip 131 | do_changes 132 | wakeup_before_lock_ip 133 | wait_for_after_commit_ip 134 | do_check 135 | wakeup_after_commit_ip 136 | -------------------------------------------------------------------------------- /specs/pg_rewrite_concurrent_partition.spec: -------------------------------------------------------------------------------- 1 | setup 2 | { 3 | CREATE EXTENSION injection_points; 4 | CREATE EXTENSION pg_rewrite; 5 | 6 | CREATE TABLE tbl_src(i int primary key, j int); 7 | INSERT INTO tbl_src(i, j) VALUES (1, 10), (4, 40); 8 | 9 | -- Besides partitioning, also test change of column type (int -> bigint). 10 | CREATE TABLE tbl_dst(i bigint primary key, j int) PARTITION BY RANGE(i); 11 | CREATE TABLE tbl_dst_part_1 PARTITION OF tbl_dst FOR VALUES FROM (1) TO (4); 12 | 13 | -- Create a partition with different order of columns, to test that 14 | -- partition maps work. 15 | CREATE TABLE tbl_dst_part_2(j int, i bigint primary key); 16 | ALTER TABLE tbl_dst ATTACH PARTITION tbl_dst_part_2 FOR VALUES FROM (4) TO (8); 17 | } 18 | 19 | teardown 20 | { 21 | DROP EXTENSION injection_points; 22 | DROP EXTENSION pg_rewrite; 23 | DROP TABLE tbl_src; 24 | DROP TABLE tbl_src_old; 25 | } 26 | 27 | session s1 28 | setup 29 | { 30 | SELECT injection_points_attach('pg_rewrite-before-lock', 'wait'); 31 | SELECT injection_points_attach('pg_rewrite-after-commit', 'wait'); 32 | } 33 | # Perform the initial load and wait for s2 to do some data changes. 34 | # 35 | # Since pg_rewrite uses background worker, the isolation tester does not 36 | # recognize that the session waits on an injection point (because the worker 37 | # is who waits). 
Therefore use rewrite_table_nowait(), which only launches the 38 | # worker and goes on. The 'wait_for_s1_sleep' step below then checks until the 39 | # waiting started. 40 | step do_rewrite 41 | { 42 | SELECT rewrite_table_nowait('tbl_src', 'tbl_dst', 'tbl_src_old'); 43 | } 44 | # Check the data. 45 | step do_check 46 | { 47 | TABLE pg_rewrite_progress; 48 | 49 | SELECT i, j FROM tbl_src ORDER BY i, j; 50 | } 51 | 52 | session s2 53 | # Since s1 uses background worker, the backend executing 'wait_before_lock' 54 | # does not appear to be waiting on the injection point. Instead we need to 55 | # check explicitly if the waiting on the injection point is in progress, and 56 | # wait if it's not. 57 | step wait_for_before_lock_ip 58 | { 59 | DO $$ 60 | BEGIN 61 | LOOP 62 | PERFORM pg_stat_clear_snapshot(); 63 | 64 | PERFORM 65 | FROM pg_stat_activity 66 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-before-lock'); 67 | 68 | IF FOUND THEN 69 | EXIT; 70 | END IF; 71 | 72 | PERFORM pg_sleep(.1); 73 | END LOOP; 74 | END; 75 | $$; 76 | } 77 | step do_changes 78 | { 79 | -- Insert one row into each partition. 80 | INSERT INTO tbl_src VALUES (2, 20), (3, 30), (5, 50); 81 | 82 | -- Update with no identity change. 83 | UPDATE tbl_src SET j=0 WHERE i=1; 84 | 85 | -- Update with identity change but within the same partition. 86 | UPDATE tbl_src SET i=6 WHERE i=5; 87 | 88 | -- Cross-partition update. 89 | UPDATE tbl_src SET i=7 WHERE i=3; 90 | 91 | -- Update a row we inserted and updated, to check that it's visible. 92 | UPDATE tbl_src SET j=4 WHERE i=7; 93 | 94 | -- Delete. 95 | DELETE FROM tbl_src WHERE i=4; 96 | } 97 | step wakeup_before_lock_ip 98 | { 99 | SELECT injection_points_wakeup('pg_rewrite-before-lock'); 100 | } 101 | # Wait until the concurrent changes have been committed by the pg_rewrite 102 | # worker. 
103 | step wait_for_after_commit_ip 104 | { 105 | DO $$ 106 | BEGIN 107 | LOOP 108 | PERFORM pg_stat_clear_snapshot(); 109 | 110 | PERFORM 111 | FROM pg_stat_activity 112 | WHERE (wait_event_type, wait_event)=('InjectionPoint', 'pg_rewrite-after-commit'); 113 | 114 | IF FOUND THEN 115 | EXIT; 116 | END IF; 117 | 118 | PERFORM pg_sleep(.1); 119 | END LOOP; 120 | END; 121 | $$ 122 | } 123 | # Like wakeup_before_lock_ip above. 124 | step wakeup_after_commit_ip 125 | { 126 | SELECT injection_points_wakeup('pg_rewrite-after-commit'); 127 | } 128 | teardown 129 | { 130 | SELECT injection_points_detach('pg_rewrite-before-lock'); 131 | SELECT injection_points_detach('pg_rewrite-after-commit'); 132 | } 133 | 134 | permutation 135 | do_rewrite 136 | wait_for_before_lock_ip 137 | do_changes 138 | wakeup_before_lock_ip 139 | wait_for_after_commit_ip 140 | do_check 141 | wakeup_after_commit_ip 142 | -------------------------------------------------------------------------------- /sql/pg_rewrite.sql: -------------------------------------------------------------------------------- 1 | DROP EXTENSION IF EXISTS pg_rewrite; 2 | CREATE EXTENSION pg_rewrite; 3 | 4 | CREATE TABLE tab1(i int PRIMARY KEY, j int, k int); 5 | -- If a dropped column is encountered, the source tuple should be converted 6 | -- so it matches the destination table. 
7 | ALTER TABLE tab1 DROP COLUMN k; 8 | ALTER TABLE tab1 ADD COLUMN k int; 9 | INSERT INTO tab1(i, j, k) 10 | SELECT i, i / 2, i 11 | FROM generate_series(0, 1023) g(i); 12 | 13 | CREATE TABLE tab1_new(i int PRIMARY KEY, j int, k int) PARTITION BY RANGE(i); 14 | CREATE TABLE tab1_new_part_1 PARTITION OF tab1_new FOR VALUES FROM (0) TO (256); 15 | CREATE TABLE tab1_new_part_2 PARTITION OF tab1_new FOR VALUES FROM (256) TO (512); 16 | CREATE TABLE tab1_new_part_3 PARTITION OF tab1_new FOR VALUES FROM (512) TO (768); 17 | CREATE TABLE tab1_new_part_4 PARTITION OF tab1_new FOR VALUES FROM (768) TO (1024); 18 | 19 | -- Also test handling of constraints that require "manual" validation. 20 | ALTER TABLE tab1 ADD CHECK (k >= 0); 21 | 22 | CREATE TABLE tab1_fk(i int REFERENCES tab1); 23 | INSERT INTO tab1_fk(i) VALUES (1); 24 | \d tab1 25 | 26 | -- Process the table. 27 | SELECT rewrite_table('tab1', 'tab1_new', 'tab1_orig'); 28 | 29 | -- tab1 should now be partitioned. 30 | \d tab1 31 | 32 | -- Validate the constraints. 33 | ALTER TABLE tab1 VALIDATE CONSTRAINT tab1_k_check2; 34 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 35 | 36 | \d tab1 37 | 38 | EXPLAIN (COSTS off) SELECT * FROM tab1; 39 | 40 | -- Check that the contents has not changed. 
41 | SELECT count(*) FROM tab1; 42 | 43 | SELECT * 44 | FROM tab1 t FULL JOIN tab1_orig o ON t.i = o.i 45 | WHERE t.i ISNULL OR o.i ISNULL; 46 | 47 | -- List partitioning 48 | CREATE TABLE tab2(i int, j int, PRIMARY KEY (i, j)); 49 | INSERT INTO tab2(i, j) 50 | SELECT i, j 51 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 52 | 53 | CREATE TABLE tab2_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY LIST(i); 54 | CREATE TABLE tab2_new_part_1 PARTITION OF tab2_new FOR VALUES IN (1); 55 | CREATE TABLE tab2_new_part_2 PARTITION OF tab2_new FOR VALUES IN (2); 56 | CREATE TABLE tab2_new_part_3 PARTITION OF tab2_new FOR VALUES IN (3); 57 | CREATE TABLE tab2_new_part_4 PARTITION OF tab2_new FOR VALUES IN (4); 58 | 59 | SELECT rewrite_table('tab2', 'tab2_new', 'tab2_orig'); 60 | 61 | TABLE tab2_new_part_1; 62 | TABLE tab2_new_part_2; 63 | TABLE tab2_new_part_3; 64 | TABLE tab2_new_part_4; 65 | 66 | -- Hash partitioning 67 | CREATE TABLE tab3(i int, j int, PRIMARY KEY (i, j)); 68 | INSERT INTO tab3(i, j) 69 | SELECT i, j 70 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 71 | 72 | CREATE TABLE tab3_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY HASH(i); 73 | CREATE TABLE tab3_new_part_1 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 0); 74 | CREATE TABLE tab3_new_part_2 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 1); 75 | CREATE TABLE tab3_new_part_3 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 2); 76 | CREATE TABLE tab3_new_part_4 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 3); 77 | 78 | SELECT rewrite_table('tab3', 'tab3_new', 'tab3_orig'); 79 | 80 | TABLE tab3_new_part_1; 81 | TABLE tab3_new_part_2; 82 | TABLE tab3_new_part_3; 83 | TABLE tab3_new_part_4; 84 | 85 | -- Change of precision and scale of a numeric data type. 
86 | CREATE TABLE tab4(i int PRIMARY KEY, j numeric(3, 1)); 87 | INSERT INTO tab4(i, j) VALUES (1, 0.1); 88 | CREATE TABLE tab4_new(i int PRIMARY KEY, j numeric(4, 2)); 89 | TABLE tab4; 90 | SELECT rewrite_table('tab4', 'tab4_new', 'tab4_orig'); 91 | TABLE tab4; 92 | 93 | -- One more test for "manual" validation of FKs, this time we rewrite the PK 94 | -- table. The NOT VALID constraint cannot be used if the FK table is 95 | -- partitioned and if PG version is < 18, so we need a separate test. 96 | CREATE TABLE tab1_pk(i int primary key); 97 | INSERT INTO tab1_pk(i) VALUES (1); 98 | CREATE TABLE tab1_pk_new(i bigint primary key); 99 | 100 | DROP TABLE tab1_fk; 101 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk); 102 | INSERT INTO tab1_fk(i) VALUES (1); 103 | 104 | \d tab1_pk 105 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 106 | \d tab1_pk 107 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 108 | \d tab1_pk 109 | 110 | -- For the partitioned FK table, test at least that the FK creation is skipped 111 | -- (i.e. ERROR saying that NOT VALID is not supported is no raised) 112 | DROP TABLE tab1_fk; 113 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk) PARTITION BY RANGE (i); 114 | CREATE TABLE tab1_fk_1 PARTITION OF tab1_fk DEFAULT; 115 | INSERT INTO tab1_fk(i) VALUES (1); 116 | 117 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk_new; 118 | TRUNCATE TABLE tab1_pk_new; 119 | 120 | \d tab1_fk 121 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 122 | -- Note that tab1_fk still references tab1_pk_orig - that's expected. 123 | \d tab1_fk 124 | 125 | -- The same once again, but now rewrite the FK table. 
126 | DROP TABLE tab1_fk; 127 | DROP TABLE tab1_pk; 128 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk; 129 | CREATE TABLE tab1_fk(i int PRIMARY KEY REFERENCES tab1_pk); 130 | INSERT INTO tab1_fk(i) VALUES (1); 131 | CREATE TABLE tab1_fk_new(i int PRIMARY KEY) PARTITION BY RANGE (i); 132 | CREATE TABLE tab1_fk_new_1 PARTITION OF tab1_fk_new DEFAULT; 133 | \d tab1_fk 134 | SELECT rewrite_table('tab1_fk', 'tab1_fk_new', 'tab1_fk_orig'); 135 | \d tab1_fk 136 | 137 | -- Check if sequence on the target table is synchronized with that of the 138 | -- source table. 139 | CREATE TABLE tab5(i int primary key generated always as identity); 140 | CREATE TABLE tab5_new(i int primary key generated always as identity); 141 | INSERT INTO tab5(i) VALUES (DEFAULT); 142 | SELECT rewrite_table('tab5', 'tab5_new', 'tab5_orig'); 143 | INSERT INTO tab5(i) VALUES (DEFAULT); 144 | SELECT i FROM tab5 ORDER BY i; 145 | 146 | -- The same with serial column. 147 | CREATE TABLE tab6(i serial primary key); 148 | CREATE TABLE tab6_new(i serial primary key); 149 | INSERT INTO tab6(i) VALUES (DEFAULT); 150 | SELECT rewrite_table('tab6', 'tab6_new', 'tab6_orig'); 151 | INSERT INTO tab6(i) VALUES (DEFAULT); 152 | SELECT i FROM tab6 ORDER BY i; 153 | -------------------------------------------------------------------------------- /pg_rewrite.h: -------------------------------------------------------------------------------- 1 | /*---------------------------------------------------------------- 2 | * 3 | * pg_rewrite.h 4 | * Tools for maintenance that requires table rewriting. 
5 | * 6 | * Copyright (c) 2021-2025, Cybertec PostgreSQL International GmbH 7 | * 8 | *---------------------------------------------------------------- 9 | */ 10 | 11 | #include 12 | 13 | #include "c.h" 14 | #include "postgres.h" 15 | #include "fmgr.h" 16 | #include "miscadmin.h" 17 | 18 | #include "access/genam.h" 19 | #include "access/heapam.h" 20 | #include "access/relscan.h" 21 | #include "access/xlog_internal.h" 22 | #include "access/xact.h" 23 | #include "catalog/pg_class.h" 24 | #include "nodes/execnodes.h" 25 | #include "postmaster/bgworker.h" 26 | #include "replication/logical.h" 27 | #include "replication/origin.h" 28 | #include "utils/inval.h" 29 | #include "utils/resowner.h" 30 | #include "utils/snapmgr.h" 31 | 32 | typedef struct DecodingOutputState 33 | { 34 | /* The relation whose changes we're decoding. */ 35 | Oid relid; 36 | 37 | /* 38 | * Decoded changes are stored here. Although we try to avoid excessive 39 | * batches, it can happen that the changes need to be stored to disk. The 40 | * tuplestore does this transparently. 41 | */ 42 | Tuplestorestate *tstore; 43 | 44 | /* The current number of changes in tstore. */ 45 | double nchanges; 46 | 47 | /* 48 | * Descriptor to store the ConcurrentChange structure serialized (bytea). 49 | * We can't store the tuple directly because tuplestore only supports 50 | * minimum tuple and we may need to transfer OID system column from the 51 | * output plugin. Also we need to transfer the change kind, so it's better 52 | * to put everything in the structure than to use 2 tuplestores "in 53 | * parallel". 54 | */ 55 | TupleDesc tupdesc_change; 56 | 57 | /* 58 | * Tuple descriptor needed process the concurrent data changes. 59 | */ 60 | TupleDesc tupdesc_src; 61 | 62 | /* Slot to retrieve data from tstore. */ 63 | TupleTableSlot *tsslot; 64 | 65 | /* 66 | * WAL records having this origin have been created by the initial load 67 | * and should not be decoded. 
68 | */ 69 | RepOriginId rorigin; 70 | 71 | ResourceOwner resowner; 72 | } DecodingOutputState; 73 | 74 | /* The WAL segment being decoded. */ 75 | extern XLogSegNo rewrite_current_segment; 76 | 77 | extern void _PG_init(void); 78 | 79 | /* Progress tracking. */ 80 | typedef struct TaskProgress 81 | { 82 | /* Tuples inserted during the initial load. */ 83 | int64 ins_initial; 84 | 85 | /* 86 | * Tuples inserted, updated and deleted after the initial load (i.e. 87 | * during the catch-up phase). 88 | */ 89 | int64 ins; 90 | int64 upd; 91 | int64 del; 92 | } TaskProgress; 93 | 94 | /* 95 | * The new implementation, which delegates the execution to a background 96 | * worker (as opposed to the PG executor). 97 | * 98 | * Arguments are passed to the worker via this structure, located in the 99 | * shared memory. 100 | */ 101 | typedef struct WorkerTask 102 | { 103 | /* Connection info. */ 104 | Oid dbid; 105 | Oid roleid; 106 | 107 | /* Worker that performs the task both sets and clears this field. */ 108 | pid_t pid; 109 | 110 | /* See the comments of pg_rewrite_exit_if_requested(). */ 111 | bool exit_requested; 112 | 113 | /* The progress is only valid if the dbid is valid. */ 114 | TaskProgress progress; 115 | 116 | /* 117 | * Use this when setting / clearing the fields above. Once dbid is set, 118 | * the task belongs to the backend that set it, so the other fields may be 119 | * assigned w/o the lock. 120 | */ 121 | slock_t mutex; 122 | 123 | /* The tables to work on. */ 124 | NameData relschema; 125 | NameData relname; 126 | NameData relname_new; 127 | NameData relschema_dst; 128 | NameData relname_dst; 129 | 130 | /* 131 | * Space for the worker to send an error message to the backend. 132 | * 133 | * XXX Note that later messages overwrite the earlier ones, so only the 134 | * last message is received. Is it worth using a queue instead? 135 | */ 136 | #define MAX_ERR_MSG_LEN 1024 137 | char msg[MAX_ERR_MSG_LEN]; 138 | 139 | /* Detailed error message. 
*/ 140 | char msg_detail[MAX_ERR_MSG_LEN]; 141 | 142 | int elevel; 143 | 144 | /* 145 | * Should rewrite_table() return w/o waiting for the worker's exit? If 146 | * this flag is set, the worker is responsible for releasing the 147 | * task. Otherwise the worker must not release the task because the 148 | * backend might be interested in 'msg' and 'msg_detail'. 149 | */ 150 | bool nowait; 151 | 152 | int max_xlock_time; 153 | } WorkerTask; 154 | 155 | #define MAX_TASKS 8 156 | 157 | /* Each backend stores here the pointer to its task in the shared memory. */ 158 | extern WorkerTask *MyWorkerTask; 159 | 160 | /* 161 | * Like AttrMap in PG core, but here we add an array of expressions to coerce 162 | * the input values to output ones. (A new name is needed as it's hard to 163 | * avoid inclusion of the in-core structure.) 164 | */ 165 | typedef struct AttrMapExt 166 | { 167 | AttrNumber *attnums; 168 | int maplen; 169 | bool dropped_attr; /* Has outer or inner descriptor a dropped 170 | * attribute? */ 171 | Node **exprsIn; /* Non-NULL field tells how to convert the input 172 | * value to the output data type and/or to 173 | * evaluate the column expression. NULL indicates 174 | * that no conversion is needed and that there is 175 | * no expression for given column. */ 176 | Node **exprsOut; /* 177 | * Likewise, expression to compute the value of an 178 | * output column. 179 | */ 180 | } AttrMapExt; 181 | 182 | /* 183 | * Like TupleConversionMap in PG core, but here we add an array of expressions 184 | * to coerce the input values to output ones. (A new name is needed as it's 185 | * hard to avoid inclusion of the in-core structure.) 
186 | */ 187 | typedef struct TupleConversionMapExt 188 | { 189 | TupleDesc indesc; /* tupdesc for source rowtype */ 190 | TupleDesc outdesc; /* tupdesc for result rowtype */ 191 | AttrMapExt *attrMap; /* indexes of input fields, or 0 for null */ 192 | Datum *invalues; /* workspace for deconstructing source */ 193 | bool *inisnull; 194 | ExprState **exprsIn; /* See AttrMapExt */ 195 | ExprState **exprsOut; /* See AttrMapExt */ 196 | EState *estate; /* Executor state used to evaluate 197 | * coerceExprs. */ 198 | TupleTableSlot *in_slot; /* Slot to store the input tuple for 199 | * coercion. */ 200 | TupleTableSlot *out_slot; /* Slot to construct the output tuple. */ 201 | } TupleConversionMapExt; 202 | 203 | /* 204 | * Hash table to cache partition-specific information. 205 | */ 206 | typedef struct PartitionEntry 207 | { 208 | Oid part_oid; /* key */ 209 | Relation ident_index; 210 | 211 | /* 212 | * Slot (TTSOpsHeapTuple) to apply data changes to the partition. 213 | */ 214 | TupleTableSlot *slot; 215 | 216 | /* 217 | * Slot to retrieve tuples from the partition. Separate from 'slot' 218 | * because it has to be TTSOpsBufferHeapTuple. 219 | */ 220 | TupleTableSlot *slot_ind; 221 | 222 | /* This should make insertions into partitions more efficient. */ 223 | BulkInsertState bistate; 224 | 225 | /* 226 | * Map to convert tuples that match the partitioned table so they match 227 | * this partition.
228 | */ 229 | TupleConversionMapExt *conv_map; 230 | 231 | char status; /* used by simplehash */ 232 | } PartitionEntry; 233 | 234 | #define SH_PREFIX partitions 235 | #define SH_ELEMENT_TYPE PartitionEntry 236 | #define SH_KEY_TYPE Oid 237 | #define SH_KEY part_oid 238 | #define SH_HASH_KEY(tb, key) (key) 239 | #define SH_EQUAL(tb, a, b) ((a) == (b)) 240 | #define SH_SCOPE static inline 241 | #define SH_DECLARE 242 | #define SH_DEFINE 243 | #include "lib/simplehash.h" 244 | 245 | extern PGDLLEXPORT void rewrite_worker_main(Datum main_arg); 246 | 247 | extern void pg_rewrite_exit_if_requested(void); 248 | 249 | /* 250 | * Use function names distinct from those in pg_squeeze, in case both 251 | * extensions are installed. 252 | */ 253 | extern bool pg_rewrite_process_concurrent_changes(EState *estate, 254 | ModifyTableState *mtstate, 255 | struct PartitionTupleRouting *proute, 256 | LogicalDecodingContext *ctx, 257 | XLogRecPtr end_of_wal, 258 | ScanKey ident_key, 259 | int ident_key_nentries, 260 | Relation ident_index, 261 | TupleTableSlot *slot_dst_ind, 262 | LOCKMODE lock_held, 263 | partitions_hash *partitions, 264 | TupleConversionMapExt *conv_map, 265 | struct timeval *must_complete); 266 | extern bool pg_rewrite_decode_concurrent_changes(LogicalDecodingContext *ctx, 267 | XLogRecPtr end_of_wal, 268 | struct timeval *must_complete); 269 | extern HeapTuple convert_tuple_for_dest_table(HeapTuple tuple, 270 | TupleConversionMapExt *conv_map); 271 | extern void _PG_output_plugin_init(OutputPluginCallbacks *cb); 272 | extern PartitionEntry *get_partition_entry(partitions_hash *partitions, 273 | Oid part_oid); 274 | extern HeapTuple pg_rewrite_execute_attr_map_tuple(HeapTuple tuple, 275 | TupleConversionMapExt *map); 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_rewrite 2 | 3 | `pg_rewrite` is a tool to rewrite a table (i.e.
to copy its data to a new 4 | file). It allows both read and write access to the table during the rewriting. 5 | 6 | Following are the most common reasons to rewrite a table: 7 | 8 | 1. Change data type of column(s) 9 | 10 | Typically this is needed if the existing data type is running out of 11 | values. For example, you may need to change `integer` type to 12 | `bigint`. `ALTER TABLE` command can do that too, but it allows neither 13 | write nor read access to the table during the rewriting. 14 | 15 | 2. Partition the table 16 | 17 | If you realize that your table is getting much bigger than expected and 18 | that partitioning would make your life easier, the next question may be 19 | how to copy the existing data to the new, partitioned table without 20 | stopping all the applications that run DML commands on the table. (When 21 | you decide to use partitioning, the amount of data to copy might already 22 | be significant, so the copying might need a while.) 23 | 24 | 3. Change order of columns 25 | 26 | If you conclude that a different order of columns would save significant 27 | disk space (due to reduced padding), the problem boils down to copying 28 | data to a new table like in 2). Again, you may need `pg_rewrite` to make 29 | the change smooth. 30 | 31 | 4. Move table into another tablespace. 32 | 33 | `ALTER TABLE` command can do that, but it allows neither write nor read 34 | access to the table during the rewriting. With `pg_rewrite`, you only need 35 | to create the new table in the desired tablespace. The rest is identical 36 | to the other use cases. 37 | 38 | Note that the above use cases can be combined in a single rewrite. 39 | 40 | 41 | # INSTALLATION 42 | 43 | Install PostgreSQL before proceeding. Make sure to have `pg_config` binary, 44 | these are typically included in `-dev` and `-devel` packages. PostgreSQL server 45 | version 13 or later is required.
46 | 47 | ```bash 48 | git clone https://github.com/cybertec-postgresql/pg_rewrite.git 49 | cd pg_rewrite 50 | git checkout 51 | make 52 | make install 53 | ``` 54 | 55 | Add these to `postgresql.conf`: 56 | 57 | ``` 58 | wal_level = logical 59 | max_replication_slots = 1 # ... or add 1 to the current value. 60 | shared_preload_libraries = 'pg_rewrite' # ... or add the library to the existing ones. 61 | ``` 62 | 63 | Restart the cluster, and invoke: 64 | 65 | ``` 66 | CREATE EXTENSION pg_rewrite; 67 | ``` 68 | 69 | # USAGE 70 | 71 | Assume you have a table defined like this 72 | 73 | ``` 74 | CREATE TABLE measurement ( 75 | id int, 76 | city_id int not null, 77 | logdate date not null, 78 | peaktemp int, 79 | PRIMARY KEY(id, logdate) 80 | ); 81 | ``` 82 | 83 | and you need to replace it with a partitioned table. At the same time, you 84 | want to change the data type of the `id` column to `bigint`. 85 | 86 | 87 | ``` 88 | CREATE TABLE measurement_aux ( 89 | id bigint, 90 | city_id int not null, 91 | logdate date not null, 92 | peaktemp int, 93 | PRIMARY KEY(id, logdate) 94 | ) PARTITION BY RANGE (logdate); 95 | ``` 96 | 97 | Then create partitions for all the rows currently present in the `measurement` 98 | table, and also for the data that might be inserted during processing: 99 | 100 | ``` 101 | CREATE TABLE measurement_y2006m02 PARTITION OF measurement_aux 102 | FOR VALUES FROM ('2006-02-01') TO ('2006-03-01'); 103 | 104 | CREATE TABLE measurement_y2006m03 PARTITION OF measurement_aux 105 | FOR VALUES FROM ('2006-03-01') TO ('2006-04-01'); 106 | 107 | -- ... 108 | ``` 109 | 110 | *It's essential that both the source (`measurement`) and target 111 | (`measurement_aux`) table have an identity index. It is needed to process data 112 | changes that applications make while data is being copied from the source to 113 | the target table. If the replica identity of the table is DEFAULT or FULL, 114 | primary key constraint provides the identity index. 
If your table has no 115 | primary key, you need to set the identity index explicitly using the [ALTER 116 | TABLE ... REPLICA IDENTITY USING INDEX ...][1] command. 117 | 118 | Also note that the key (i.e. column list) of the identity index of the source 119 | and target table must be identical.* 120 | 121 | Then, in order to copy the data into the target table, run the 122 | `rewrite_table()` function and pass it both the source and target table, as 123 | well as a new table name for the source table. For example: 124 | 125 | ``` 126 | SELECT rewrite_table('measurement', 'measurement_aux', 'measurement_old'); 127 | ``` 128 | 129 | The call will first copy all rows from `measurement` to `measurement_aux`. Then 130 | it will apply to `measurement_aux` all the data changes (INSERT, UPDATE, 131 | DELETE) that took place in `measurement` during the copying. Next, it will 132 | lock `measurement` so that neither read nor write access is possible. Finally 133 | it will rename `measurement` to `measurement_old` and `measurement_aux` to 134 | `measurement`. Thus `measurement` ends up being the partitioned table, while 135 | `measurement_old` is the original, non-partitioned table. 136 | 137 | If a column of the target table has a different data type from the 138 | corresponding column of the source table, an implicit or assignment cast must 139 | exist between the two types. 140 | 141 | # Constraints 142 | 143 | The target table should obviously end up with the same constraints as the 144 | source table. It's recommended to handle constraints creation this way: 145 | 146 | 1. Add PRIMARY KEY, UNIQUE and EXCLUDE constraints of the source table to the 147 | target table before you call `rewrite_table()`. These are enforced during 148 | the rewriting, so any violation would make `rewrite_table()` fail 149 | (ROLLBACK).
(The constraints must have been enforced in the source table, 150 | but it does not hurt to check them in the target table, especially if the 151 | column data type is being changed.) 152 | 153 | 2. If the version of PostgreSQL server is 17 or lower, add NOT NULL 154 | constraints of the source table to the target table. `rewrite_table()` 155 | by-passes validation of these, but all the rows it inserts into the target 156 | table must have been validated in the source table. Even if the column 157 | data type is different in the target table, the data type conversion 158 | should not turn a non-NULL value to NULL or vice versa. 159 | 160 | 3. CHECK constraints are created automatically by `rewrite_table()` 161 | (according to the source table) when all the data changes have been 162 | applied to the target table. However, these constraints are created as NOT 163 | VALID, so you need to use the `ALTER TABLE ... VALIDATE CONSTRAINT ...` 164 | command to validate them. 165 | 166 | (The function does not create these constraints immediately as valid, 167 | because that could imply blocking access to the table for significant 168 | time.) 169 | 170 | 4. If the version of PostgreSQL server is 18 or higher, NOT NULL constraints 171 | are also created automatically and need to be validated using the `ALTER 172 | TABLE ... VALIDATE CONSTRAINT ...` command. 173 | 174 | 5. FOREIGN KEY constraints are also created automatically (according to the 175 | source table) and need to be validated using the `ALTER TABLE ... VALIDATE 176 | CONSTRAINT ...` command, unless the referencing table is partitioned and 177 | the version of PostgreSQL server is 17 or lower: those versions do not 178 | support the NOT VALID option for partitioned tables. 179 | 180 | Therefore, if the referencing table is partitioned and if the server 181 | version is 17 or lower, you need to use the `ALTER TABLE ... ADD 182 | CONSTRAINT ... FOREIGN KEY ...` command after `rewrite_table()` has 183 | finished.
Please run the command as soon as possible to minimize the risk 184 | that applications modify the database in a way that violates the 185 | constraints. 186 | 187 | 6. Drop all foreign keys involving the source table. 188 | 189 | You probably want to drop the source table anyway, but if you don't, you 190 | should at least drop its FOREIGN KEY constraints. As the table was 191 | renamed, applications will no longer update it. Therefore, attempts to 192 | update the other tables involved in its foreign keys may cause errors. 193 | 194 | # Sequences 195 | 196 | If a sequence is used to generate column value in the source table (typically 197 | the column data type is `serial` or the column is declared `GENERATED ... AS 198 | IDENTITY`), and if `rewrite_table()` finds the corresponding sequence for the 199 | target table, it sets its value according to the sequence for the source 200 | table. If it cannot identify the sequence for the target table, a log message 201 | is printed out. 202 | 203 | # Progress monitoring 204 | 205 | If `rewrite_table()` takes a long time to finish, you might be interested in the 206 | progress. The `pg_rewrite_progress` view shows all the pending calls of the 207 | function in the current database. The `src_table`, `dst_table` and 208 | `src_table_new` columns contain the arguments of the `rewrite_table()` 209 | function. `ins_initial` is the number of tuples inserted into the new table 210 | storage during the "initial load stage", i.e. the number of tuples present in 211 | the table before the processing started. On the other hand, `ins`, `upd` and 212 | `del` are the numbers of tuples inserted, updated and deleted by applications 213 | during the table processing. (These "concurrent data changes" must also be 214 | incorporated into the partitioned table, otherwise they'd get lost.) 215 | 216 | # Limitations 217 | 218 | 1. If the target table is partitioned, it's not allowed to have foreign 219 | tables as partitions. 220 | 221 | 2.
Indexes are not renamed. 222 | 223 | While the target table (`measurement_aux` above) is renamed to the source 224 | table (`measurement`), its indexes are not renamed to match the source 225 | table. If you consider it a problem, please use the `ALTER INDEX` command 226 | to rename them. This operation blocks neither reads nor writes. 227 | 228 | # Configuration 229 | 230 | Following is the description of the configuration variables that affect 231 | behavior of the functions of this extension. 232 | 233 | * `rewrite.max_xlock_time` 234 | 235 | Although the table being processed is available for both read and write 236 | operations by other transactions most of the time, an exclusive lock is needed 237 | to finalize the processing (i.e. to do the table renaming), which blocks both 238 | read and write access. This should take a very short time that users should 239 | hardly notice. 240 | 241 | However, if a significant amount of changes took place in the source table 242 | while the extension was waiting for the (exclusive) lock, the outage might 243 | take proportionally longer time. The point is that those changes need to be 244 | propagated to the target table before the exclusive lock can be released. 245 | 246 | If the extension function seems to block access to tables too much, consider 247 | setting `rewrite.max_xlock_time` GUC parameter. For example: 248 | 249 | ``` 250 | SET rewrite.max_xlock_time TO 100; 251 | ``` 252 | 253 | This tells that the exclusive lock shouldn't be held for more than 0.1 second (100 254 | milliseconds). If more time is needed for the final stage, the particular 255 | function releases the exclusive lock, processes the changes committed by the 256 | other transactions in between and tries the final stage again. Error is 257 | reported if the lock duration is exceeded a few more times. If that happens, 258 | you should either increase the setting or try to process the problematic table 259 | later, when the write activity is lower.
260 | 261 | The default value is `0`, meaning that the final stage can take as much time as 262 | it needs. 263 | 264 | # Concurrency 265 | 266 | 1. While the rewrite_table() function is executing, `ALTER TABLE` command on 267 | the same table should be blocked until the rewriting is done. However, in 268 | some cases the `ALTER TABLE` command and the rewrite_table() function might 269 | end up in a deadlock. Therefore it's recommended not to run ALTER TABLE on 270 | a table which is being rewritten. 271 | 272 | 2. The `rewrite_table()` function allows for MVCC-unsafe behavior described in 273 | the first paragraph of [mvcc-caveats][2]. 274 | 275 | 276 | [1] https://www.postgresql.org/docs/17/sql-altertable.html 277 | [2] https://www.postgresql.org/docs/current/mvcc-caveats.html 278 | -------------------------------------------------------------------------------- /expected/pg_rewrite_1.out: -------------------------------------------------------------------------------- 1 | DROP EXTENSION IF EXISTS pg_rewrite; 2 | NOTICE: extension "pg_rewrite" does not exist, skipping 3 | CREATE EXTENSION pg_rewrite; 4 | CREATE TABLE tab1(i int PRIMARY KEY, j int, k int); 5 | -- If a dropped column is encountered, the source tuple should be converted 6 | -- so it matches the destination table. 7 | ALTER TABLE tab1 DROP COLUMN k; 8 | ALTER TABLE tab1 ADD COLUMN k int; 9 | INSERT INTO tab1(i, j, k) 10 | SELECT i, i / 2, i 11 | FROM generate_series(0, 1023) g(i); 12 | CREATE TABLE tab1_new(i int PRIMARY KEY, j int, k int) PARTITION BY RANGE(i); 13 | CREATE TABLE tab1_new_part_1 PARTITION OF tab1_new FOR VALUES FROM (0) TO (256); 14 | CREATE TABLE tab1_new_part_2 PARTITION OF tab1_new FOR VALUES FROM (256) TO (512); 15 | CREATE TABLE tab1_new_part_3 PARTITION OF tab1_new FOR VALUES FROM (512) TO (768); 16 | CREATE TABLE tab1_new_part_4 PARTITION OF tab1_new FOR VALUES FROM (768) TO (1024); 17 | -- Also test handling of constraints that require "manual" validation. 
18 | ALTER TABLE tab1 ADD CHECK (k >= 0); 19 | CREATE TABLE tab1_fk(i int REFERENCES tab1); 20 | INSERT INTO tab1_fk(i) VALUES (1); 21 | \d tab1 22 | Table "public.tab1" 23 | Column | Type | Collation | Nullable | Default 24 | --------+---------+-----------+----------+--------- 25 | i | integer | | not null | 26 | j | integer | | | 27 | k | integer | | | 28 | Indexes: 29 | "tab1_pkey" PRIMARY KEY, btree (i) 30 | Check constraints: 31 | "tab1_k_check" CHECK (k >= 0) 32 | Referenced by: 33 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1(i) 34 | 35 | -- Process the table. 36 | SELECT rewrite_table('tab1', 'tab1_new', 'tab1_orig'); 37 | rewrite_table 38 | --------------- 39 | 40 | (1 row) 41 | 42 | -- tab1 should now be partitioned. 43 | \d tab1 44 | Partitioned table "public.tab1" 45 | Column | Type | Collation | Nullable | Default 46 | --------+---------+-----------+----------+--------- 47 | i | integer | | not null | 48 | j | integer | | | 49 | k | integer | | | 50 | Partition key: RANGE (i) 51 | Indexes: 52 | "tab1_new_pkey" PRIMARY KEY, btree (i) 53 | Check constraints: 54 | "tab1_k_check2" CHECK (k >= 0) NOT VALID 55 | Referenced by: 56 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) NOT VALID 57 | Number of partitions: 4 (Use \d+ to list them.) 58 | 59 | -- Validate the constraints. 
60 | ALTER TABLE tab1 VALIDATE CONSTRAINT tab1_k_check2; 61 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 62 | \d tab1 63 | Partitioned table "public.tab1" 64 | Column | Type | Collation | Nullable | Default 65 | --------+---------+-----------+----------+--------- 66 | i | integer | | not null | 67 | j | integer | | | 68 | k | integer | | | 69 | Partition key: RANGE (i) 70 | Indexes: 71 | "tab1_new_pkey" PRIMARY KEY, btree (i) 72 | Check constraints: 73 | "tab1_k_check2" CHECK (k >= 0) 74 | Referenced by: 75 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) 76 | Number of partitions: 4 (Use \d+ to list them.) 77 | 78 | EXPLAIN (COSTS off) SELECT * FROM tab1; 79 | QUERY PLAN 80 | ------------------------------------------ 81 | Append 82 | -> Seq Scan on tab1_new_part_1 tab1_1 83 | -> Seq Scan on tab1_new_part_2 tab1_2 84 | -> Seq Scan on tab1_new_part_3 tab1_3 85 | -> Seq Scan on tab1_new_part_4 tab1_4 86 | (5 rows) 87 | 88 | -- Check that the contents has not changed. 
89 | SELECT count(*) FROM tab1; 90 | count 91 | ------- 92 | 1024 93 | (1 row) 94 | 95 | SELECT * 96 | FROM tab1 t FULL JOIN tab1_orig o ON t.i = o.i 97 | WHERE t.i ISNULL OR o.i ISNULL; 98 | i | j | k | i | j | k 99 | ---+---+---+---+---+--- 100 | (0 rows) 101 | 102 | -- List partitioning 103 | CREATE TABLE tab2(i int, j int, PRIMARY KEY (i, j)); 104 | INSERT INTO tab2(i, j) 105 | SELECT i, j 106 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 107 | CREATE TABLE tab2_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY LIST(i); 108 | CREATE TABLE tab2_new_part_1 PARTITION OF tab2_new FOR VALUES IN (1); 109 | CREATE TABLE tab2_new_part_2 PARTITION OF tab2_new FOR VALUES IN (2); 110 | CREATE TABLE tab2_new_part_3 PARTITION OF tab2_new FOR VALUES IN (3); 111 | CREATE TABLE tab2_new_part_4 PARTITION OF tab2_new FOR VALUES IN (4); 112 | SELECT rewrite_table('tab2', 'tab2_new', 'tab2_orig'); 113 | rewrite_table 114 | --------------- 115 | 116 | (1 row) 117 | 118 | TABLE tab2_new_part_1; 119 | i | j 120 | ---+--- 121 | 1 | 1 122 | 1 | 2 123 | 1 | 3 124 | 1 | 4 125 | (4 rows) 126 | 127 | TABLE tab2_new_part_2; 128 | i | j 129 | ---+--- 130 | 2 | 1 131 | 2 | 2 132 | 2 | 3 133 | 2 | 4 134 | (4 rows) 135 | 136 | TABLE tab2_new_part_3; 137 | i | j 138 | ---+--- 139 | 3 | 1 140 | 3 | 2 141 | 3 | 3 142 | 3 | 4 143 | (4 rows) 144 | 145 | TABLE tab2_new_part_4; 146 | i | j 147 | ---+--- 148 | 4 | 1 149 | 4 | 2 150 | 4 | 3 151 | 4 | 4 152 | (4 rows) 153 | 154 | -- Hash partitioning 155 | CREATE TABLE tab3(i int, j int, PRIMARY KEY (i, j)); 156 | INSERT INTO tab3(i, j) 157 | SELECT i, j 158 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 159 | CREATE TABLE tab3_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY HASH(i); 160 | CREATE TABLE tab3_new_part_1 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 0); 161 | CREATE TABLE tab3_new_part_2 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 1); 162 | CREATE TABLE tab3_new_part_3 
PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 2); 163 | CREATE TABLE tab3_new_part_4 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 3); 164 | SELECT rewrite_table('tab3', 'tab3_new', 'tab3_orig'); 165 | rewrite_table 166 | --------------- 167 | 168 | (1 row) 169 | 170 | TABLE tab3_new_part_1; 171 | i | j 172 | ---+--- 173 | 1 | 1 174 | 1 | 2 175 | 1 | 3 176 | 1 | 4 177 | (4 rows) 178 | 179 | TABLE tab3_new_part_2; 180 | i | j 181 | ---+--- 182 | 3 | 1 183 | 3 | 2 184 | 3 | 3 185 | 3 | 4 186 | (4 rows) 187 | 188 | TABLE tab3_new_part_3; 189 | i | j 190 | ---+--- 191 | 2 | 1 192 | 2 | 2 193 | 2 | 3 194 | 2 | 4 195 | (4 rows) 196 | 197 | TABLE tab3_new_part_4; 198 | i | j 199 | ---+--- 200 | 4 | 1 201 | 4 | 2 202 | 4 | 3 203 | 4 | 4 204 | (4 rows) 205 | 206 | -- Change of precision and scale of a numeric data type. 207 | CREATE TABLE tab4(i int PRIMARY KEY, j numeric(3, 1)); 208 | INSERT INTO tab4(i, j) VALUES (1, 0.1); 209 | CREATE TABLE tab4_new(i int PRIMARY KEY, j numeric(4, 2)); 210 | TABLE tab4; 211 | i | j 212 | ---+----- 213 | 1 | 0.1 214 | (1 row) 215 | 216 | SELECT rewrite_table('tab4', 'tab4_new', 'tab4_orig'); 217 | rewrite_table 218 | --------------- 219 | 220 | (1 row) 221 | 222 | TABLE tab4; 223 | i | j 224 | ---+------ 225 | 1 | 0.10 226 | (1 row) 227 | 228 | -- One more test for "manual" validation of FKs, this time we rewrite the PK 229 | -- table. The NOT VALID constraint cannot be used if the FK table is 230 | -- partitioned and if PG version is < 18, so we need a separate test. 
231 | CREATE TABLE tab1_pk(i int primary key); 232 | INSERT INTO tab1_pk(i) VALUES (1); 233 | CREATE TABLE tab1_pk_new(i bigint primary key); 234 | DROP TABLE tab1_fk; 235 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk); 236 | INSERT INTO tab1_fk(i) VALUES (1); 237 | \d tab1_pk 238 | Table "public.tab1_pk" 239 | Column | Type | Collation | Nullable | Default 240 | --------+---------+-----------+----------+--------- 241 | i | integer | | not null | 242 | Indexes: 243 | "tab1_pk_pkey" PRIMARY KEY, btree (i) 244 | Referenced by: 245 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 246 | 247 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 248 | rewrite_table 249 | --------------- 250 | 251 | (1 row) 252 | 253 | \d tab1_pk 254 | Table "public.tab1_pk" 255 | Column | Type | Collation | Nullable | Default 256 | --------+--------+-----------+----------+--------- 257 | i | bigint | | not null | 258 | Indexes: 259 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 260 | Referenced by: 261 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) NOT VALID 262 | 263 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 264 | \d tab1_pk 265 | Table "public.tab1_pk" 266 | Column | Type | Collation | Nullable | Default 267 | --------+--------+-----------+----------+--------- 268 | i | bigint | | not null | 269 | Indexes: 270 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 271 | Referenced by: 272 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) 273 | 274 | -- For the partitioned FK table, test at least that the FK creation is skipped 275 | -- (i.e. 
ERROR saying that NOT VALID is not supported is no raised) 276 | DROP TABLE tab1_fk; 277 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk) PARTITION BY RANGE (i); 278 | CREATE TABLE tab1_fk_1 PARTITION OF tab1_fk DEFAULT; 279 | INSERT INTO tab1_fk(i) VALUES (1); 280 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk_new; 281 | TRUNCATE TABLE tab1_pk_new; 282 | \d tab1_fk 283 | Partitioned table "public.tab1_fk" 284 | Column | Type | Collation | Nullable | Default 285 | --------+---------+-----------+----------+--------- 286 | i | integer | | | 287 | Partition key: RANGE (i) 288 | Foreign-key constraints: 289 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 290 | Number of partitions: 1 (Use \d+ to list them.) 291 | 292 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 293 | rewrite_table 294 | --------------- 295 | 296 | (1 row) 297 | 298 | -- Note that tab1_fk still references tab1_pk_orig - that's expected. 299 | \d tab1_fk 300 | Partitioned table "public.tab1_fk" 301 | Column | Type | Collation | Nullable | Default 302 | --------+---------+-----------+----------+--------- 303 | i | integer | | | 304 | Partition key: RANGE (i) 305 | Foreign-key constraints: 306 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk_orig(i) 307 | Number of partitions: 1 (Use \d+ to list them.) 308 | 309 | -- The same once again, but now rewrite the FK table. 
310 | DROP TABLE tab1_fk; 311 | DROP TABLE tab1_pk; 312 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk; 313 | CREATE TABLE tab1_fk(i int PRIMARY KEY REFERENCES tab1_pk); 314 | INSERT INTO tab1_fk(i) VALUES (1); 315 | CREATE TABLE tab1_fk_new(i int PRIMARY KEY) PARTITION BY RANGE (i); 316 | CREATE TABLE tab1_fk_new_1 PARTITION OF tab1_fk_new DEFAULT; 317 | \d tab1_fk 318 | Table "public.tab1_fk" 319 | Column | Type | Collation | Nullable | Default 320 | --------+---------+-----------+----------+--------- 321 | i | integer | | not null | 322 | Indexes: 323 | "tab1_fk_pkey" PRIMARY KEY, btree (i) 324 | Foreign-key constraints: 325 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 326 | 327 | SELECT rewrite_table('tab1_fk', 'tab1_fk_new', 'tab1_fk_orig'); 328 | NOTICE: FOREIGN KEY with NOT VALID option cannot be added to partitioned table 329 | rewrite_table 330 | --------------- 331 | 332 | (1 row) 333 | 334 | \d tab1_fk 335 | Partitioned table "public.tab1_fk" 336 | Column | Type | Collation | Nullable | Default 337 | --------+---------+-----------+----------+--------- 338 | i | integer | | not null | 339 | Partition key: RANGE (i) 340 | Indexes: 341 | "tab1_fk_new_pkey" PRIMARY KEY, btree (i) 342 | Number of partitions: 1 (Use \d+ to list them.) 343 | 344 | -- Check if sequence on the target table is synchronized with that of the 345 | -- source table. 346 | CREATE TABLE tab5(i int primary key generated always as identity); 347 | CREATE TABLE tab5_new(i int primary key generated always as identity); 348 | INSERT INTO tab5(i) VALUES (DEFAULT); 349 | SELECT rewrite_table('tab5', 'tab5_new', 'tab5_orig'); 350 | rewrite_table 351 | --------------- 352 | 353 | (1 row) 354 | 355 | INSERT INTO tab5(i) VALUES (DEFAULT); 356 | SELECT i FROM tab5 ORDER BY i; 357 | i 358 | --- 359 | 1 360 | 2 361 | (2 rows) 362 | 363 | -- The same with serial column. 
364 | CREATE TABLE tab6(i serial primary key); 365 | CREATE TABLE tab6_new(i serial primary key); 366 | INSERT INTO tab6(i) VALUES (DEFAULT); 367 | SELECT rewrite_table('tab6', 'tab6_new', 'tab6_orig'); 368 | rewrite_table 369 | --------------- 370 | 371 | (1 row) 372 | 373 | INSERT INTO tab6(i) VALUES (DEFAULT); 374 | SELECT i FROM tab6 ORDER BY i; 375 | i 376 | --- 377 | 1 378 | 2 379 | (2 rows) 380 | 381 | -------------------------------------------------------------------------------- /expected/pg_rewrite.out: -------------------------------------------------------------------------------- 1 | DROP EXTENSION IF EXISTS pg_rewrite; 2 | NOTICE: extension "pg_rewrite" does not exist, skipping 3 | CREATE EXTENSION pg_rewrite; 4 | CREATE TABLE tab1(i int PRIMARY KEY, j int, k int); 5 | -- If a dropped column is encountered, the source tuple should be converted 6 | -- so it matches the destination table. 7 | ALTER TABLE tab1 DROP COLUMN k; 8 | ALTER TABLE tab1 ADD COLUMN k int; 9 | INSERT INTO tab1(i, j, k) 10 | SELECT i, i / 2, i 11 | FROM generate_series(0, 1023) g(i); 12 | CREATE TABLE tab1_new(i int PRIMARY KEY, j int, k int) PARTITION BY RANGE(i); 13 | CREATE TABLE tab1_new_part_1 PARTITION OF tab1_new FOR VALUES FROM (0) TO (256); 14 | CREATE TABLE tab1_new_part_2 PARTITION OF tab1_new FOR VALUES FROM (256) TO (512); 15 | CREATE TABLE tab1_new_part_3 PARTITION OF tab1_new FOR VALUES FROM (512) TO (768); 16 | CREATE TABLE tab1_new_part_4 PARTITION OF tab1_new FOR VALUES FROM (768) TO (1024); 17 | -- Also test handling of constraints that require "manual" validation. 
18 | ALTER TABLE tab1 ADD CHECK (k >= 0); 19 | CREATE TABLE tab1_fk(i int REFERENCES tab1); 20 | INSERT INTO tab1_fk(i) VALUES (1); 21 | \d tab1 22 | Table "public.tab1" 23 | Column | Type | Collation | Nullable | Default 24 | --------+---------+-----------+----------+--------- 25 | i | integer | | not null | 26 | j | integer | | | 27 | k | integer | | | 28 | Indexes: 29 | "tab1_pkey" PRIMARY KEY, btree (i) 30 | Check constraints: 31 | "tab1_k_check" CHECK (k >= 0) 32 | Referenced by: 33 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1(i) 34 | 35 | -- Process the table. 36 | SELECT rewrite_table('tab1', 'tab1_new', 'tab1_orig'); 37 | rewrite_table 38 | --------------- 39 | 40 | (1 row) 41 | 42 | -- tab1 should now be partitioned. 43 | \d tab1 44 | Partitioned table "public.tab1" 45 | Column | Type | Collation | Nullable | Default 46 | --------+---------+-----------+----------+--------- 47 | i | integer | | not null | 48 | j | integer | | | 49 | k | integer | | | 50 | Partition key: RANGE (i) 51 | Indexes: 52 | "tab1_new_pkey" PRIMARY KEY, btree (i) 53 | Check constraints: 54 | "tab1_k_check2" CHECK (k >= 0) NOT VALID 55 | Referenced by: 56 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) NOT VALID 57 | Number of partitions: 4 (Use \d+ to list them.) 58 | 59 | -- Validate the constraints. 
60 | ALTER TABLE tab1 VALIDATE CONSTRAINT tab1_k_check2; 61 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 62 | \d tab1 63 | Partitioned table "public.tab1" 64 | Column | Type | Collation | Nullable | Default 65 | --------+---------+-----------+----------+--------- 66 | i | integer | | not null | 67 | j | integer | | | 68 | k | integer | | | 69 | Partition key: RANGE (i) 70 | Indexes: 71 | "tab1_new_pkey" PRIMARY KEY, btree (i) 72 | Check constraints: 73 | "tab1_k_check2" CHECK (k >= 0) 74 | Referenced by: 75 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1(i) 76 | Number of partitions: 4 (Use \d+ to list them.) 77 | 78 | EXPLAIN (COSTS off) SELECT * FROM tab1; 79 | QUERY PLAN 80 | ------------------------------------------ 81 | Append 82 | -> Seq Scan on tab1_new_part_1 tab1_1 83 | -> Seq Scan on tab1_new_part_2 tab1_2 84 | -> Seq Scan on tab1_new_part_3 tab1_3 85 | -> Seq Scan on tab1_new_part_4 tab1_4 86 | (5 rows) 87 | 88 | -- Check that the contents has not changed. 
89 | SELECT count(*) FROM tab1; 90 | count 91 | ------- 92 | 1024 93 | (1 row) 94 | 95 | SELECT * 96 | FROM tab1 t FULL JOIN tab1_orig o ON t.i = o.i 97 | WHERE t.i ISNULL OR o.i ISNULL; 98 | i | j | k | i | j | k 99 | ---+---+---+---+---+--- 100 | (0 rows) 101 | 102 | -- List partitioning 103 | CREATE TABLE tab2(i int, j int, PRIMARY KEY (i, j)); 104 | INSERT INTO tab2(i, j) 105 | SELECT i, j 106 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 107 | CREATE TABLE tab2_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY LIST(i); 108 | CREATE TABLE tab2_new_part_1 PARTITION OF tab2_new FOR VALUES IN (1); 109 | CREATE TABLE tab2_new_part_2 PARTITION OF tab2_new FOR VALUES IN (2); 110 | CREATE TABLE tab2_new_part_3 PARTITION OF tab2_new FOR VALUES IN (3); 111 | CREATE TABLE tab2_new_part_4 PARTITION OF tab2_new FOR VALUES IN (4); 112 | SELECT rewrite_table('tab2', 'tab2_new', 'tab2_orig'); 113 | rewrite_table 114 | --------------- 115 | 116 | (1 row) 117 | 118 | TABLE tab2_new_part_1; 119 | i | j 120 | ---+--- 121 | 1 | 1 122 | 1 | 2 123 | 1 | 3 124 | 1 | 4 125 | (4 rows) 126 | 127 | TABLE tab2_new_part_2; 128 | i | j 129 | ---+--- 130 | 2 | 1 131 | 2 | 2 132 | 2 | 3 133 | 2 | 4 134 | (4 rows) 135 | 136 | TABLE tab2_new_part_3; 137 | i | j 138 | ---+--- 139 | 3 | 1 140 | 3 | 2 141 | 3 | 3 142 | 3 | 4 143 | (4 rows) 144 | 145 | TABLE tab2_new_part_4; 146 | i | j 147 | ---+--- 148 | 4 | 1 149 | 4 | 2 150 | 4 | 3 151 | 4 | 4 152 | (4 rows) 153 | 154 | -- Hash partitioning 155 | CREATE TABLE tab3(i int, j int, PRIMARY KEY (i, j)); 156 | INSERT INTO tab3(i, j) 157 | SELECT i, j 158 | FROM generate_series(1, 4) g(i), generate_series(1, 4) h(j); 159 | CREATE TABLE tab3_new(i int, j int, PRIMARY KEY (i, j)) PARTITION BY HASH(i); 160 | CREATE TABLE tab3_new_part_1 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 0); 161 | CREATE TABLE tab3_new_part_2 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 1); 162 | CREATE TABLE tab3_new_part_3 
PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 2); 163 | CREATE TABLE tab3_new_part_4 PARTITION OF tab3_new FOR VALUES WITH (MODULUS 4, REMAINDER 3); 164 | SELECT rewrite_table('tab3', 'tab3_new', 'tab3_orig'); 165 | rewrite_table 166 | --------------- 167 | 168 | (1 row) 169 | 170 | TABLE tab3_new_part_1; 171 | i | j 172 | ---+--- 173 | 1 | 1 174 | 1 | 2 175 | 1 | 3 176 | 1 | 4 177 | (4 rows) 178 | 179 | TABLE tab3_new_part_2; 180 | i | j 181 | ---+--- 182 | 3 | 1 183 | 3 | 2 184 | 3 | 3 185 | 3 | 4 186 | (4 rows) 187 | 188 | TABLE tab3_new_part_3; 189 | i | j 190 | ---+--- 191 | 2 | 1 192 | 2 | 2 193 | 2 | 3 194 | 2 | 4 195 | (4 rows) 196 | 197 | TABLE tab3_new_part_4; 198 | i | j 199 | ---+--- 200 | 4 | 1 201 | 4 | 2 202 | 4 | 3 203 | 4 | 4 204 | (4 rows) 205 | 206 | -- Change of precision and scale of a numeric data type. 207 | CREATE TABLE tab4(i int PRIMARY KEY, j numeric(3, 1)); 208 | INSERT INTO tab4(i, j) VALUES (1, 0.1); 209 | CREATE TABLE tab4_new(i int PRIMARY KEY, j numeric(4, 2)); 210 | TABLE tab4; 211 | i | j 212 | ---+----- 213 | 1 | 0.1 214 | (1 row) 215 | 216 | SELECT rewrite_table('tab4', 'tab4_new', 'tab4_orig'); 217 | rewrite_table 218 | --------------- 219 | 220 | (1 row) 221 | 222 | TABLE tab4; 223 | i | j 224 | ---+------ 225 | 1 | 0.10 226 | (1 row) 227 | 228 | -- One more test for "manual" validation of FKs, this time we rewrite the PK 229 | -- table. The NOT VALID constraint cannot be used if the FK table is 230 | -- partitioned and if PG version is < 18, so we need a separate test. 
231 | CREATE TABLE tab1_pk(i int primary key); 232 | INSERT INTO tab1_pk(i) VALUES (1); 233 | CREATE TABLE tab1_pk_new(i bigint primary key); 234 | DROP TABLE tab1_fk; 235 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk); 236 | INSERT INTO tab1_fk(i) VALUES (1); 237 | \d tab1_pk 238 | Table "public.tab1_pk" 239 | Column | Type | Collation | Nullable | Default 240 | --------+---------+-----------+----------+--------- 241 | i | integer | | not null | 242 | Indexes: 243 | "tab1_pk_pkey" PRIMARY KEY, btree (i) 244 | Referenced by: 245 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 246 | 247 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 248 | rewrite_table 249 | --------------- 250 | 251 | (1 row) 252 | 253 | \d tab1_pk 254 | Table "public.tab1_pk" 255 | Column | Type | Collation | Nullable | Default 256 | --------+--------+-----------+----------+--------- 257 | i | bigint | | not null | 258 | Indexes: 259 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 260 | Referenced by: 261 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) NOT VALID 262 | 263 | ALTER TABLE tab1_fk VALIDATE CONSTRAINT tab1_fk_i_fkey2; 264 | \d tab1_pk 265 | Table "public.tab1_pk" 266 | Column | Type | Collation | Nullable | Default 267 | --------+--------+-----------+----------+--------- 268 | i | bigint | | not null | 269 | Indexes: 270 | "tab1_pk_new_pkey" PRIMARY KEY, btree (i) 271 | Referenced by: 272 | TABLE "tab1_fk" CONSTRAINT "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) 273 | 274 | -- For the partitioned FK table, test at least that the FK creation is skipped 275 | -- (i.e. 
ERROR saying that NOT VALID is not supported is no raised) 276 | DROP TABLE tab1_fk; 277 | CREATE TABLE tab1_fk(i int REFERENCES tab1_pk) PARTITION BY RANGE (i); 278 | CREATE TABLE tab1_fk_1 PARTITION OF tab1_fk DEFAULT; 279 | INSERT INTO tab1_fk(i) VALUES (1); 280 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk_new; 281 | TRUNCATE TABLE tab1_pk_new; 282 | \d tab1_fk 283 | Partitioned table "public.tab1_fk" 284 | Column | Type | Collation | Nullable | Default 285 | --------+---------+-----------+----------+--------- 286 | i | integer | | | 287 | Partition key: RANGE (i) 288 | Foreign-key constraints: 289 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 290 | Number of partitions: 1 (Use \d+ to list them.) 291 | 292 | SELECT rewrite_table('tab1_pk', 'tab1_pk_new', 'tab1_pk_orig'); 293 | rewrite_table 294 | --------------- 295 | 296 | (1 row) 297 | 298 | -- Note that tab1_fk still references tab1_pk_orig - that's expected. 299 | \d tab1_fk 300 | Partitioned table "public.tab1_fk" 301 | Column | Type | Collation | Nullable | Default 302 | --------+---------+-----------+----------+--------- 303 | i | integer | | | 304 | Partition key: RANGE (i) 305 | Foreign-key constraints: 306 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk_orig(i) 307 | Number of partitions: 1 (Use \d+ to list them.) 308 | 309 | -- The same once again, but now rewrite the FK table. 
310 | DROP TABLE tab1_fk; 311 | DROP TABLE tab1_pk; 312 | ALTER TABLE tab1_pk_orig RENAME TO tab1_pk; 313 | CREATE TABLE tab1_fk(i int PRIMARY KEY REFERENCES tab1_pk); 314 | INSERT INTO tab1_fk(i) VALUES (1); 315 | CREATE TABLE tab1_fk_new(i int PRIMARY KEY) PARTITION BY RANGE (i); 316 | CREATE TABLE tab1_fk_new_1 PARTITION OF tab1_fk_new DEFAULT; 317 | \d tab1_fk 318 | Table "public.tab1_fk" 319 | Column | Type | Collation | Nullable | Default 320 | --------+---------+-----------+----------+--------- 321 | i | integer | | not null | 322 | Indexes: 323 | "tab1_fk_pkey" PRIMARY KEY, btree (i) 324 | Foreign-key constraints: 325 | "tab1_fk_i_fkey" FOREIGN KEY (i) REFERENCES tab1_pk(i) 326 | 327 | SELECT rewrite_table('tab1_fk', 'tab1_fk_new', 'tab1_fk_orig'); 328 | rewrite_table 329 | --------------- 330 | 331 | (1 row) 332 | 333 | \d tab1_fk 334 | Partitioned table "public.tab1_fk" 335 | Column | Type | Collation | Nullable | Default 336 | --------+---------+-----------+----------+--------- 337 | i | integer | | not null | 338 | Partition key: RANGE (i) 339 | Indexes: 340 | "tab1_fk_new_pkey" PRIMARY KEY, btree (i) 341 | Foreign-key constraints: 342 | "tab1_fk_i_fkey2" FOREIGN KEY (i) REFERENCES tab1_pk(i) NOT VALID 343 | Number of partitions: 1 (Use \d+ to list them.) 344 | 345 | -- Check if sequence on the target table is synchronized with that of the 346 | -- source table. 347 | CREATE TABLE tab5(i int primary key generated always as identity); 348 | CREATE TABLE tab5_new(i int primary key generated always as identity); 349 | INSERT INTO tab5(i) VALUES (DEFAULT); 350 | SELECT rewrite_table('tab5', 'tab5_new', 'tab5_orig'); 351 | rewrite_table 352 | --------------- 353 | 354 | (1 row) 355 | 356 | INSERT INTO tab5(i) VALUES (DEFAULT); 357 | SELECT i FROM tab5 ORDER BY i; 358 | i 359 | --- 360 | 1 361 | 2 362 | (2 rows) 363 | 364 | -- The same with serial column. 
365 | CREATE TABLE tab6(i serial primary key); 366 | CREATE TABLE tab6_new(i serial primary key); 367 | INSERT INTO tab6(i) VALUES (DEFAULT); 368 | SELECT rewrite_table('tab6', 'tab6_new', 'tab6_orig'); 369 | rewrite_table 370 | --------------- 371 | 372 | (1 row) 373 | 374 | INSERT INTO tab6(i) VALUES (DEFAULT); 375 | SELECT i FROM tab6 ORDER BY i; 376 | i 377 | --- 378 | 1 379 | 2 380 | (2 rows) 381 | 382 | -------------------------------------------------------------------------------- /concurrent.c: -------------------------------------------------------------------------------- 1 | /*----------------------------------------------------------------------------------- 2 | * 3 | * concurrent.c 4 | * Tools for maintenance that requires table rewriting. 5 | * 6 | * This file handles changes that took place while the data is being 7 | * copied from one table to another one. 8 | * 9 | * Copyright (c) 2021-2025, Cybertec PostgreSQL International GmbH 10 | * 11 | *----------------------------------------------------------------------------------- 12 | */ 13 | 14 | 15 | #include "pg_rewrite.h" 16 | 17 | #include "access/heaptoast.h" 18 | #include "executor/execPartition.h" 19 | #include "executor/executor.h" 20 | #include "replication/decode.h" 21 | #include "utils/rel.h" 22 | 23 | typedef enum 24 | { 25 | CHANGE_INSERT, 26 | CHANGE_UPDATE_OLD, 27 | CHANGE_UPDATE_NEW, 28 | CHANGE_DELETE 29 | } ConcurrentChangeKind; 30 | 31 | typedef struct ConcurrentChange 32 | { 33 | /* See the enum above. */ 34 | ConcurrentChangeKind kind; 35 | 36 | /* 37 | * The actual tuple. 38 | * 39 | * The tuple data follows the ConcurrentChange structure. Before use make 40 | * sure the tuple is correctly aligned (ConcurrentChange can be stored as 41 | * bytea) and that tuple->t_data is fixed. 
42 | */ 43 | HeapTupleData tup_data; 44 | } ConcurrentChange; 45 | 46 | static void apply_concurrent_changes(EState *estate, ModifyTableState *mtstate, 47 | struct PartitionTupleRouting *proute, 48 | DecodingOutputState *dstate, 49 | ScanKey key, int nkeys, 50 | Relation ident_index, 51 | TupleTableSlot *slot_dst_ind, 52 | partitions_hash *partitions, 53 | TupleConversionMapExt *conv_map, 54 | struct timeval *must_complete); 55 | static void apply_insert(HeapTuple tup, TupleTableSlot *slot, 56 | EState *estate, ModifyTableState *mtstate, 57 | struct PartitionTupleRouting *proute, 58 | partitions_hash *partitions, 59 | TupleConversionMapExt *conv_map, 60 | BulkInsertState bistate); 61 | static void apply_update_or_delete(HeapTuple tup, 62 | HeapTuple tup_old, 63 | ConcurrentChangeKind change_kind, 64 | EState *estate, 65 | ScanKey key, int nkeys, Relation ident_index, 66 | TupleTableSlot *slot_dst, 67 | TupleTableSlot *slot_dst_ind, 68 | ModifyTableState *mtstate, 69 | struct PartitionTupleRouting *proute, 70 | partitions_hash *partitions, 71 | TupleConversionMapExt *conv_map); 72 | static void find_tuple_in_partition(HeapTuple tup, Relation partition, 73 | partitions_hash *partitions, 74 | ScanKey key, int nkeys, ItemPointer ctid); 75 | static void find_tuple(HeapTuple tup, Relation rel, Relation ident_index, 76 | ScanKey key, int nkeys, ItemPointer ctid, 77 | TupleTableSlot *slot_dst_ind); 78 | static bool processing_time_elapsed(struct timeval *utmost); 79 | 80 | static void plugin_startup(LogicalDecodingContext *ctx, 81 | OutputPluginOptions *opt, bool is_init); 82 | static void plugin_shutdown(LogicalDecodingContext *ctx); 83 | static void plugin_begin_txn(LogicalDecodingContext *ctx, 84 | ReorderBufferTXN *txn); 85 | static void plugin_commit_txn(LogicalDecodingContext *ctx, 86 | ReorderBufferTXN *txn, XLogRecPtr commit_lsn); 87 | static void plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, 88 | Relation rel, ReorderBufferChange *change); 89 | 
static void store_change(LogicalDecodingContext *ctx,
						 ConcurrentChangeKind kind, HeapTuple tuple);
static HeapTuple get_changed_tuple(ConcurrentChange *change);
static bool plugin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id);

/*
 * Decode and apply concurrent changes. If there are too many of them, split
 * the processing into multiple iterations so that the intermediate storage
 * (tuplestore) is not likely to be written to disk.
 *
 * See check_catalog_changes() for explanation of lock_held argument.
 *
 * Returns true if must_complete is NULL or if managed to complete by the time
 * *must_complete indicates.
 */
bool
pg_rewrite_process_concurrent_changes(EState *estate,
									  ModifyTableState *mtstate,
									  struct PartitionTupleRouting *proute,
									  LogicalDecodingContext *ctx,
									  XLogRecPtr end_of_wal,
									  ScanKey ident_key,
									  int ident_key_nentries,
									  Relation ident_index,
									  TupleTableSlot *slot_dst_ind,
									  LOCKMODE lock_held,
									  partitions_hash *partitions,
									  TupleConversionMapExt *conv_map,
									  struct timeval *must_complete)
{
	DecodingOutputState *dstate;
	bool		done;

	/*
	 * Some arguments are specific to partitioned table, some to
	 * non-partitioned one. XXX Is some refactoring needed here, such as using
	 * an union?
	 */
	Assert((ident_index && slot_dst_ind && partitions == NULL
			&& proute == NULL) ||
		   (ident_index == NULL && slot_dst_ind == NULL &&
			partitions && proute));

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/*
	 * If some changes could not be applied due to time constraint, make sure
	 * the tuplestore is empty before we insert new tuples into it.
	 */
	if (dstate->nchanges > 0)
		apply_concurrent_changes(estate, mtstate, proute,
								 dstate, ident_key, ident_key_nentries,
								 ident_index, slot_dst_ind,
								 partitions, conv_map, must_complete);

	/* Ran out of time? */
	if (dstate->nchanges > 0)
		return false;

	done = false;
	while (!done)
	{
		pg_rewrite_exit_if_requested();

		/* Decode the next batch of WAL into the tuplestore. */
		done = pg_rewrite_decode_concurrent_changes(ctx, end_of_wal,
													must_complete);

		if (processing_time_elapsed(must_complete))
			/* Caller is responsible for applying the changes. */
			return false;

		/* Nothing decoded in this round? Move on to the next batch. */
		if (dstate->nchanges == 0)
			continue;

		/*
		 * XXX Consider if it's possible to check *must_complete and stop
		 * processing partway through. Partial cleanup of the tuplestore seems
		 * non-trivial.
		 */
		apply_concurrent_changes(estate, mtstate, proute,
								 dstate, ident_key, ident_key_nentries,
								 ident_index, slot_dst_ind,
								 partitions, conv_map, must_complete);

		/* Ran out of time? */
		if (dstate->nchanges > 0)
			return false;
	}

	return true;
}

/*
 * Decode logical changes from the XLOG sequence up to end_of_wal.
 *
 * Returns true iff done (for now), i.e. no more changes below the end_of_wal
 * can be decoded.
 */
bool
pg_rewrite_decode_concurrent_changes(LogicalDecodingContext *ctx,
									 XLogRecPtr end_of_wal,
									 struct timeval *must_complete)
{
	DecodingOutputState *dstate;
	ResourceOwner resowner_old;

	/*
	 * Invalidate the "present" cache before moving to "(recent) history".
	 *
	 * Note: The cache entry of the transient relation is not affected
	 * (because it was created by the current transaction), but the tuple
	 * descriptor shouldn't change anyway (as opposed to index info, which we
	 * change at some point). Moreover, tuples of the transient relation
	 * should not actually be deconstructed: reorderbuffer.c records the
	 * tuples, but - as it never receives the corresponding commit record -
	 * does not examine them in detail.
	 */
	InvalidateSystemCaches();

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/* Decode under the decoding state's resource owner. */
	resowner_old = CurrentResourceOwner;
	CurrentResourceOwner = dstate->resowner;

	PG_TRY();
	{
		while (ctx->reader->EndRecPtr < end_of_wal)
		{
			XLogRecord *record;
			XLogSegNo	segno_new;
			char	   *errm = NULL;
			XLogRecPtr	end_lsn;

			record = XLogReadRecord(ctx->reader, &errm);
			if (errm)
				elog(ERROR, "%s", errm);

			if (record != NULL)
				LogicalDecodingProcessRecord(ctx, ctx->reader);

			/* Respect the deadline, if one was given. */
			if (processing_time_elapsed(must_complete))
				break;

			/*
			 * If WAL segment boundary has been crossed, inform PG core that
			 * we no longer need the previous segment.
			 */
			end_lsn = ctx->reader->EndRecPtr;
			XLByteToSeg(end_lsn, segno_new, wal_segment_size);
			if (segno_new != rewrite_current_segment)
			{
				LogicalConfirmReceivedLocation(end_lsn);
				elog(DEBUG1, "pg_rewrite: confirmed receive location %X/%X",
					 (uint32) (end_lsn >> 32), (uint32) end_lsn);
				rewrite_current_segment = segno_new;
			}

			pg_rewrite_exit_if_requested();
		}
		InvalidateSystemCaches();
		CurrentResourceOwner = resowner_old;
	}
	PG_CATCH();
	{
		/* Restore the caller's state before re-throwing. */
		InvalidateSystemCaches();
		CurrentResourceOwner = resowner_old;
		PG_RE_THROW();
	}
	PG_END_TRY();

	elog(DEBUG1, "pg_rewrite: %.0f changes decoded but not applied yet",
		 dstate->nchanges);

	return ctx->reader->EndRecPtr >= end_of_wal;
}

/*
 * Apply changes that happened during the initial load.
 *
 * Scan key is passed by caller, so it does not have to be constructed
 * multiple times. Key entries have all fields initialized, except for
 * sk_argument.
 */
static void
apply_concurrent_changes(EState *estate, ModifyTableState *mtstate,
						 struct PartitionTupleRouting *proute,
						 DecodingOutputState *dstate,
						 ScanKey key, int nkeys,
						 Relation ident_index,
						 TupleTableSlot *slot_dst_ind,
						 partitions_hash *partitions,
						 TupleConversionMapExt *conv_map,
						 struct timeval *must_complete)
{
	BulkInsertState bistate = NULL;
	HeapTuple	tup_old = NULL;
	Relation	rel_dst;
	TupleTableSlot *slot_dst;

	if (dstate->nchanges == 0)
		return;

	/* See perform_initial_load() */
	if (proute == NULL)
		bistate = GetBulkInsertState();

	/*
	 * Slot for the destination relation is needed even in the partitioned
	 * case, to route changes to partitions.
	 */
	rel_dst = mtstate->resultRelInfo->ri_RelationDesc;
	slot_dst = MakeSingleTupleTableSlot(RelationGetDescr(rel_dst),
										&TTSOpsHeapTuple);

	/*
	 * In case functions in the index need the active snapshot and caller
	 * hasn't set one.
	 */
	PushActiveSnapshot(GetTransactionSnapshot());

	while (tuplestore_gettupleslot(dstate->tstore, true, false,
								   dstate->tsslot))
	{
		bool		shouldFree;
		HeapTuple	tup_change,
					tup;
		char	   *change_raw;
		ConcurrentChange *change;
		bool		isnull[1];
		Datum		values[1];

		Assert(dstate->nchanges > 0);
		dstate->nchanges--;

		/* Get the change from the single-column tuple. */
		tup_change = ExecFetchSlotHeapTuple(dstate->tsslot, false, &shouldFree);
		heap_deform_tuple(tup_change, dstate->tupdesc_change, values, isnull);
		Assert(!isnull[0]);

		/* This is bytea, but char* is easier to work with. */
		change_raw = (char *) DatumGetByteaP(values[0]);

		change = (ConcurrentChange *) VARDATA(change_raw);

		/* Copy the tuple out of the change (also fixes alignment). */
		tup = get_changed_tuple(change);

		if (change->kind == CHANGE_UPDATE_OLD)
		{
			/* Remember the old version; CHANGE_UPDATE_NEW follows. */
			Assert(tup_old == NULL);
			tup_old = tup;
		}
		else if (change->kind == CHANGE_INSERT)
		{
			Assert(tup_old == NULL);
			apply_insert(tup, slot_dst, estate, mtstate, proute,
						 partitions, conv_map, bistate);
		}
		else if (change->kind == CHANGE_UPDATE_NEW ||
				 change->kind == CHANGE_DELETE)
		{
			apply_update_or_delete(tup, tup_old, change->kind,
								   estate, key, nkeys, ident_index,
								   slot_dst, slot_dst_ind, mtstate, proute,
								   partitions, conv_map);

			/* The function is responsible for freeing. */
			if (tup_old != NULL)
				tup_old = NULL;
		}
		else
			elog(ERROR, "Unrecognized kind of change: %d", change->kind);

		/* If there's any change, make it visible to the next iteration. */
		if (change->kind != CHANGE_UPDATE_OLD)
		{
			CommandCounterIncrement();
			UpdateActiveSnapshotCommandId();
		}

		/* TTSOpsMinimalTuple has .get_heap_tuple==NULL. */
		Assert(shouldFree);
		pfree(tup_change);

		/*
		 * If there is a limit on the time of completion, check it
		 * now. However, make sure the loop does not break if tup_old was set
		 * in the previous iteration. In such a case we could not resume the
		 * processing in the next call.
		 */
		if (must_complete && tup_old == NULL &&
			processing_time_elapsed(must_complete))
			/* The next call will process the remaining changes. */
			break;
	}

	/* If we could not apply all the changes, the next call will do. */
	if (dstate->nchanges == 0)
		tuplestore_clear(dstate->tstore);

	PopActiveSnapshot();

	/* Cleanup. */
	if (bistate)
		FreeBulkInsertState(bistate);

	ExecDropSingleTupleTableSlot(slot_dst);
}

/*
 * Apply a single decoded INSERT to the destination table, routing it to the
 * appropriate partition when the destination is partitioned.
 *
 * 'bistate' is only used for the non-partitioned case; in the partitioned
 * case the per-partition bulk-insert state is looked up instead.
 */
static void
apply_insert(HeapTuple tup, TupleTableSlot *slot,
			 EState *estate, ModifyTableState *mtstate,
			 struct PartitionTupleRouting *proute,
			 partitions_hash *partitions, TupleConversionMapExt *conv_map,
			 BulkInsertState bistate)
{
	List	   *recheck;
	Relation	rel_ins;
	ResultRelInfo *rri = NULL;

	/* Convert the tuple to match the destination table if needed. */
	if (conv_map)
		tup = convert_tuple_for_dest_table(tup, conv_map);
	ExecStoreHeapTuple(tup, slot, false);
	if (proute)
	{
		PartitionEntry *entry;

		/* Which partition does the tuple belong to? */
		rri = ExecFindPartition(mtstate, mtstate->rootResultRelInfo,
								proute, slot, estate);
		rel_ins = rri->ri_RelationDesc;

		entry = get_partition_entry(partitions,
									RelationGetRelid(rel_ins));
		bistate = entry->bistate;

		/*
		 * Make sure the tuple matches the partition. The typical problem we
		 * address here is that a partition was attached that has a different
		 * order of columns.
		 */
		if (entry->conv_map)
		{
			tup = convert_tuple_for_dest_table(tup, entry->conv_map);
			ExecClearTuple(slot);
			ExecStoreHeapTuple(tup, slot, false);
		}
	}
	else
	{
		/* Non-partitioned table. */
		rri = mtstate->resultRelInfo;
		rel_ins = rri->ri_RelationDesc;
		/* Use bistate passed by the caller. */
	}
	Assert(bistate != NULL);
	table_tuple_insert(rel_ins, slot, GetCurrentCommandId(true), 0,
					   bistate);

#if PG_VERSION_NUM < 140000
	estate->es_result_relation_info = rri;
#endif
	/* Update indexes. */
	recheck = ExecInsertIndexTuples(
#if PG_VERSION_NUM >= 140000
									rri,
#endif
									slot,
									estate,
#if PG_VERSION_NUM >= 140000
									false,	/* update */
#endif
									false,	/* noDupErr */
									NULL,	/* specConflict */
									NIL		/* arbiterIndexes */
#if PG_VERSION_NUM >= 160000
									, false /* onlySummarizing */
#endif
		);
	ExecClearTuple(slot);

	pfree(tup);

	/*
	 * If recheck is required, it must have been performed on the source
	 * relation by now. (All the logical changes we process here are already
	 * committed.)
	 */
	list_free(recheck);

	/* Update the progress information. */
	SpinLockAcquire(&MyWorkerTask->mutex);
	MyWorkerTask->progress.ins++;
	SpinLockRelease(&MyWorkerTask->mutex);
}

/*
 * Apply a single decoded UPDATE (tup is the new version, tup_old the old one
 * if the identity key changed) or DELETE. Handles cross-partition updates by
 * deleting from the old partition and inserting into the new one.
 */
static void
apply_update_or_delete(HeapTuple tup, HeapTuple tup_old,
					   ConcurrentChangeKind change_kind,
					   EState *estate,
					   ScanKey key, int nkeys, Relation ident_index,
					   TupleTableSlot *slot_dst,
					   TupleTableSlot *slot_dst_ind,
					   ModifyTableState *mtstate,
					   struct PartitionTupleRouting *proute,
					   partitions_hash *partitions,
					   TupleConversionMapExt *conv_map)
{
	ResultRelInfo *rri,
			   *rri_old = NULL;

	/*
	 * Convert the tuple(s) to match the destination table.
	 */
	if (conv_map)
	{
		tup = convert_tuple_for_dest_table(tup, conv_map);

		if (tup_old)
		{
			Assert(change_kind == CHANGE_UPDATE_NEW);

			tup_old = convert_tuple_for_dest_table(tup_old, conv_map);
		}
	}

	/* Is the destination table partitioned? */
	if (proute)
	{
		/* Which partition does the tuple belong to?
		 */
		ExecStoreHeapTuple(tup, slot_dst, false);
		rri = ExecFindPartition(mtstate, mtstate->rootResultRelInfo,
								proute, slot_dst, estate);
		ExecClearTuple(slot_dst);

		/* ... and which partition does the old tuple version belong to? */
		if (change_kind == CHANGE_UPDATE_NEW && tup_old)
		{
			ExecStoreHeapTuple(tup_old, slot_dst, false);
			rri_old = ExecFindPartition(mtstate, mtstate->rootResultRelInfo,
										proute, slot_dst, estate);
			ExecClearTuple(slot_dst);
		}
	}
	else
		rri = mtstate->resultRelInfo;

	/* Is this a cross-partition update? */
	if (rri_old &&
		RelationGetRelid(rri_old->ri_RelationDesc) !=
		RelationGetRelid(rri->ri_RelationDesc))
	{
		ItemPointerData ctid;
		List	   *recheck;
		PartitionEntry *entry;

		/*
		 * Cross-partition update. Delete the old tuple from its partition.
		 */
		find_tuple_in_partition(tup_old, rri_old->ri_RelationDesc,
								partitions, key, nkeys, &ctid);
		simple_heap_delete(rri_old->ri_RelationDesc, &ctid);

		/* Update the progress information. */
		SpinLockAcquire(&MyWorkerTask->mutex);
		MyWorkerTask->progress.del++;
		SpinLockRelease(&MyWorkerTask->mutex);

		/*
		 * Insert the new tuple into its partition. This might include
		 * conversion to match the partition, see above.
		 */
		entry = get_partition_entry(partitions,
									RelationGetRelid(rri->ri_RelationDesc));
		if (entry->conv_map)
			tup = convert_tuple_for_dest_table(tup, entry->conv_map);
		ExecStoreHeapTuple(tup, entry->slot, false);
		table_tuple_insert(rri->ri_RelationDesc, entry->slot,
						   GetCurrentCommandId(true), 0, NULL);

#if PG_VERSION_NUM < 140000
		estate->es_result_relation_info = rri;
#endif
		/* Update indexes. */
		recheck = ExecInsertIndexTuples(
#if PG_VERSION_NUM >= 140000
										rri,
#endif
										entry->slot,
										estate,
#if PG_VERSION_NUM >= 140000
										false,	/* update */
#endif
										false,	/* noDupErr */
										NULL,	/* specConflict */
										NIL		/* arbiterIndexes */
#if PG_VERSION_NUM >= 160000
										, false /* onlySummarizing */
#endif
			);
		ExecClearTuple(entry->slot);

		/* Update the progress information. */
		SpinLockAcquire(&MyWorkerTask->mutex);
		MyWorkerTask->progress.ins++;
		SpinLockRelease(&MyWorkerTask->mutex);

		list_free(recheck);
	}
	else
	{
		HeapTuple	tup_key;
		ItemPointerData ctid;

		/*
		 * Both old and new tuple are in the same partition, or the target
		 * table is not partitioned. Find the tuple to be updated or deleted.
		 */
		if (change_kind == CHANGE_UPDATE_NEW)
			tup_key = tup_old != NULL ? tup_old : tup;
		else
		{
			Assert(change_kind == CHANGE_DELETE);
			Assert(tup_old == NULL);
			tup_key = tup;
		}

		if (partitions)
			find_tuple_in_partition(tup_key, rri->ri_RelationDesc,
									partitions, key, nkeys, &ctid);
		else
			find_tuple(tup_key, rri->ri_RelationDesc, ident_index, key, nkeys,
					   &ctid, slot_dst_ind);

		if (change_kind == CHANGE_UPDATE_NEW)
		{
			PartitionEntry *entry = NULL;

#if PG_VERSION_NUM >= 160000
			TU_UpdateIndexes update_indexes;
#endif

			if (partitions)
			{
				/*
				 * Make sure the tuple matches the partition.
				 */
				entry = get_partition_entry(partitions,
											RelationGetRelid(rri->ri_RelationDesc));
				if (entry->conv_map)
					tup = convert_tuple_for_dest_table(tup,
													   entry->conv_map);
			}

			simple_heap_update(rri->ri_RelationDesc, &ctid, tup
#if PG_VERSION_NUM >= 160000
							   , &update_indexes
#endif
				);

			/* A HOT update does not require index maintenance. */
			if (!HeapTupleIsHeapOnly(tup))
			{
				TupleTableSlot *slot;
				List	   *recheck;

				slot = entry ? entry->slot : slot_dst;

				ExecStoreHeapTuple(tup, slot, false);

				/*
				 * XXX Consider passing update=true, however it requires
				 * es_range_table to be initialized. Is it worth the
				 * complexity?
				 */
				recheck = ExecInsertIndexTuples(
#if PG_VERSION_NUM >= 140000
												rri,
#endif
												slot,
												estate,
#if PG_VERSION_NUM >= 140000
												false,	/* update */
#endif
												false,	/* noDupErr */
												NULL,	/* specConflict */
												NIL		/* arbiterIndexes */
#if PG_VERSION_NUM >= 160000
												/* onlySummarizing */
												, update_indexes == TU_Summarizing
#endif
					);
				ExecClearTuple(slot);
				list_free(recheck);
			}

			/* Update the progress information. */
			SpinLockAcquire(&MyWorkerTask->mutex);
			MyWorkerTask->progress.upd++;
			SpinLockRelease(&MyWorkerTask->mutex);
		}
		else
		{
			Assert(change_kind == CHANGE_DELETE);

			simple_heap_delete(rri->ri_RelationDesc, &ctid);

			/* Update the progress information. */
			SpinLockAcquire(&MyWorkerTask->mutex);
			MyWorkerTask->progress.del++;
			SpinLockRelease(&MyWorkerTask->mutex);
		}
	}

	/* Both tuples were (possibly converted) copies owned by us. */
	pfree(tup);
	if (tup_old)
		pfree(tup_old);
}

/*
 * Find tuple whose identity key is passed as 'tup' in relation 'rel' and put
 * its location into 'ctid'.
 */
static void
find_tuple_in_partition(HeapTuple tup, Relation partition,
						partitions_hash *partitions,
						ScanKey key, int nkeys, ItemPointer ctid)
{
	Oid			part_oid = RelationGetRelid(partition);
	HeapTuple	tup_mapped = NULL;
	PartitionEntry *entry;

	entry = partitions_lookup(partitions, part_oid);
	if (entry == NULL)
		elog(ERROR, "identity index not found for partition %u", part_oid);
	Assert(entry->part_oid == part_oid);

	/*
	 * Make sure the tuple matches the partition.
	 */
	if (entry->conv_map)
	{
		/*
		 * convert_tuple_for_dest_table() is not suitable here because we need
		 * to keep the original tuple. XXX Should we add a boolean argument to
		 * the function that indicates whether it should free the original
		 * tuple?
		 */
		tup_mapped = pg_rewrite_execute_attr_map_tuple(tup,
													   entry->conv_map);
		tup = tup_mapped;
	}
	find_tuple(tup, partition, entry->ident_index, key, nkeys, ctid,
			   entry->slot_ind);
	if (tup_mapped)
		pfree(tup_mapped);
}

/*
 * Find tuple whose identity key is passed as 'tup' in relation 'rel' and put
 * its location into 'ctid'. Raises ERROR if the tuple cannot be found via
 * the identity index.
 */
static void
find_tuple(HeapTuple tup, Relation rel, Relation ident_index, ScanKey key,
		   int nkeys, ItemPointer ctid, TupleTableSlot *slot_dst_ind)
{
	Form_pg_index ident_form;
	int2vector *ident_indkey;
	IndexScanDesc scan;
	int			i;
	HeapTuple	tup_exist;

	ident_form = ident_index->rd_index;
	ident_indkey = &ident_form->indkey;
	scan = index_beginscan(rel, ident_index, GetActiveSnapshot(),
#if PG_VERSION_NUM >= 180000
						   NULL,	/* instrument */
#endif
						   nkeys, 0);
	index_rescan(scan, key, nkeys, NULL, 0);

	/* Use the incoming tuple to finalize the scan key. */
	for (i = 0; i < scan->numberOfKeys; i++)
	{
		ScanKey		entry;
		bool		isnull;
		int16		attno_heap;

		entry = &scan->keyData[i];
		/* Map the index column to the underlying heap attribute. */
		attno_heap = ident_indkey->values[i];
		entry->sk_argument = heap_getattr(tup,
										  attno_heap,
										  rel->rd_att,
										  &isnull);
		/* Identity key columns must not be NULL. */
		Assert(!isnull);
	}
	if (index_getnext_slot(scan, ForwardScanDirection, slot_dst_ind))
	{
		bool		shouldFreeInd;

		tup_exist = ExecFetchSlotHeapTuple(slot_dst_ind, false,
										   &shouldFreeInd);
		/* TTSOpsBufferHeapTuple has .get_heap_tuple != NULL. */
		Assert(!shouldFreeInd);
	}
	else
		tup_exist = NULL;
	if (tup_exist == NULL)
		elog(ERROR, "Failed to find target tuple");
	ItemPointerCopy(&tup_exist->t_self, ctid);
	index_endscan(scan);
}

/*
 * Has the current time reached or passed *utmost?
 *
 * A NULL argument means there is no deadline, i.e. never elapsed.
 */
static bool
processing_time_elapsed(struct timeval *utmost)
{
	struct timeval now;

	if (utmost == NULL)
		return false;

	gettimeofday(&now, NULL);

	if (now.tv_sec < utmost->tv_sec)
		return false;

	if (now.tv_sec > utmost->tv_sec)
		return true;

	/* Seconds are equal; compare the sub-second part. */
	return now.tv_usec >= utmost->tv_usec;
}

/*
 * Convert tuple according to the map and free the original one.
 */
HeapTuple
convert_tuple_for_dest_table(HeapTuple tuple,
							 TupleConversionMapExt *conv_map)
{
	HeapTuple	orig = tuple;

	tuple = pg_rewrite_execute_attr_map_tuple(tuple, conv_map);
	pfree(orig);

	return tuple;
}

/* Register the callbacks of this logical decoding output plugin. */
void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
	AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit);

	cb->startup_cb = plugin_startup;
	cb->begin_cb = plugin_begin_txn;
	cb->change_cb = plugin_change;
	cb->commit_cb = plugin_commit_txn;
	cb->filter_by_origin_cb = plugin_filter;
	cb->shutdown_cb = plugin_shutdown;
}


/* initialize this plugin */
static void
plugin_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
			   bool is_init)
{
	ctx->output_plugin_private = NULL;

	/* Probably unnecessary, as we don't use the SQL interface ... */
	opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;

	if (ctx->output_plugin_options != NIL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("This plugin does not expect any options")));
	}
}

/* Shutdown callback - nothing to clean up. */
static void
plugin_shutdown(LogicalDecodingContext *ctx)
{
}

/*
 * As we don't release the slot during processing of particular table, there's
 * no room for SQL interface, even for debugging purposes. Therefore we need
 * neither OutputPluginPrepareWrite() nor OutputPluginWrite() in the plugin
 * callbacks. (Although we might want to write custom callbacks, this API
 * seems to be unnecessarily generic for our purposes.)
 */

/* BEGIN callback */
static void
plugin_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
}

/* COMMIT callback */
static void
plugin_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
				  XLogRecPtr commit_lsn)
{
}

/*
 * Callback for individual changed tuples
 */
static void
plugin_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
			  Relation relation, ReorderBufferChange *change)
{
	DecodingOutputState *dstate;

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/* Only interested in one particular relation. */
	if (relation->rd_id != dstate->relid)
		return;

	/*
	 * Decode entry depending on its type.
	 *
	 * Note: since PG 17, ReorderBufferChange stores HeapTuple directly;
	 * older versions wrap it in ReorderBufferTupleBuf - hence the #if blocks
	 * inside the ternary expressions below.
	 */
	switch (change->action)
	{
		case REORDER_BUFFER_CHANGE_INSERT:
			{
				HeapTuple	newtuple;

				newtuple = change->data.tp.newtuple != NULL ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.newtuple : NULL;
#else
					&change->data.tp.newtuple->tuple : NULL;
#endif

				/*
				 * Identity checks in the main function should have made this
				 * impossible.
				 */
				if (newtuple == NULL)
					elog(ERROR, "Incomplete insert info.");

				store_change(ctx, CHANGE_INSERT, newtuple);
			}
			break;
		case REORDER_BUFFER_CHANGE_UPDATE:
			{
				HeapTuple	oldtuple,
							newtuple;

				oldtuple = change->data.tp.oldtuple != NULL ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.oldtuple : NULL;
#else
					&change->data.tp.oldtuple->tuple : NULL;
#endif
				newtuple = change->data.tp.newtuple != NULL ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.newtuple : NULL;
#else
					&change->data.tp.newtuple->tuple : NULL;
#endif

				if (newtuple == NULL)
					elog(ERROR, "Incomplete update info.");

				/* The old version is only decoded if the identity changed. */
				if (oldtuple != NULL)
					store_change(ctx, CHANGE_UPDATE_OLD, oldtuple);

				store_change(ctx, CHANGE_UPDATE_NEW, newtuple);
			}
			break;
		case REORDER_BUFFER_CHANGE_DELETE:
			{
				HeapTuple	oldtuple;

				oldtuple = change->data.tp.oldtuple ?
#if PG_VERSION_NUM >= 170000
					change->data.tp.oldtuple : NULL;
#else
					&change->data.tp.oldtuple->tuple : NULL;
#endif

				if (oldtuple == NULL)
					elog(ERROR, "Incomplete delete info.");

				store_change(ctx, CHANGE_DELETE, oldtuple);
			}
			break;
		default:
			/* Should not come here */
			Assert(0);
			break;
	}
}

/*
 * Store concurrent data change: serialize the tuple together with the change
 * kind into a single-column bytea tuple and append it to the tuplestore.
 */
static void
store_change(LogicalDecodingContext *ctx, ConcurrentChangeKind kind,
			 HeapTuple tuple)
{
	DecodingOutputState *dstate;
	char	   *change_raw;
	ConcurrentChange *change;
	MemoryContext oldcontext;
	bool		flattened = false;
	Size		size;
	Datum		values[1];
	bool		isnull[1];
	char	   *dst;

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/*
	 * ReorderBufferCommit() stores the TOAST chunks in its private memory
	 * context and frees them after having called apply_change(). Therefore we
	 * need flat copy (including TOAST) that we eventually copy into the
	 * memory context which is available to
	 * pg_rewrite_decode_concurrent_changes().
	 */
	if (HeapTupleHasExternal(tuple))
	{
		/*
		 * toast_flatten_tuple_to_datum() might be more convenient but we
		 * don't want the decompression it does.
		 */
		tuple = toast_flatten_tuple(tuple, dstate->tupdesc_src);
		flattened = true;
	}

	size = MAXALIGN(VARHDRSZ) + sizeof(ConcurrentChange) + tuple->t_len;
	/* XXX Isn't there any function / macro to do this? */
	if (size >= 0x3FFFFFFF)
		elog(ERROR, "Change is too big.");

	/* Allocate in the decoding context so the change survives this call. */
	oldcontext = MemoryContextSwitchTo(ctx->context);
	change_raw = (char *) palloc(size);
	MemoryContextSwitchTo(oldcontext);

	SET_VARSIZE(change_raw, size);
	change = (ConcurrentChange *) VARDATA(change_raw);

	/*
	 * Copy the tuple.
	 *
	 * CAUTION: change->tup_data.t_data must be fixed on retrieval!
	 */
	memcpy(&change->tup_data, tuple, sizeof(HeapTupleData));
	dst = (char *) change + sizeof(ConcurrentChange);
	memcpy(dst, tuple->t_data, tuple->t_len);

	/* The other field. */
	change->kind = kind;

	/* The data has been copied. */
	if (flattened)
		pfree(tuple);

	/* Store as tuple of 1 bytea column. */
	values[0] = PointerGetDatum(change_raw);
	isnull[0] = false;
	tuplestore_putvalues(dstate->tstore, dstate->tupdesc_change,
						 values, isnull);

	/* Accounting. */
	dstate->nchanges++;

	/* Cleanup - the tuplestore made its own copy. */
	pfree(change_raw);
}

/*
 * Retrieve tuple from a change structure. As for the change, no alignment is
 * assumed.
 */
static HeapTuple
get_changed_tuple(ConcurrentChange *change)
{
	HeapTupleData tup_data;
	HeapTuple	result;
	char	   *src;

	/*
	 * Ensure alignment before accessing the fields. (This is why we can't use
	 * heap_copytuple() instead of this function.)
	 */
	memcpy(&tup_data, &change->tup_data, sizeof(HeapTupleData));

	/* Rebuild the tuple with header and data in one palloc'd chunk. */
	result = (HeapTuple) palloc(HEAPTUPLESIZE + tup_data.t_len);
	memcpy(result, &tup_data, sizeof(HeapTupleData));
	result->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE);
	src = (char *) change + sizeof(ConcurrentChange);
	memcpy(result->t_data, src, result->t_len);

	return result;
}

/*
 * A filter that recognizes changes produced by the initial load.
 */
static bool
plugin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id)
{
	DecodingOutputState *dstate;

	dstate = (DecodingOutputState *) ctx->output_writer_private;

	/* dstate is not initialized during decoding setup - should it be? */
	if (dstate && dstate->rorigin != InvalidRepOriginId &&
		origin_id == dstate->rorigin)
		return true;

	return false;
}