├── .gitignore ├── LICENSE ├── META.json ├── Makefile ├── README.md ├── expected ├── pg_sortstats.out └── pg_sortstats_12.out ├── include ├── pg_sortstats_import.h ├── pg_sortstats_import_pg10.h ├── pg_sortstats_import_pg11.h ├── pg_sortstats_import_pg12.h ├── pg_sortstats_import_pg13.h ├── pg_sortstats_import_pg14.h ├── pg_sortstats_import_pg9_4.h ├── pg_sortstats_import_pg9_5.h └── pg_sortstats_import_pg9_6.h ├── pg_sortstats--0.0.1.sql ├── pg_sortstats.c ├── pg_sortstats.control ├── pg_sortstats_import.c └── sql ├── pg_sortstats.sql └── pg_sortstats_12.sql /.gitignore: -------------------------------------------------------------------------------- 1 | .*.sw* 2 | *.o 3 | *.so 4 | *.zip 5 | *.bc 6 | *.gcno 7 | tags 8 | results/ 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018-2023, The PoWA-team 2 | 3 | Permission to use, copy, modify, and distribute this software and its 4 | documentation for any purpose, without fee, and without a written agreement is 5 | hereby granted, provided that the above copyright notice and this paragraph and 6 | the following two paragraphs appear in all copies. 7 | 8 | IN NO EVENT SHALL The PoWA-team BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 9 | SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING 10 | OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF The PoWA-team 11 | HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | 13 | The PoWA-team SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED 14 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 15 | PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND The 16 | PoWA-team HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 17 | ENHANCEMENTS, OR MODIFICATIONS.
18 | -------------------------------------------------------------------------------- /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pg_sortstats", 3 | "abstract": "An extension collecting statistics about sorts", 4 | "version": "__VERSION__", 5 | "maintainer": "Julien Rouhaud ", 6 | "license": "postgresql", 7 | "release_status": "stable", 8 | "provides": { 9 | "pg_sortstats": { 10 | "abstract": "An extension collecting statistics about sorts", 11 | "file": "pg_sortstats.sql", 12 | "docfile": "README.md", 13 | "version": "__VERSION__" 14 | } 15 | }, 16 | "meta-spec": { 17 | "version": "1.0.0", 18 | "url": "http://pgxn.org/meta/spec.txt" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = pg_sortstats 2 | EXTVERSION = $(shell grep default_version $(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/") 3 | REGRESS_OPTS = --inputdir=test 4 | REGRESS = pg_sortstats # Can be overridden later 5 | 6 | PG_CONFIG ?= pg_config 7 | 8 | MODULE_big = pg_sortstats 9 | OBJS = pg_sortstats.o pg_sortstats_import.o 10 | 11 | all: 12 | 13 | release-zip: all 14 | git archive --format zip --prefix=${EXTENSION}-$(EXTVERSION)/ --output ./${EXTENSION}-$(EXTVERSION).zip HEAD 15 | unzip ./${EXTENSION}-$(EXTVERSION).zip 16 | rm ./${EXTENSION}-$(EXTVERSION).zip 17 | sed -i -e "s/__VERSION__/$(EXTVERSION)/g" ./${EXTENSION}-$(EXTVERSION)/META.json 18 | zip -r ./${EXTENSION}-$(EXTVERSION).zip ./${EXTENSION}-$(EXTVERSION)/ 19 | rm ./${EXTENSION}-$(EXTVERSION) -rf 20 | 21 | 22 | DATA = $(wildcard *--*.sql) 23 | PGXS := $(shell $(PG_CONFIG) --pgxs) 24 | include $(PGXS) 25 | 26 | # Change the regression test for pg12+ 27 | ifneq ($(MAJORVERSION),$(filter $(MAJORVERSION), 9.2 9.3 9.4 9.5 9.6 10 11)) 28 | REGRESS = pg_sortstats_12 29 | endif 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pg_sortstats 2 | ============ 3 | 4 | **/!\ This extension is under development and not production ready. Use at 5 | your own risk.** 6 | 7 | PostgreSQL extension to gather and accumulate various statistics about sorts, and 8 | estimate how much work\_mem would be needed to perform the sort in memory. 9 | 10 | Statistics are aggregated per queryid (query identifier as computed by 11 | pg\_stat\_statements), userid, dbid and sort\_key (the textual representation 12 | of the sort being performed). 13 | 14 | pg\_stat\_statements is needed to provide the queryid field. 15 | 16 | Installation 17 | ============ 18 | 19 | Compiling 20 | --------- 21 | 22 | The module can be built using the standard PGXS infrastructure. For this to 23 | work, the ``pg_config`` program must be available in your $PATH. Installation 24 | instructions follow:: 25 | 26 | git clone https://github.com/powa-team/pg_sortstats.git 27 | cd pg_sortstats 28 | make 29 | make install 30 | 31 | NOTE: The "make install" part may require root privileges. 32 | 33 | PostgreSQL setup 34 | ---------------- 35 | 36 | The extension is now available. However, as it requires shared memory to hold 37 | its counters, the module must be loaded at PostgreSQL startup. Thus, you must 38 | add the module to ``shared_preload_libraries`` in your ``postgresql.conf``.
You 39 | need a server restart for the change to take effect. As this extension 40 | depends on pg_stat_statements, it also needs to be added to 41 | ``shared_preload_libraries``. 42 | 43 | Add the following parameter to your ``postgresql.conf``:: 44 | 45 | # postgresql.conf 46 | shared_preload_libraries = 'pg_stat_statements,pg_sortstats' 47 | 48 | Once your PostgreSQL cluster is restarted, you can install the extension in 49 | every database where you need to access the statistics:: 50 | 51 | mydb=# CREATE EXTENSION pg_sortstats; 52 | 53 | Usage 54 | ----- 55 | 56 | The `pg_sortstats` view provides the following fields: 57 | 58 | | fieldname | description | 59 | |-----------------|----------------------------------------------------------------------------------| 60 | | queryid | pg_stat_statements' queryid | 61 | | userid | user identifier | 62 | | dbid | database identifier | 63 | | sort_key | the textual sort expression | 64 | | lines | total number of lines the sort node received as input | 65 | | lines_to_sort | total number of lines the sort node had to sort (differs when there's a LIMIT) | 66 | | work_mems | total work_mem, in kB, estimated as needed to perform the sorts in memory | 67 | | topn_sorts | total number of sorts done using the Top-N heapsort algorithm | 68 | | quicksorts | total number of sorts done using the quicksort algorithm | 69 | | external_sorts | total number of sorts done using the external sort algorithm | 70 | | external_merges | total number of sorts done using the external merge algorithm | 71 | | nb_tapes | total number of tapes used for external merge sorts | 72 | | space_disk | total disk space, in kB, used to perform the sorts | 73 | | space_memory | total memory, in kB, used to perform the sorts | 74 | | non_parallels | total number of sorts not done in parallel | 75 | | nb_workers | total number of worker processes used to perform the sorts | 76 | 77 | The `pg_sortstats(showtext)` function can be used instead, passing **false** as 78 | parameter if you don't need the sort_key field. 79 | 80 | The `pg_sortstats_reset()` function can be used to remove all stored 81 | statistics.
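As an illustration, here is a minimal query against the view described above (column names are taken from the field list; adapt as needed), listing the queries whose sorts most often spilled to disk, together with the cumulated work_mem estimation::

    mydb=# SELECT queryid, sort_key,
                  external_sorts + external_merges AS disk_sorts,
                  work_mems
           FROM pg_sortstats
           ORDER BY disk_sorts DESC
           LIMIT 10;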
82 | -------------------------------------------------------------------------------- /expected/pg_sortstats.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | SELECT pg_sortstats_reset(); 3 | pg_sortstats_reset 4 | -------------------- 5 | 6 | (1 row) 7 | 8 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 9 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 10 | VACUUM ANALYZE sorts; 11 | SET work_mem = '64kB'; 12 | WITH src AS ( 13 | SELECT * FROM sorts ORDER BY val, id DESC 14 | ) 15 | SELECT * FROM src LIMIT 1; 16 | id | val 17 | ----+-------- 18 | 1 | line 1 19 | (1 row) 20 | 21 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 22 | id | val 23 | --------+------------- 24 | 100000 | line 100000 25 | (1 row) 26 | 27 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 28 | work_mems < (12 * 1024) AS "exp_less_12MB", 29 | topn_sorts, quicksorts, external_sorts, external_merges, 30 | nb_tapes > 2 AS multiple_tapes, 31 | space_disk > 1024 AS "disk_more_1MB", 32 | space_memory > 1024 AS "mem_more_1MB", 33 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 34 | FROM pg_sortstats(true) ORDER BY nb_keys; 35 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 36 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 37 | 1 | id DESC | 100000 | 1 | t | 1 | 0 | 0 | 0 | f | f | f | 1 | 0 38 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 0 | 0 | 1 | t | t | f | 1 | 0 39 | (2 rows) 40 | 41 | SELECT pg_sortstats_reset(); 42 | pg_sortstats_reset 43 | -------------------- 44 | 45 | (1 row) 46 | 47 | SET work_mem = '12MB'; 48 | WITH src AS ( 49 | SELECT * FROM sorts ORDER BY val, id DESC 50 | ) 51 | SELECT * FROM src LIMIT 1; 52 | id | val 53 | ----+-------- 54 | 1 | line 1 55 | (1 row) 56 | 57 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 58 | work_mems < (12 * 1024) AS "exp_less_12MB", 59 | topn_sorts, quicksorts, external_sorts, external_merges, 60 | nb_tapes > 2 AS multiple_tapes, 61 | space_disk > 1024 AS "disk_more_1MB", 62 | space_memory > 1024 AS "mem_more_1MB", 63 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 64 | FROM pg_sortstats(true) ORDER BY nb_keys; 65 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 66 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 67 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 1 | 0 | 0 | f | f | t | 1 | 0 68 | (1 row) 69 | 70 | SELECT pg_sortstats_reset(); 71 | pg_sortstats_reset 72 | -------------------- 73 | 74 | (1 row) 75 | 76 | -------------------------------------------------------------------------------- /expected/pg_sortstats_12.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | SELECT pg_sortstats_reset(); 3 | pg_sortstats_reset 4 | -------------------- 5 | 6 | (1 row) 7 
| 8 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 9 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 10 | VACUUM ANALYZE sorts; 11 | SET work_mem = '64kB'; 12 | WITH src AS MATERIALIZED ( 13 | SELECT * FROM sorts ORDER BY val, id DESC 14 | ) 15 | SELECT * FROM src LIMIT 1; 16 | id | val 17 | ----+-------- 18 | 1 | line 1 19 | (1 row) 20 | 21 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 22 | id | val 23 | --------+------------- 24 | 100000 | line 100000 25 | (1 row) 26 | 27 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 28 | work_mems < (12 * 1024) AS "exp_less_12MB", 29 | topn_sorts, quicksorts, external_sorts, external_merges, 30 | nb_tapes > 2 AS multiple_tapes, 31 | space_disk > 1024 AS "disk_more_1MB", 32 | space_memory > 1024 AS "mem_more_1MB", 33 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 34 | FROM pg_sortstats(true) ORDER BY nb_keys; 35 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 36 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 37 | 1 | id DESC | 100000 | 1 | t | 1 | 0 | 0 | 0 | f | f | f | 1 | 0 38 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 0 | 0 | 1 | t | t | f | 1 | 0 39 | (2 rows) 40 | 41 | SELECT pg_sortstats_reset(); 42 | pg_sortstats_reset 43 | -------------------- 44 | 45 | (1 row) 46 | 47 | SET work_mem = '12MB'; 48 | WITH src AS MATERIALIZED ( 49 | SELECT * FROM sorts ORDER BY val, id DESC 50 | ) 51 | SELECT * FROM src LIMIT 1; 52 | id | val 53 | ----+-------- 54 | 1 | line 1 55 | (1 row) 56 | 57 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 58 | work_mems < (12 * 1024) AS "exp_less_12MB", 59 | topn_sorts, quicksorts, external_sorts, external_merges, 60 | nb_tapes > 2 AS multiple_tapes, 61 | space_disk > 1024 AS "disk_more_1MB", 62 | space_memory > 1024 AS "mem_more_1MB", 63 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 64 | FROM pg_sortstats(true) ORDER BY nb_keys; 65 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 66 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 67 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 1 | 0 | 0 | f | f | t | 1 | 0 68 | (1 row) 69 | 70 | SELECT pg_sortstats_reset(); 71 | pg_sortstats_reset 72 | -------------------- 73 | 74 | (1 row) 75 | 76 | -------------------------------------------------------------------------------- /include/pg_sortstats_import.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_h 2 | #define PG_SORTSTATS_IMPORT_h 3 | 4 | #include "nodes/execnodes.h" 5 | #include "utils/logtape.h" 6 | #if PG_VERSION_NUM < 90500 7 | #include "lib/stringinfo.h" 8 | #endif 9 | 10 | #if PG_VERSION_NUM >= 90400 && PG_VERSION_NUM < 90500 11 | #include "include/pg_sortstats_import_pg9_4.h" 12 | #elif PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600 13 | #include 
"include/pg_sortstats_import_pg9_5.h" 14 | #elif PG_VERSION_NUM >= 90600 && PG_VERSION_NUM < 100000 15 | #include "include/pg_sortstats_import_pg9_6.h" 16 | #elif PG_VERSION_NUM >= 100000 && PG_VERSION_NUM < 110000 17 | #include "include/pg_sortstats_import_pg10.h" 18 | #elif PG_VERSION_NUM >= 110000 && PG_VERSION_NUM < 120000 19 | #include "include/pg_sortstats_import_pg11.h" 20 | #elif PG_VERSION_NUM >= 120000 && PG_VERSION_NUM < 130000 21 | #include "include/pg_sortstats_import_pg12.h" 22 | #elif PG_VERSION_NUM >= 130000 && PG_VERSION_NUM < 140000 23 | #include "include/pg_sortstats_import_pg13.h" 24 | #elif PG_VERSION_NUM >= 140000 && PG_VERSION_NUM < 150000 25 | #include "include/pg_sortstats_import_pg14.h" 26 | #else 27 | #error "PostgreSQL version not supported" 28 | #endif 29 | 30 | #if PG_VERSION_NUM < 140000 31 | #define ParallelLeaderBackendId ParallelMasterBackendId 32 | #endif 33 | 34 | /* 35 | * Import some define that are stable enough so that we don't need a 36 | * per-major-version definition 37 | */ 38 | #define PGSRT_ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */ 39 | #define PGSRT_ALLOCSET_NUM_FREELISTS 11 40 | #define PGSRT_ALLOC_CHUNK_LIMIT (1 << (PGSRT_ALLOCSET_NUM_FREELISTS-1+PGSRT_ALLOC_MINBITS)) 41 | 42 | bool pgsrt_PreScanNode(PlanState *planstate, Bitmapset **rels_used); 43 | void pgsrt_show_sortorder_options(StringInfo buf, Node *sortexpr, 44 | Oid sortOperator, Oid collation, bool nullsFirst); 45 | 46 | #if PG_VERSION_NUM < 90600 47 | 48 | bool planstate_tree_walker(PlanState *planstate, 49 | bool (*walker) (), 50 | void *context); 51 | 52 | bool planstate_walk_subplans(List *plans, 53 | bool (*walker) (), 54 | void *context); 55 | 56 | 57 | bool planstate_walk_members(PlanState **planstates, int nplans, 58 | bool (*walker) (), void *context); 59 | 60 | #endif /* PG_VERSION_NUM < 90600 */ 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg10.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG10_H 2 | #define PG_SORTSTATS_IMPORT_PG10_H 3 | 4 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 5 | 6 | /* 7 | * AllocChunk 8 | * The prefix of each piece of memory in an AllocBlock 9 | * 10 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 11 | */ 12 | typedef struct pgsrt_AllocChunkData 13 | { 14 | /* aset is the owning aset if allocated, or the freelist link if free */ 15 | void *aset; 16 | /* size is always the size of the usable space in the chunk */ 17 | Size size; 18 | #ifdef MEMORY_CONTEXT_CHECKING 19 | /* when debugging memory usage, also store actual requested size */ 20 | /* this is zero in a free chunk */ 21 | Size requested_size; 22 | #endif 23 | } pgsrt_AllocChunkData; 24 | 25 | 26 | #define SLAB_SLOT_SIZE 1024 27 | typedef union SlabSlot 28 | { 29 | union SlabSlot *nextfree; 30 | char buffer[SLAB_SLOT_SIZE]; 31 | } SlabSlot; 32 | 33 | typedef struct 34 | { 35 | void *tuple; /* the tuple itself */ 36 | Datum datum1; /* value of first key column */ 37 | bool isnull1; /* is first key column NULL? 
*/ 38 | int tupindex; /* see notes above */ 39 | } SortTuple; 40 | 41 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 42 | Tuplesortstate *state); 43 | 44 | typedef enum 45 | { 46 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 47 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 48 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 49 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 50 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 51 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 52 | } TupSortStatus; 53 | 54 | typedef struct pgsrt_Tuplesortstate 55 | { 56 | TupSortStatus status; /* enumerated value as shown above */ 57 | int nKeys; /* number of columns in sort key */ 58 | bool randomAccess; /* did caller request random access? */ 59 | bool bounded; /* did caller specify a maximum number of 60 | * tuples to return? */ 61 | bool boundUsed; /* true if we made use of a bounded heap */ 62 | int bound; /* if bounded, the maximum number of tuples */ 63 | bool tuples; /* Can SortTuple.tuple ever be set? */ 64 | int64 availMem; /* remaining memory available, in bytes */ 65 | int64 allowedMem; /* total memory allowed, in bytes */ 66 | int maxTapes; /* number of tapes (Knuth's T) */ 67 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 68 | MemoryContext sortcontext; /* memory context holding most sort data */ 69 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 70 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 71 | 72 | /* 73 | * These function pointers decouple the routines that must know what kind 74 | * of tuple we are sorting from the routines that don't need to know it. 75 | * They are set up by the tuplesort_begin_xxx routines. 76 | * 77 | * Function to compare two tuples; result is per qsort() convention, ie: 78 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 79 | * qsort_arg_comparator. 80 | */ 81 | SortTupleComparator comparetup; 82 | 83 | /* 84 | * Function to copy a supplied input tuple into palloc'd space and set up 85 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 86 | * state->availMem must be decreased by the amount of space used for the 87 | * tuple copy (note the SortTuple struct itself is not counted). 88 | */ 89 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 90 | 91 | /* 92 | * Function to write a stored tuple onto tape. The representation of the 93 | * tuple on tape need not be the same as it is in memory; requirements on 94 | * the tape representation are given below. Unless the slab allocator is 95 | * used, after writing the tuple, pfree() the out-of-line data (not the 96 | * SortTuple struct!), and increase state->availMem by the amount of 97 | * memory space thereby released. 98 | */ 99 | void (*writetup) (Tuplesortstate *state, int tapenum, 100 | SortTuple *stup); 101 | 102 | /* 103 | * Function to read a stored tuple from tape back into memory. 'len' is 104 | * the already-read length of the stored tuple. The tuple is allocated 105 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 106 | */ 107 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 108 | int tapenum, unsigned int len); 109 | 110 | /* 111 | * This array holds the tuples now in sort memory.
If we are in state 112 | * INITIAL, the tuples are in no particular order; if we are in state 113 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 114 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 115 | * H. In state SORTEDONTAPE, the array is not used. 116 | */ 117 | SortTuple *memtuples; /* array of SortTuple structs */ 118 | int memtupcount; /* number of tuples currently present */ 119 | int memtupsize; /* allocated length of memtuples array */ 120 | bool growmemtuples; /* memtuples' growth still underway? */ 121 | 122 | /* 123 | * Memory for tuples is sometimes allocated using a simple slab allocator, 124 | * rather than with palloc(). Currently, we switch to slab allocation 125 | * when we start merging. Merging only needs to keep a small, fixed 126 | * number of tuples in memory at any time, so we can avoid the 127 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 128 | * to hold the tuples. 129 | * 130 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 131 | * slots. The allocation is sized to have one slot per tape, plus one 132 | * additional slot. We need that many slots to hold all the tuples kept 133 | * in the heap during merge, plus the one we have last returned from the 134 | * sort, with tuplesort_gettuple. 135 | * 136 | * Initially, all the slots are kept in a linked list of free slots. When 137 | * a tuple is read from a tape, it is put to the next available slot, if 138 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 139 | * instead. 140 | * 141 | * When we're done processing a tuple, we return the slot back to the free 142 | * list, or pfree() if it was palloc'd. We know that a tuple was 143 | * allocated from the slab, if its pointer value is between 144 | * slabMemoryBegin and -End. 145 | * 146 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 147 | * tracking memory usage is not used. 148 | */ 149 | bool slabAllocatorUsed; 150 | 151 | char *slabMemoryBegin; /* beginning of slab memory arena */ 152 | char *slabMemoryEnd; /* end of slab memory arena */ 153 | SlabSlot *slabFreeHead; /* head of free list */ 154 | 155 | /* Buffer size to use for reading input tapes, during merge. */ 156 | size_t read_buffer_size; 157 | 158 | /* 159 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 160 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 161 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 162 | * recycle the memory on next gettuple call. 163 | */ 164 | void *lastReturnedTuple; 165 | 166 | /* 167 | * While building initial runs, this indicates if the replacement 168 | * selection strategy is in use. When it isn't, then a simple hybrid 169 | * sort-merge strategy is in use instead (runs are quicksorted). 170 | */ 171 | bool replaceActive; 172 | 173 | /* 174 | * While building initial runs, this is the current output run number 175 | * (starting at RUN_FIRST). Afterwards, it is the number of initial runs 176 | * we made. 
177 | */ 178 | int currentRun; 179 | } pgsrt_Tuplesortstate; 180 | 181 | #endif /* PG_SORTSTATS_IMPORT_PG10_H */ 182 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg11.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG11_H 2 | #define PG_SORTSTATS_IMPORT_PG11_H 3 | 4 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 5 | 6 | /* 7 | * AllocChunk 8 | * The prefix of each piece of memory in an AllocBlock 9 | * 10 | * Note: to meet the memory context APIs, the payload area of the chunk must 11 | * be maxaligned, and the "aset" link must be immediately adjacent to the 12 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 13 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 14 | * we can ensure things work by adding any required alignment padding before 15 | * the "aset" field. There is a static assertion below that the alignment 16 | * is done correctly. 17 | */ 18 | typedef struct pgsrt_AllocChunkData 19 | { 20 | /* size is always the size of the usable space in the chunk */ 21 | Size size; 22 | #ifdef MEMORY_CONTEXT_CHECKING 23 | /* when debugging memory usage, also store actual requested size */ 24 | /* this is zero in a free chunk */ 25 | Size requested_size; 26 | 27 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 28 | #else 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 30 | #endif /* MEMORY_CONTEXT_CHECKING */ 31 | 32 | /* ensure proper alignment by adding padding if needed */ 33 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 34 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 35 | #endif 36 | 37 | /* aset is the owning aset if allocated, or the freelist link if free */ 38 | void *aset; 39 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 40 | } pgsrt_AllocChunkData; 41 | 42 | 43 | #define SLAB_SLOT_SIZE 1024 44 | typedef union SlabSlot 45 | { 46 | union SlabSlot *nextfree; 47 | char buffer[SLAB_SLOT_SIZE]; 48 | } SlabSlot; 49 | 50 | typedef struct 51 | { 52 | void *tuple; /* the tuple itself */ 53 | Datum datum1; /* value of first key column */ 54 | bool isnull1; /* is first key column NULL? */ 55 | int tupindex; /* see notes above */ 56 | } SortTuple; 57 | 58 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 59 | Tuplesortstate *state); 60 | 61 | typedef enum 62 | { 63 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 64 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 65 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 66 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 67 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 68 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 69 | } TupSortStatus; 70 | 71 | typedef struct pgsrt_Tuplesortstate 72 | { 73 | TupSortStatus status; /* enumerated value as shown above */ 74 | int nKeys; /* number of columns in sort key */ 75 | bool randomAccess; /* did caller request random access? */ 76 | bool bounded; /* did caller specify a maximum number of 77 | * tuples to return? */ 78 | bool boundUsed; /* true if we made use of a bounded heap */ 79 | int bound; /* if bounded, the maximum number of tuples */ 80 | bool tuples; /* Can SortTuple.tuple ever be set? 
*/ 81 | int64 availMem; /* remaining memory available, in bytes */ 82 | int64 allowedMem; /* total memory allowed, in bytes */ 83 | int maxTapes; /* number of tapes (Knuth's T) */ 84 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 85 | MemoryContext sortcontext; /* memory context holding most sort data */ 86 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 87 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 88 | 89 | /* 90 | * These function pointers decouple the routines that must know what kind 91 | * of tuple we are sorting from the routines that don't need to know it. 92 | * They are set up by the tuplesort_begin_xxx routines. 93 | * 94 | * Function to compare two tuples; result is per qsort() convention, ie: 95 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 96 | * qsort_arg_comparator. 97 | */ 98 | SortTupleComparator comparetup; 99 | 100 | /* 101 | * Function to copy a supplied input tuple into palloc'd space and set up 102 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 103 | * state->availMem must be decreased by the amount of space used for the 104 | * tuple copy (note the SortTuple struct itself is not counted). 105 | */ 106 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 107 | 108 | /* 109 | * Function to write a stored tuple onto tape. The representation of the 110 | * tuple on tape need not be the same as it is in memory; requirements on 111 | * the tape representation are given below. Unless the slab allocator is 112 | * used, after writing the tuple, pfree() the out-of-line data (not the 113 | * SortTuple struct!), and increase state->availMem by the amount of 114 | * memory space thereby released. 115 | */ 116 | void (*writetup) (Tuplesortstate *state, int tapenum, 117 | SortTuple *stup); 118 | 119 | /* 120 | * Function to read a stored tuple from tape back into memory. 'len' is 121 | * the already-read length of the stored tuple. The tuple is allocated 122 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 123 | */ 124 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 125 | int tapenum, unsigned int len); 126 | 127 | /* 128 | * This array holds the tuples now in sort memory. If we are in state 129 | * INITIAL, the tuples are in no particular order; if we are in state 130 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 131 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 132 | * H. In state SORTEDONTAPE, the array is not used. 133 | */ 134 | SortTuple *memtuples; /* array of SortTuple structs */ 135 | int memtupcount; /* number of tuples currently present */ 136 | int memtupsize; /* allocated length of memtuples array */ 137 | bool growmemtuples; /* memtuples' growth still underway? */ 138 | 139 | /* 140 | * Memory for tuples is sometimes allocated using a simple slab allocator, 141 | * rather than with palloc(). Currently, we switch to slab allocation 142 | * when we start merging. Merging only needs to keep a small, fixed 143 | * number of tuples in memory at any time, so we can avoid the 144 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 145 | * to hold the tuples. 146 | * 147 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 148 | * slots. The allocation is sized to have one slot per tape, plus one 149 | * additional slot.
We need that many slots to hold all the tuples kept 150 | * in the heap during merge, plus the one we have last returned from the 151 | * sort, with tuplesort_gettuple. 152 | * 153 | * Initially, all the slots are kept in a linked list of free slots. When 154 | * a tuple is read from a tape, it is put to the next available slot, if 155 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 156 | * instead. 157 | * 158 | * When we're done processing a tuple, we return the slot back to the free 159 | * list, or pfree() if it was palloc'd. We know that a tuple was 160 | * allocated from the slab, if its pointer value is between 161 | * slabMemoryBegin and -End. 162 | * 163 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 164 | * tracking memory usage is not used. 165 | */ 166 | bool slabAllocatorUsed; 167 | 168 | char *slabMemoryBegin; /* beginning of slab memory arena */ 169 | char *slabMemoryEnd; /* end of slab memory arena */ 170 | SlabSlot *slabFreeHead; /* head of free list */ 171 | 172 | /* Buffer size to use for reading input tapes, during merge. */ 173 | size_t read_buffer_size; 174 | 175 | /* 176 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 177 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 178 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 179 | * recycle the memory on next gettuple call. 180 | */ 181 | void *lastReturnedTuple; 182 | 183 | /* 184 | * While building initial runs, this is the current output run number. 185 | * Afterwards, it is the number of initial runs we made. 186 | */ 187 | int currentRun; 188 | } pgsrt_Tuplesortstate; 189 | 190 | #endif /* PG_SORTSTATS_IMPORT_PG11_H */ 191 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg12.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG12_H 2 | #define PG_SORTSTATS_IMPORT_PG12_H 3 | 4 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 5 | 6 | /* 7 | * AllocChunk 8 | * The prefix of each piece of memory in an AllocBlock 9 | * 10 | * Note: to meet the memory context APIs, the payload area of the chunk must 11 | * be maxaligned, and the "aset" link must be immediately adjacent to the 12 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 13 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 14 | * we can ensure things work by adding any required alignment padding before 15 | * the "aset" field. There is a static assertion below that the alignment 16 | * is done correctly. 
17 | */ 18 | typedef struct pgsrt_AllocChunkData 19 | { 20 | /* size is always the size of the usable space in the chunk */ 21 | Size size; 22 | #ifdef MEMORY_CONTEXT_CHECKING 23 | /* when debugging memory usage, also store actual requested size */ 24 | /* this is zero in a free chunk */ 25 | Size requested_size; 26 | 27 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 28 | #else 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 30 | #endif /* MEMORY_CONTEXT_CHECKING */ 31 | 32 | /* ensure proper alignment by adding padding if needed */ 33 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 34 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 35 | #endif 36 | 37 | /* aset is the owning aset if allocated, or the freelist link if free */ 38 | void *aset; 39 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 40 | } pgsrt_AllocChunkData; 41 | 42 | 43 | #define SLAB_SLOT_SIZE 1024 44 | typedef union SlabSlot 45 | { 46 | union SlabSlot *nextfree; 47 | char buffer[SLAB_SLOT_SIZE]; 48 | } SlabSlot; 49 | 50 | typedef struct 51 | { 52 | void *tuple; /* the tuple itself */ 53 | Datum datum1; /* value of first key column */ 54 | bool isnull1; /* is first key column NULL? */ 55 | int tupindex; /* see notes above */ 56 | } SortTuple; 57 | 58 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 59 | Tuplesortstate *state); 60 | typedef enum 61 | { 62 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 63 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 64 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 65 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 66 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 67 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 68 | } TupSortStatus; 69 | 70 | typedef struct pgsrt_Tuplesortstate 71 | { 72 | TupSortStatus status; /* enumerated value as shown above */ 73 | int nKeys; /* number of columns in sort key */ 74 | bool randomAccess; /* did caller request random access? */ 75 | bool bounded; /* did caller specify a maximum number of 76 | * tuples to return? */ 77 | bool boundUsed; /* true if we made use of a bounded heap */ 78 | int bound; /* if bounded, the maximum number of tuples */ 79 | bool tuples; /* Can SortTuple.tuple ever be set? */ 80 | int64 availMem; /* remaining memory available, in bytes */ 81 | int64 allowedMem; /* total memory allowed, in bytes */ 82 | int maxTapes; /* number of tapes (Knuth's T) */ 83 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 84 | MemoryContext sortcontext; /* memory context holding most sort data */ 85 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 86 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 87 | 88 | /* 89 | * These function pointers decouple the routines that must know what kind 90 | * of tuple we are sorting from the routines that don't need to know it. 91 | * They are set up by the tuplesort_begin_xxx routines. 92 | * 93 | * Function to compare two tuples; result is per qsort() convention, ie: 94 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 95 | * qsort_arg_comparator. 96 | */ 97 | SortTupleComparator comparetup; 98 | 99 | /* 100 | * Function to copy a supplied input tuple into palloc'd space and set up 101 | * its SortTuple representation (ie, set tuple/datum1/isnull1).
Also, 102 | * state->availMem must be decreased by the amount of space used for the 103 | * tuple copy (note the SortTuple struct itself is not counted). 104 | */ 105 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 106 | 107 | /* 108 | * Function to write a stored tuple onto tape. The representation of the 109 | * tuple on tape need not be the same as it is in memory; requirements on 110 | * the tape representation are given below. Unless the slab allocator is 111 | * used, after writing the tuple, pfree() the out-of-line data (not the 112 | * SortTuple struct!), and increase state->availMem by the amount of 113 | * memory space thereby released. 114 | */ 115 | void (*writetup) (Tuplesortstate *state, int tapenum, 116 | SortTuple *stup); 117 | 118 | /* 119 | * Function to read a stored tuple from tape back into memory. 'len' is 120 | * the already-read length of the stored tuple. The tuple is allocated 121 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 122 | */ 123 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 124 | int tapenum, unsigned int len); 125 | 126 | /* 127 | * This array holds the tuples now in sort memory. If we are in state 128 | * INITIAL, the tuples are in no particular order; if we are in state 129 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 130 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 131 | * H. In state SORTEDONTAPE, the array is not used. 132 | */ 133 | SortTuple *memtuples; /* array of SortTuple structs */ 134 | int memtupcount; /* number of tuples currently present */ 135 | int memtupsize; /* allocated length of memtuples array */ 136 | bool growmemtuples; /* memtuples' growth still underway? */ 137 | 138 | /* 139 | * Memory for tuples is sometimes allocated using a simple slab allocator, 140 | * rather than with palloc(). Currently, we switch to slab allocation 141 | * when we start merging. Merging only needs to keep a small, fixed 142 | * number of tuples in memory at any time, so we can avoid the 143 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 144 | * to hold the tuples. 145 | * 146 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 147 | * slots. The allocation is sized to have one slot per tape, plus one 148 | * additional slot. We need that many slots to hold all the tuples kept 149 | * in the heap during merge, plus the one we have last returned from the 150 | * sort, with tuplesort_gettuple. 151 | * 152 | * Initially, all the slots are kept in a linked list of free slots. When 153 | * a tuple is read from a tape, it is put to the next available slot, if 154 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 155 | * instead. 156 | * 157 | * When we're done processing a tuple, we return the slot back to the free 158 | * list, or pfree() if it was palloc'd. We know that a tuple was 159 | * allocated from the slab, if its pointer value is between 160 | * slabMemoryBegin and -End. 161 | * 162 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 163 | * tracking memory usage is not used. 164 | */ 165 | bool slabAllocatorUsed; 166 | 167 | char *slabMemoryBegin; /* beginning of slab memory arena */ 168 | char *slabMemoryEnd; /* end of slab memory arena */ 169 | SlabSlot *slabFreeHead; /* head of free list */ 170 | 171 | /* Buffer size to use for reading input tapes, during merge. 
*/ 172 | size_t read_buffer_size; 173 | 174 | /* 175 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 176 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 177 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 178 | * recycle the memory on next gettuple call. 179 | */ 180 | void *lastReturnedTuple; 181 | 182 | /* 183 | * While building initial runs, this is the current output run number. 184 | * Afterwards, it is the number of initial runs we made. 185 | */ 186 | int currentRun; 187 | } pgsrt_Tuplesortstate; 188 | 189 | #endif /* PG_SORTSTATS_IMPORT_PG12_H */ 190 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg13.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG13_H 2 | #define PG_SORTSTATS_IMPORT_PG13_H 3 | 4 | #include "utils/pg_rusage.h" 5 | 6 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 7 | 8 | /* 9 | * AllocChunk 10 | * The prefix of each piece of memory in an AllocBlock 11 | * 12 | * Note: to meet the memory context APIs, the payload area of the chunk must 13 | * be maxaligned, and the "aset" link must be immediately adjacent to the 14 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 15 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 16 | * we can ensure things work by adding any required alignment padding before 17 | * the "aset" field. There is a static assertion below that the alignment 18 | * is done correctly. 19 | */ 20 | typedef struct pgsrt_AllocChunkData 21 | { 22 | /* size is always the size of the usable space in the chunk */ 23 | Size size; 24 | #ifdef MEMORY_CONTEXT_CHECKING 25 | /* when debugging memory usage, also store actual requested size */ 26 | /* this is zero in a free chunk */ 27 | Size requested_size; 28 | 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 30 | #else 31 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 32 | #endif /* MEMORY_CONTEXT_CHECKING */ 33 | 34 | /* ensure proper alignment by adding padding if needed */ 35 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 36 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 37 | #endif 38 | 39 | /* aset is the owning aset if allocated, or the freelist link if free */ 40 | void *aset; 41 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 42 | } pgsrt_AllocChunkData; 43 | 44 | 45 | #define SLAB_SLOT_SIZE 1024 46 | typedef union SlabSlot 47 | { 48 | union SlabSlot *nextfree; 49 | char buffer[SLAB_SLOT_SIZE]; 50 | } SlabSlot; 51 | 52 | typedef struct 53 | { 54 | void *tuple; /* the tuple itself */ 55 | Datum datum1; /* value of first key column */ 56 | bool isnull1; /* is first key column NULL? 
*/ 57 | int tupindex; /* see notes above */ 58 | } SortTuple; 59 | 60 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 61 | Tuplesortstate *state); 62 | typedef enum 63 | { 64 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 65 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 66 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 67 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 68 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 69 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 70 | } TupSortStatus; 71 | 72 | typedef struct pgsrt_Tuplesortstate 73 | { 74 | TupSortStatus status; /* enumerated value as shown above */ 75 | int nKeys; /* number of columns in sort key */ 76 | bool randomAccess; /* did caller request random access? */ 77 | bool bounded; /* did caller specify a maximum number of 78 | * tuples to return? */ 79 | bool boundUsed; /* true if we made use of a bounded heap */ 80 | int bound; /* if bounded, the maximum number of tuples */ 81 | bool tuples; /* Can SortTuple.tuple ever be set? */ 82 | int64 availMem; /* remaining memory available, in bytes */ 83 | int64 allowedMem; /* total memory allowed, in bytes */ 84 | int maxTapes; /* number of tapes (Knuth's T) */ 85 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 86 | int64 maxSpace; /* maximum amount of space occupied among sort 87 | * of groups, either in-memory or on-disk */ 88 | bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk 89 | * space, false when it's value for in-memory 90 | * space */ 91 | TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ 92 | MemoryContext maincontext; /* memory context for tuple sort metadata that 93 | * persists across multiple batches */ 94 | MemoryContext sortcontext; /* memory context holding most sort data */ 95 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 96 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 97 | 98 | /* 99 | * These function pointers decouple the routines that must know what kind 100 | * of tuple we are sorting from the routines that don't need to know it. 101 | * They are set up by the tuplesort_begin_xxx routines. 102 | * 103 | * Function to compare two tuples; result is per qsort() convention, ie: 104 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 105 | * qsort_arg_comparator. 106 | */ 107 | SortTupleComparator comparetup; 108 | 109 | /* 110 | * Function to copy a supplied input tuple into palloc'd space and set up 111 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 112 | * state->availMem must be decreased by the amount of space used for the 113 | * tuple copy (note the SortTuple struct itself is not counted). 114 | */ 115 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 116 | 117 | /* 118 | * Function to write a stored tuple onto tape. The representation of the 119 | * tuple on tape need not be the same as it is in memory; requirements on 120 | * the tape representation are given below. Unless the slab allocator is 121 | * used, after writing the tuple, pfree() the out-of-line data (not the 122 | * SortTuple struct!), and increase state->availMem by the amount of 123 | * memory space thereby released. 124 | */ 125 | void (*writetup) (Tuplesortstate *state, int tapenum, 126 | SortTuple *stup); 127 | 128 | /* 129 | * Function to read a stored tuple from tape back into memory. 'len' is 130 | * the already-read length of the stored tuple.
The tuple is allocated 131 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 132 | */ 133 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 134 | int tapenum, unsigned int len); 135 | 136 | /* 137 | * This array holds the tuples now in sort memory. If we are in state 138 | * INITIAL, the tuples are in no particular order; if we are in state 139 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 140 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 141 | * H. In state SORTEDONTAPE, the array is not used. 142 | */ 143 | SortTuple *memtuples; /* array of SortTuple structs */ 144 | int memtupcount; /* number of tuples currently present */ 145 | int memtupsize; /* allocated length of memtuples array */ 146 | bool growmemtuples; /* memtuples' growth still underway? */ 147 | 148 | /* 149 | * Memory for tuples is sometimes allocated using a simple slab allocator, 150 | * rather than with palloc(). Currently, we switch to slab allocation 151 | * when we start merging. Merging only needs to keep a small, fixed 152 | * number of tuples in memory at any time, so we can avoid the 153 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 154 | * to hold the tuples. 155 | * 156 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 157 | * slots. The allocation is sized to have one slot per tape, plus one 158 | * additional slot. We need that many slots to hold all the tuples kept 159 | * in the heap during merge, plus the one we have last returned from the 160 | * sort, with tuplesort_gettuple. 161 | * 162 | * Initially, all the slots are kept in a linked list of free slots. When 163 | * a tuple is read from a tape, it is put to the next available slot, if 164 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 165 | * instead. 166 | * 167 | * When we're done processing a tuple, we return the slot back to the free 168 | * list, or pfree() if it was palloc'd. We know that a tuple was 169 | * allocated from the slab, if its pointer value is between 170 | * slabMemoryBegin and -End. 171 | * 172 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 173 | * tracking memory usage is not used. 174 | */ 175 | bool slabAllocatorUsed; 176 | 177 | char *slabMemoryBegin; /* beginning of slab memory arena */ 178 | char *slabMemoryEnd; /* end of slab memory arena */ 179 | SlabSlot *slabFreeHead; /* head of free list */ 180 | 181 | /* Buffer size to use for reading input tapes, during merge. */ 182 | size_t read_buffer_size; 183 | 184 | /* 185 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 186 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 187 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 188 | * recycle the memory on next gettuple call. 189 | */ 190 | void *lastReturnedTuple; 191 | 192 | /* 193 | * While building initial runs, this is the current output run number. 194 | * Afterwards, it is the number of initial runs we made. 195 | */ 196 | int currentRun; 197 | 198 | /* 199 | * Unless otherwise noted, all pointer variables below are pointers to 200 | * arrays of length maxTapes, holding per-tape data. 201 | */ 202 | 203 | /* 204 | * This variable is only used during merge passes. mergeactive[i] is true 205 | * if we are reading an input run from (actual) tape number i and have not 206 | * yet exhausted that run. 207 | */ 208 | bool *mergeactive; /* active input run source? 
*/ 209 | 210 | /* 211 | * Variables for Algorithm D. Note that destTape is a "logical" tape 212 | * number, ie, an index into the tp_xxx[] arrays. Be careful to keep 213 | * "logical" and "actual" tape numbers straight! 214 | */ 215 | int Level; /* Knuth's l */ 216 | int destTape; /* current output tape (Knuth's j, less 1) */ 217 | int *tp_fib; /* Target Fibonacci run counts (A[]) */ 218 | int *tp_runs; /* # of real runs on each tape */ 219 | int *tp_dummy; /* # of dummy runs for each tape (D[]) */ 220 | int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ 221 | int activeTapes; /* # of active input tapes in merge pass */ 222 | 223 | /* 224 | * These variables are used after completion of sorting to keep track of 225 | * the next tuple to return. (In the tape case, the tape's current read 226 | * position is also critical state.) 227 | */ 228 | int result_tape; /* actual tape number of finished output */ 229 | int current; /* array index (only used if SORTEDINMEM) */ 230 | bool eof_reached; /* reached EOF (needed for cursors) */ 231 | 232 | /* markpos_xxx holds marked position for mark and restore */ 233 | long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ 234 | int markpos_offset; /* saved "current", or offset in tape block */ 235 | bool markpos_eof; /* saved "eof_reached" */ 236 | 237 | /* 238 | * These variables are used during parallel sorting. 239 | * 240 | * worker is our worker identifier. Follows the general convention that 241 | * -1 value relates to a leader tuplesort, and values >= 0 worker 242 | * tuplesorts. (-1 can also be a serial tuplesort.) 243 | * 244 | * shared is mutable shared memory state, which is used to coordinate 245 | * parallel sorts. 246 | * 247 | * nParticipants is the number of worker Tuplesortstates known by the 248 | * leader to have actually been launched, which implies that they must 249 | * finish a run leader can merge. Typically includes a worker state held 250 | * by the leader process itself. Set in the leader Tuplesortstate only. 251 | */ 252 | int worker; 253 | Sharedsort *shared; 254 | int nParticipants; 255 | 256 | /* 257 | * The sortKeys variable is used by every case other than the hash index 258 | * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the 259 | * MinimalTuple and CLUSTER routines, though. 260 | */ 261 | TupleDesc tupDesc; 262 | SortSupport sortKeys; /* array of length nKeys */ 263 | 264 | /* 265 | * This variable is shared by the single-key MinimalTuple case and the 266 | * Datum case (which both use qsort_ssup()). Otherwise it's NULL. 267 | */ 268 | SortSupport onlyKey; 269 | 270 | /* 271 | * Additional state for managing "abbreviated key" sortsupport routines 272 | * (which currently may be used by all cases except the hash index case). 273 | * Tracks the intervals at which the optimization's effectiveness is 274 | * tested. 275 | */ 276 | int64 abbrevNext; /* Tuple # at which to next check 277 | * applicability */ 278 | 279 | /* 280 | * These variables are specific to the CLUSTER case; they are set by 281 | * tuplesort_begin_cluster. 282 | */ 283 | IndexInfo *indexInfo; /* info about index being used for reference */ 284 | EState *estate; /* for evaluating index expressions */ 285 | 286 | /* 287 | * These variables are specific to the IndexTuple case; they are set by 288 | * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
289 | */ 290 | Relation heapRel; /* table the index is being built on */ 291 | Relation indexRel; /* index being built */ 292 | 293 | /* These are specific to the index_btree subcase: */ 294 | bool enforceUnique; /* complain if we find duplicate tuples */ 295 | 296 | /* These are specific to the index_hash subcase: */ 297 | uint32 high_mask; /* masks for sortable part of hash code */ 298 | uint32 low_mask; 299 | uint32 max_buckets; 300 | 301 | /* 302 | * These variables are specific to the Datum case; they are set by 303 | * tuplesort_begin_datum and used only by the DatumTuple routines. 304 | */ 305 | Oid datumType; 306 | /* we need typelen in order to know how to copy the Datums. */ 307 | int datumTypeLen; 308 | 309 | /* 310 | * Resource snapshot for time of sort start. 311 | */ 312 | #ifdef TRACE_SORT 313 | PGRUsage ru_start; 314 | #endif 315 | } pgsrt_Tuplesortstate; 316 | 317 | #endif /* PG_SORTSTATS_IMPORT_PG13_H */ 318 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg14.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG14_H 2 | #define PG_SORTSTATS_IMPORT_PG14_H 3 | 4 | #include "utils/pg_rusage.h" 5 | 6 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 7 | 8 | /* 9 | * AllocChunk 10 | * The prefix of each piece of memory in an AllocBlock 11 | * 12 | * Note: to meet the memory context APIs, the payload area of the chunk must 13 | * be maxaligned, and the "aset" link must be immediately adjacent to the 14 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 15 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 16 | * we can ensure things work by adding any required alignment padding before 17 | * the "aset" field. There is a static assertion below that the alignment 18 | * is done correctly. 19 | */ 20 | typedef struct pgsrt_AllocChunkData 21 | { 22 | /* size is always the size of the usable space in the chunk */ 23 | Size size; 24 | #ifdef MEMORY_CONTEXT_CHECKING 25 | /* when debugging memory usage, also store actual requested size */ 26 | /* this is zero in a free chunk */ 27 | Size requested_size; 28 | 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 30 | #else 31 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 32 | #endif /* MEMORY_CONTEXT_CHECKING */ 33 | 34 | /* ensure proper alignment by adding padding if needed */ 35 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 36 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 37 | #endif 38 | 39 | /* aset is the owning aset if allocated, or the freelist link if free */ 40 | void *aset; 41 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 42 | } pgsrt_AllocChunkData; 43 | 44 | 45 | #define SLAB_SLOT_SIZE 1024 46 | typedef union SlabSlot 47 | { 48 | union SlabSlot *nextfree; 49 | char buffer[SLAB_SLOT_SIZE]; 50 | } SlabSlot; 51 | 52 | typedef struct 53 | { 54 | void *tuple; /* the tuple itself */ 55 | Datum datum1; /* value of first key column */ 56 | bool isnull1; /* is first key column NULL? 
*/ 57 | int tupindex; /* see notes above */ 58 | } SortTuple; 59 | 60 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 61 | Tuplesortstate *state); 62 | typedef enum 63 | { 64 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 65 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 66 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 67 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 68 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 69 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 70 | } TupSortStatus; 71 | 72 | typedef struct pgsrt_Tuplesortstate 73 | { 74 | TupSortStatus status; /* enumerated value as shown above */ 75 | int nKeys; /* number of columns in sort key */ 76 | bool randomAccess; /* did caller request random access? */ 77 | bool bounded; /* did caller specify a maximum number of 78 | * tuples to return? */ 79 | bool boundUsed; /* true if we made use of a bounded heap */ 80 | int bound; /* if bounded, the maximum number of tuples */ 81 | bool tuples; /* Can SortTuple.tuple ever be set? */ 82 | int64 availMem; /* remaining memory available, in bytes */ 83 | int64 allowedMem; /* total memory allowed, in bytes */ 84 | int maxTapes; /* number of tapes (Knuth's T) */ 85 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 86 | int64 maxSpace; /* maximum amount of space occupied among sort 87 | * of groups, either in-memory or on-disk */ 88 | bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk 89 | * space, false when it's value for in-memory 90 | * space */ 91 | TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ 92 | MemoryContext maincontext; /* memory context for tuple sort metadata that 93 | * persists across multiple batches */ 94 | MemoryContext sortcontext; /* memory context holding most sort data */ 95 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 96 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 97 | 98 | /* 99 | * These function pointers decouple the routines that must know what kind 100 | * of tuple we are sorting from the routines that don't need to know it. 101 | * They are set up by the tuplesort_begin_xxx routines. 102 | * 103 | * Function to compare two tuples; result is per qsort() convention, ie: 104 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 105 | * qsort_arg_comparator. 106 | */ 107 | SortTupleComparator comparetup; 108 | 109 | /* 110 | * Function to copy a supplied input tuple into palloc'd space and set up 111 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 112 | * state->availMem must be decreased by the amount of space used for the 113 | * tuple copy (note the SortTuple struct itself is not counted). 114 | */ 115 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 116 | 117 | /* 118 | * Function to write a stored tuple onto tape. The representation of the 119 | * tuple on tape need not be the same as it is in memory; requirements on 120 | * the tape representation are given below. Unless the slab allocator is 121 | * used, after writing the tuple, pfree() the out-of-line data (not the 122 | * SortTuple struct!), and increase state->availMem by the amount of 123 | * memory space thereby released. 124 | */ 125 | void (*writetup) (Tuplesortstate *state, int tapenum, 126 | SortTuple *stup); 127 | 128 | /* 129 | * Function to read a stored tuple from tape back into memory. 'len' is 130 | * the already-read length of the stored tuple.
The tuple is allocated 131 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 132 | */ 133 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 134 | int tapenum, unsigned int len); 135 | 136 | /* 137 | * This array holds the tuples now in sort memory. If we are in state 138 | * INITIAL, the tuples are in no particular order; if we are in state 139 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 140 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 141 | * H. In state SORTEDONTAPE, the array is not used. 142 | */ 143 | SortTuple *memtuples; /* array of SortTuple structs */ 144 | int memtupcount; /* number of tuples currently present */ 145 | int memtupsize; /* allocated length of memtuples array */ 146 | bool growmemtuples; /* memtuples' growth still underway? */ 147 | 148 | /* 149 | * Memory for tuples is sometimes allocated using a simple slab allocator, 150 | * rather than with palloc(). Currently, we switch to slab allocation 151 | * when we start merging. Merging only needs to keep a small, fixed 152 | * number of tuples in memory at any time, so we can avoid the 153 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 154 | * to hold the tuples. 155 | * 156 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 157 | * slots. The allocation is sized to have one slot per tape, plus one 158 | * additional slot. We need that many slots to hold all the tuples kept 159 | * in the heap during merge, plus the one we have last returned from the 160 | * sort, with tuplesort_gettuple. 161 | * 162 | * Initially, all the slots are kept in a linked list of free slots. When 163 | * a tuple is read from a tape, it is put to the next available slot, if 164 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 165 | * instead. 166 | * 167 | * When we're done processing a tuple, we return the slot back to the free 168 | * list, or pfree() if it was palloc'd. We know that a tuple was 169 | * allocated from the slab, if its pointer value is between 170 | * slabMemoryBegin and -End. 171 | * 172 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 173 | * tracking memory usage is not used. 174 | */ 175 | bool slabAllocatorUsed; 176 | 177 | char *slabMemoryBegin; /* beginning of slab memory arena */ 178 | char *slabMemoryEnd; /* end of slab memory arena */ 179 | SlabSlot *slabFreeHead; /* head of free list */ 180 | 181 | /* Buffer size to use for reading input tapes, during merge. */ 182 | size_t read_buffer_size; 183 | 184 | /* 185 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 186 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 187 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 188 | * recycle the memory on next gettuple call. 189 | */ 190 | void *lastReturnedTuple; 191 | 192 | /* 193 | * While building initial runs, this is the current output run number. 194 | * Afterwards, it is the number of initial runs we made. 195 | */ 196 | int currentRun; 197 | 198 | /* 199 | * Unless otherwise noted, all pointer variables below are pointers to 200 | * arrays of length maxTapes, holding per-tape data. 201 | */ 202 | 203 | /* 204 | * This variable is only used during merge passes. mergeactive[i] is true 205 | * if we are reading an input run from (actual) tape number i and have not 206 | * yet exhausted that run. 207 | */ 208 | bool *mergeactive; /* active input run source? 
*/ 209 | 210 | /* 211 | * Variables for Algorithm D. Note that destTape is a "logical" tape 212 | * number, ie, an index into the tp_xxx[] arrays. Be careful to keep 213 | * "logical" and "actual" tape numbers straight! 214 | */ 215 | int Level; /* Knuth's l */ 216 | int destTape; /* current output tape (Knuth's j, less 1) */ 217 | int *tp_fib; /* Target Fibonacci run counts (A[]) */ 218 | int *tp_runs; /* # of real runs on each tape */ 219 | int *tp_dummy; /* # of dummy runs for each tape (D[]) */ 220 | int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ 221 | int activeTapes; /* # of active input tapes in merge pass */ 222 | 223 | /* 224 | * These variables are used after completion of sorting to keep track of 225 | * the next tuple to return. (In the tape case, the tape's current read 226 | * position is also critical state.) 227 | */ 228 | int result_tape; /* actual tape number of finished output */ 229 | int current; /* array index (only used if SORTEDINMEM) */ 230 | bool eof_reached; /* reached EOF (needed for cursors) */ 231 | 232 | /* markpos_xxx holds marked position for mark and restore */ 233 | long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ 234 | int markpos_offset; /* saved "current", or offset in tape block */ 235 | bool markpos_eof; /* saved "eof_reached" */ 236 | 237 | /* 238 | * These variables are used during parallel sorting. 239 | * 240 | * worker is our worker identifier. Follows the general convention that 241 | * -1 value relates to a leader tuplesort, and values >= 0 worker 242 | * tuplesorts. (-1 can also be a serial tuplesort.) 243 | * 244 | * shared is mutable shared memory state, which is used to coordinate 245 | * parallel sorts. 246 | * 247 | * nParticipants is the number of worker Tuplesortstates known by the 248 | * leader to have actually been launched, which implies that they must 249 | * finish a run leader can merge. Typically includes a worker state held 250 | * by the leader process itself. Set in the leader Tuplesortstate only. 251 | */ 252 | int worker; 253 | Sharedsort *shared; 254 | int nParticipants; 255 | 256 | /* 257 | * The sortKeys variable is used by every case other than the hash index 258 | * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the 259 | * MinimalTuple and CLUSTER routines, though. 260 | */ 261 | TupleDesc tupDesc; 262 | SortSupport sortKeys; /* array of length nKeys */ 263 | 264 | /* 265 | * This variable is shared by the single-key MinimalTuple case and the 266 | * Datum case (which both use qsort_ssup()). Otherwise it's NULL. 267 | */ 268 | SortSupport onlyKey; 269 | 270 | /* 271 | * Additional state for managing "abbreviated key" sortsupport routines 272 | * (which currently may be used by all cases except the hash index case). 273 | * Tracks the intervals at which the optimization's effectiveness is 274 | * tested. 275 | */ 276 | int64 abbrevNext; /* Tuple # at which to next check 277 | * applicability */ 278 | 279 | /* 280 | * These variables are specific to the CLUSTER case; they are set by 281 | * tuplesort_begin_cluster. 282 | */ 283 | IndexInfo *indexInfo; /* info about index being used for reference */ 284 | EState *estate; /* for evaluating index expressions */ 285 | 286 | /* 287 | * These variables are specific to the IndexTuple case; they are set by 288 | * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
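 * (e.g. tuplesort_begin_index_btree and tuplesort_begin_index_hash)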
289 | */ 290 | Relation heapRel; /* table the index is being built on */ 291 | Relation indexRel; /* index being built */ 292 | 293 | /* These are specific to the index_btree subcase: */ 294 | bool enforceUnique; /* complain if we find duplicate tuples */ 295 | 296 | /* These are specific to the index_hash subcase: */ 297 | uint32 high_mask; /* masks for sortable part of hash code */ 298 | uint32 low_mask; 299 | uint32 max_buckets; 300 | 301 | /* 302 | * These variables are specific to the Datum case; they are set by 303 | * tuplesort_begin_datum and used only by the DatumTuple routines. 304 | */ 305 | Oid datumType; 306 | /* we need typelen in order to know how to copy the Datums. */ 307 | int datumTypeLen; 308 | 309 | /* 310 | * Resource snapshot for time of sort start. 311 | */ 312 | #ifdef TRACE_SORT 313 | PGRUsage ru_start; 314 | #endif 315 | } pgsrt_Tuplesortstate; 316 | 317 | #endif /* PG_SORTSTATS_IMPORT_PG14_H */ 318 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg9_4.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG9_4_H 2 | #define PG_SORTSTATS_IMPORT_PG9_4_H 3 | 4 | #define SizeofMinimalTupleHeader offsetof(MinimalTupleData, t_bits) 5 | /* 6 | * AllocChunk 7 | * The prefix of each piece of memory in an AllocBlock 8 | * 9 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 10 | */ 11 | typedef struct pgsrt_AllocChunkData 12 | { 13 | /* aset is the owning aset if allocated, or the freelist link if free */ 14 | void *aset; 15 | /* size is always the size of the usable space in the chunk */ 16 | Size size; 17 | #ifdef MEMORY_CONTEXT_CHECKING 18 | /* when debugging memory usage, also store actual requested size */ 19 | /* this is zero in a free chunk */ 20 | Size requested_size; 21 | #endif 22 | } pgsrt_AllocChunkData; 23 | 24 | #define PGSRT_ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(pgsrt_AllocChunkData)) 25 | 26 | 27 | typedef struct 28 | { 29 | void *tuple; /* the tuple proper */ 30 | Datum datum1; /* value of first key column */ 31 | bool isnull1; /* is first key column NULL? */ 32 | int tupindex; /* see notes above */ 33 | } SortTuple; 34 | 35 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 36 | Tuplesortstate *state); 37 | 38 | typedef enum 39 | { 40 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 41 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 42 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 43 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 44 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 45 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 46 | } TupSortStatus; 47 | 48 | typedef struct pgsrt_Tuplesortstate 49 | { 50 | TupSortStatus status; /* enumerated value as shown above */ 51 | int nKeys; /* number of columns in sort key */ 52 | bool randomAccess; /* did caller request random access? */ 53 | bool bounded; /* did caller specify a maximum number of 54 | * tuples to return? 
*/ 55 | bool boundUsed; /* true if we made use of a bounded heap */ 56 | int bound; /* if bounded, the maximum number of tuples */ 57 | int64 availMem; /* remaining memory available, in bytes */ 58 | int64 allowedMem; /* total memory allowed, in bytes */ 59 | int maxTapes; /* number of tapes (Knuth's T) */ 60 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 61 | MemoryContext sortcontext; /* memory context holding all sort data */ 62 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 63 | 64 | /* 65 | * These function pointers decouple the routines that must know what kind 66 | * of tuple we are sorting from the routines that don't need to know it. 67 | * They are set up by the tuplesort_begin_xxx routines. 68 | * 69 | * Function to compare two tuples; result is per qsort() convention, ie: 70 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 71 | * qsort_arg_comparator. 72 | */ 73 | SortTupleComparator comparetup; 74 | 75 | /* 76 | * Function to copy a supplied input tuple into palloc'd space and set up 77 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 78 | * state->availMem must be decreased by the amount of space used for the 79 | * tuple copy (note the SortTuple struct itself is not counted). 80 | */ 81 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 82 | 83 | /* 84 | * Function to write a stored tuple onto tape. The representation of the 85 | * tuple on tape need not be the same as it is in memory; requirements on 86 | * the tape representation are given below. After writing the tuple, 87 | * pfree() the out-of-line data (not the SortTuple struct!), and increase 88 | * state->availMem by the amount of memory space thereby released. 89 | */ 90 | void (*writetup) (Tuplesortstate *state, int tapenum, 91 | SortTuple *stup); 92 | 93 | /* 94 | * Function to read a stored tuple from tape back into memory. 'len' is 95 | * the already-read length of the stored tuple. Create a palloc'd copy, 96 | * initialize tuple/datum1/isnull1 in the target SortTuple struct, and 97 | * decrease state->availMem by the amount of memory space consumed. 98 | */ 99 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 100 | int tapenum, unsigned int len); 101 | 102 | /* 103 | * Function to reverse the sort direction from its current state. (We 104 | * could dispense with this if we wanted to enforce that all variants 105 | * represent the sort key information alike.) 106 | */ 107 | void (*reversedirection) (Tuplesortstate *state); 108 | 109 | /* 110 | * This array holds the tuples now in sort memory. If we are in state 111 | * INITIAL, the tuples are in no particular order; if we are in state 112 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 113 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 114 | * H. (Note that memtupcount only counts the tuples that are part of the 115 | * heap --- during merge passes, memtuples[] entries beyond tapeRange are 116 | * never in the heap and are used to hold pre-read tuples.) In state 117 | * SORTEDONTAPE, the array is not used. 118 | */ 119 | SortTuple *memtuples; /* array of SortTuple structs */ 120 | int memtupcount; /* number of tuples currently present */ 121 | int memtupsize; /* allocated length of memtuples array */ 122 | bool growmemtuples; /* memtuples' growth still underway? */ 123 | 124 | /* 125 | * While building initial runs, this is the current output run number 126 | * (starting at 0). Afterwards, it is the number of initial runs we made.
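 * (For example, a sort that spilled to disk and built seven initial runs
 * ends with currentRun == 7 once run building is finished.)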
127 | */ 128 | int currentRun; 129 | } pgsrt_Tuplesortstate; 130 | 131 | #endif /* PG_SORTSTATS_IMPORT_PG9_4_H */ 132 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg9_5.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG9_5_H 2 | #define PG_SORTSTATS_IMPORT_PG9_5_H 3 | 4 | 5 | /* 6 | * AllocChunk 7 | * The prefix of each piece of memory in an AllocBlock 8 | * 9 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 10 | */ 11 | typedef struct pgsrt_AllocChunkData 12 | { 13 | /* aset is the owning aset if allocated, or the freelist link if free */ 14 | void *aset; 15 | /* size is always the size of the usable space in the chunk */ 16 | Size size; 17 | #ifdef MEMORY_CONTEXT_CHECKING 18 | /* when debugging memory usage, also store actual requested size */ 19 | /* this is zero in a free chunk */ 20 | Size requested_size; 21 | #endif 22 | } pgsrt_AllocChunkData; 23 | 24 | #define PGSRT_ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(pgsrt_AllocChunkData)) 25 | 26 | 27 | typedef struct 28 | { 29 | void *tuple; /* the tuple proper */ 30 | Datum datum1; /* value of first key column */ 31 | bool isnull1; /* is first key column NULL? */ 32 | int tupindex; /* see notes above */ 33 | } SortTuple; 34 | 35 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 36 | Tuplesortstate *state); 37 | 38 | typedef enum 39 | { 40 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 41 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 42 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 43 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 44 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 45 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 46 | } TupSortStatus; 47 | 48 | typedef struct pgsrt_Tuplesortstate 49 | { 50 | TupSortStatus status; /* enumerated value as shown above */ 51 | int nKeys; /* number of columns in sort key */ 52 | bool randomAccess; /* did caller request random access? */ 53 | bool bounded; /* did caller specify a maximum number of 54 | * tuples to return? */ 55 | bool boundUsed; /* true if we made use of a bounded heap */ 56 | int bound; /* if bounded, the maximum number of tuples */ 57 | int64 availMem; /* remaining memory available, in bytes */ 58 | int64 allowedMem; /* total memory allowed, in bytes */ 59 | int maxTapes; /* number of tapes (Knuth's T) */ 60 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 61 | MemoryContext sortcontext; /* memory context holding all sort data */ 62 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 63 | 64 | /* 65 | * These function pointers decouple the routines that must know what kind 66 | * of tuple we are sorting from the routines that don't need to know it. 67 | * They are set up by the tuplesort_begin_xxx routines. 68 | * 69 | * Function to compare two tuples; result is per qsort() convention, ie: 70 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 71 | * qsort_arg_comparator. 72 | */ 73 | SortTupleComparator comparetup; 74 | 75 | /* 76 | * Function to copy a supplied input tuple into palloc'd space and set up 77 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 78 | * state->availMem must be decreased by the amount of space used for the 79 | * tuple copy (note the SortTuple struct itself is not counted).
80 | */ 81 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 82 | 83 | /* 84 | * Function to write a stored tuple onto tape. The representation of the 85 | * tuple on tape need not be the same as it is in memory; requirements on 86 | * the tape representation are given below. After writing the tuple, 87 | * pfree() the out-of-line data (not the SortTuple struct!), and increase 88 | * state->availMem by the amount of memory space thereby released. 89 | */ 90 | void (*writetup) (Tuplesortstate *state, int tapenum, 91 | SortTuple *stup); 92 | 93 | /* 94 | * Function to read a stored tuple from tape back into memory. 'len' is 95 | * the already-read length of the stored tuple. Create a palloc'd copy, 96 | * initialize tuple/datum1/isnull1 in the target SortTuple struct, and 97 | * decrease state->availMem by the amount of memory space consumed. 98 | */ 99 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 100 | int tapenum, unsigned int len); 101 | 102 | /* 103 | * This array holds the tuples now in sort memory. If we are in state 104 | * INITIAL, the tuples are in no particular order; if we are in state 105 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 106 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 107 | * H. (Note that memtupcount only counts the tuples that are part of the 108 | * heap --- during merge passes, memtuples[] entries beyond tapeRange are 109 | * never in the heap and are used to hold pre-read tuples.) In state 110 | * SORTEDONTAPE, the array is not used. 111 | */ 112 | SortTuple *memtuples; /* array of SortTuple structs */ 113 | int memtupcount; /* number of tuples currently present */ 114 | int memtupsize; /* allocated length of memtuples array */ 115 | bool growmemtuples; /* memtuples' growth still underway? */ 116 | 117 | /* 118 | * While building initial runs, this is the current output run number 119 | * (starting at 0). Afterwards, it is the number of initial runs we made. 120 | */ 121 | int currentRun; 122 | } pgsrt_Tuplesortstate; 123 | 124 | #endif /* PG_SORTSTATS_IMPORT_PG9_5_H */ 125 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg9_6.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG9_6_H 2 | #define PG_SORTSTATS_IMPORT_PG9_6_H 3 | 4 | /* 5 | * AllocChunk 6 | * The prefix of each piece of memory in an AllocBlock 7 | * 8 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 9 | */ 10 | typedef struct pgsrt_AllocChunkData 11 | { 12 | /* aset is the owning aset if allocated, or the freelist link if free */ 13 | void *aset; 14 | /* size is always the size of the usable space in the chunk */ 15 | Size size; 16 | #ifdef MEMORY_CONTEXT_CHECKING 17 | /* when debugging memory usage, also store actual requested size */ 18 | /* this is zero in a free chunk */ 19 | Size requested_size; 20 | #endif 21 | } pgsrt_AllocChunkData; 22 | 23 | #define PGSRT_ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(pgsrt_AllocChunkData)) 24 | 25 | 26 | typedef struct 27 | { 28 | void *tuple; /* the tuple itself */ 29 | Datum datum1; /* value of first key column */ 30 | bool isnull1; /* is first key column NULL? 
*/ 31 | int tupindex; /* see notes above */ 32 | } SortTuple; 33 | 34 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 35 | Tuplesortstate *state); 36 | 37 | typedef enum 38 | { 39 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 40 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 41 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 42 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 43 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 44 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 45 | } TupSortStatus; 46 | 47 | typedef struct pgsrt_Tuplesortstate 48 | { 49 | TupSortStatus status; /* enumerated value as shown above */ 50 | int nKeys; /* number of columns in sort key */ 51 | bool randomAccess; /* did caller request random access? */ 52 | bool bounded; /* did caller specify a maximum number of 53 | * tuples to return? */ 54 | bool boundUsed; /* true if we made use of a bounded heap */ 55 | int bound; /* if bounded, the maximum number of tuples */ 56 | bool tuples; /* Can SortTuple.tuple ever be set? */ 57 | int64 availMem; /* remaining memory available, in bytes */ 58 | int64 allowedMem; /* total memory allowed, in bytes */ 59 | int maxTapes; /* number of tapes (Knuth's T) */ 60 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 61 | MemoryContext sortcontext; /* memory context holding most sort data */ 62 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 63 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 64 | 65 | /* 66 | * These function pointers decouple the routines that must know what kind 67 | * of tuple we are sorting from the routines that don't need to know it. 68 | * They are set up by the tuplesort_begin_xxx routines. 69 | * 70 | * Function to compare two tuples; result is per qsort() convention, ie: 71 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 72 | * qsort_arg_comparator. 73 | */ 74 | SortTupleComparator comparetup; 75 | 76 | /* 77 | * Function to copy a supplied input tuple into palloc'd space and set up 78 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 79 | * state->availMem must be decreased by the amount of space used for the 80 | * tuple copy (note the SortTuple struct itself is not counted). 81 | */ 82 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 83 | 84 | /* 85 | * Function to write a stored tuple onto tape. The representation of the 86 | * tuple on tape need not be the same as it is in memory; requirements on 87 | * the tape representation are given below. After writing the tuple, 88 | * pfree() the out-of-line data (not the SortTuple struct!), and increase 89 | * state->availMem by the amount of memory space thereby released. 90 | */ 91 | void (*writetup) (Tuplesortstate *state, int tapenum, 92 | SortTuple *stup); 93 | 94 | /* 95 | * Function to read a stored tuple from tape back into memory. 'len' is 96 | * the already-read length of the stored tuple. Create a palloc'd copy, 97 | * initialize tuple/datum1/isnull1 in the target SortTuple struct, and 98 | * decrease state->availMem by the amount of memory space consumed. (See 99 | * batchUsed notes for details on how memory is handled when incremental 100 | * accounting is abandoned.) 101 | */ 102 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 103 | int tapenum, unsigned int len); 104 | 105 | /* 106 | * Function to move a caller tuple.
This is usually implemented as a 107 | * memmove() shim, but function may also perform additional fix-up of 108 | * caller tuple where needed. Batch memory support requires the movement 109 | * of caller tuples from one location in memory to another. 110 | */ 111 | void (*movetup) (void *dest, void *src, unsigned int len); 112 | 113 | /* 114 | * This array holds the tuples now in sort memory. If we are in state 115 | * INITIAL, the tuples are in no particular order; if we are in state 116 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 117 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 118 | * H. (Note that memtupcount only counts the tuples that are part of the 119 | * heap --- during merge passes, memtuples[] entries beyond tapeRange are 120 | * never in the heap and are used to hold pre-read tuples.) In state 121 | * SORTEDONTAPE, the array is not used. 122 | */ 123 | SortTuple *memtuples; /* array of SortTuple structs */ 124 | int memtupcount; /* number of tuples currently present */ 125 | int memtupsize; /* allocated length of memtuples array */ 126 | bool growmemtuples; /* memtuples' growth still underway? */ 127 | 128 | /* 129 | * Memory for tuples is sometimes allocated in batch, rather than 130 | * incrementally. This implies that incremental memory accounting has 131 | * been abandoned. Currently, this only happens for the final on-the-fly 132 | * merge step. Large batch allocations can store tuples (e.g. 133 | * IndexTuples) without palloc() fragmentation and other overhead. 134 | */ 135 | bool batchUsed; 136 | 137 | /* 138 | * While building initial runs, this indicates if the replacement 139 | * selection strategy is in use. When it isn't, then a simple hybrid 140 | * sort-merge strategy is in use instead (runs are quicksorted). 141 | */ 142 | bool replaceActive; 143 | 144 | /* 145 | * While building initial runs, this is the current output run number 146 | * (starting at RUN_FIRST). Afterwards, it is the number of initial runs 147 | * we made. 148 | */ 149 | int currentRun; 150 | } pgsrt_Tuplesortstate; 151 | 152 | #endif /* PG_SORTSTATS_IMPORT_PG9_6_H */ 153 | -------------------------------------------------------------------------------- /pg_sortstats--0.0.1.sql: -------------------------------------------------------------------------------- 1 | -- This program is open source, licensed under the PostgreSQL License. 2 | -- For license terms, see the LICENSE file. 3 | -- 4 | -- Copyright (C) 2018-2023: The PoWA-team 5 | 6 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 7 | \echo Use "CREATE EXTENSION pg_sortstats" to load this file. 
\quit 8 | 9 | SET client_encoding = 'UTF8'; 10 | 11 | CREATE FUNCTION pg_sortstats(IN showtext boolean, 12 | OUT queryid bigint, 13 | OUT userid oid, 14 | OUT dbid oid, 15 | OUT nb_keys integer, 16 | OUT sort_keys text, 17 | OUT lines bigint, 18 | OUT lines_to_sort bigint, 19 | OUT work_mems bigint, 20 | OUT topn_sorts bigint, 21 | OUT quicksorts bigint, 22 | OUT external_sorts bigint, 23 | OUT external_merges bigint, 24 | OUT nb_tapes bigint, 25 | OUT space_disk bigint, 26 | OUT space_memory bigint, 27 | OUT non_parallels bigint, 28 | OUT nb_workers bigint 29 | ) 30 | RETURNS SETOF record 31 | AS '$libdir/pg_sortstats', 'pg_sortstats' 32 | LANGUAGE C STRICT VOLATILE COST 1000; 33 | 34 | CREATE VIEW pg_sortstats AS SELECT * FROM pg_sortstats(true); 35 | 36 | CREATE FUNCTION pg_sortstats_reset() 37 | RETURNS void 38 | LANGUAGE c COST 1000 39 | AS '$libdir/pg_sortstats', 'pg_sortstats_reset'; 40 | -------------------------------------------------------------------------------- /pg_sortstats.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pg_sortstats.c 4 | * Track statistics about sorts performed, and also estimate how much 5 | * work_mem would have been needed to sort data in memory. 6 | * 7 | * This module is heavily inspired by the great pg_stat_statements official 8 | * contrib. The same locking rules are used, which for reference are: 9 | * 10 | * Note about locking issues: to create or delete an entry in the shared 11 | * hashtable, one must hold pgsrt->lock exclusively. Modifying any field 12 | * in an entry except the counters requires the same. To look up an entry, 13 | * one must hold the lock shared. To read or update the counters within 14 | * an entry, one must hold the lock shared or exclusive (so the entry doesn't 15 | * disappear!) and also take the entry's mutex spinlock. 16 | * The shared state variable pgsrt->extent (the next free spot in the external 17 | * keys-text file) should be accessed only while holding either the 18 | * pgsrt->mutex spinlock, or exclusive lock on pgsrt->lock. We use the mutex 19 | * to allow reserving file space while holding only shared lock on pgsrt->lock. 20 | * Rewriting the entire external keys-text file, eg for garbage collection, 21 | * requires holding pgsrt->lock exclusively; this allows individual entries 22 | * in the file to be read or written while holding only shared lock.
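 *
 * As a minimal sketch of these rules (hypothetical code, using the pgsrt
 * names defined later in this file), reading one entry's counters takes
 * the shared LWLock plus that entry's spinlock:
 *
 *		LWLockAcquire(pgsrt->lock, LW_SHARED);
 *		entry = (pgsrtEntry *) hash_search(pgsrt_hash, &key, HASH_FIND, NULL);
 *		if (entry)
 *		{
 *			SpinLockAcquire(&entry->mutex);
 *			counters = entry->counters;
 *			SpinLockRelease(&entry->mutex);
 *		}
 *		LWLockRelease(pgsrt->lock);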
23 | * 24 | * Copyright (c) 2018-2023, The PoWA-team 25 | * 26 | *------------------------------------------------------------------------- 27 | */ 28 | #include "postgres.h" 29 | 30 | #include 31 | #include 32 | 33 | #include "fmgr.h" 34 | #include "funcapi.h" 35 | #include "miscadmin.h" 36 | #include "pgstat.h" 37 | #include "access/hash.h" 38 | #include "access/htup_details.h" 39 | #if PG_VERSION_NUM >= 90600 40 | #include "access/parallel.h" 41 | #endif 42 | #include "mb/pg_wchar.h" 43 | #include "nodes/nodeFuncs.h" 44 | #include "parser/parsetree.h" 45 | #if PG_VERSION_NUM >= 90600 46 | #include "postmaster/autovacuum.h" 47 | #endif 48 | #if PG_VERSION_NUM >= 120000 49 | #include "replication/walsender.h" 50 | #endif 51 | #if PG_VERSION_NUM < 100000 52 | #include "storage/fd.h" 53 | #endif 54 | #include "storage/ipc.h" 55 | #include "storage/lwlock.h" 56 | #include "storage/shmem.h" 57 | #if PG_VERSION_NUM < 110000 58 | #include "storage/spin.h" 59 | #endif 60 | #include "utils/builtins.h" 61 | #include "utils/guc.h" 62 | #if PG_VERSION_NUM < 110000 63 | #include "utils/memutils.h" 64 | #endif 65 | #if PG_VERSION_NUM >= 90500 66 | #include "utils/ruleutils.h" 67 | #endif 68 | #include "utils/tuplesort.h" 69 | 70 | #include "include/pg_sortstats_import.h" 71 | 72 | PG_MODULE_MAGIC; 73 | 74 | /*--- Macros and structs ---*/ 75 | 76 | /* Location of permanent stats file (valid when database is shut down) */ 77 | #define PGSRT_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_sortstats.stat" 78 | 79 | /* 80 | * Location of external keys text file. We don't keep it in the core 81 | * system's stats_temp_directory. The core system can safely use that GUC 82 | * setting, because the statistics collector temp file paths are set only once 83 | * as part of changing the GUC, but pg_sortstats has no way of avoiding 84 | * race conditions. Besides, we only expect modest, infrequent I/O for keys 85 | * strings, so placing the file on a faster filesystem is not compelling. 86 | */ 87 | #define PGSRT_TEXT_FILE PG_STAT_TMP_DIR "/pgsrt_sortkey_texts.stat" 88 | 89 | /* Magic number identifying the stats file format */ 90 | static const uint32 PGSRT_FILE_HEADER = 0x20180804; 91 | 92 | /* PostgreSQL major version number, changes in which invalidate all entries */ 93 | static const uint32 PGSRT_PG_MAJOR_VERSION = PG_VERSION_NUM / 100; 94 | 95 | #define PGSRT_COLUMNS 17 /* number of columns in pg_sortstats SRF */ 96 | #define USAGE_DECREASE_FACTOR (0.99) /* decreased every pgsrt_entry_dealloc */ 97 | #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */ 98 | #define USAGE_INIT (1.0) 99 | #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */ 100 | #define ASSUMED_LENGTH_INIT 128 /* initial assumed mean keys length */ 101 | 102 | #define record_gc_ktexts() \ 103 | do { \ 104 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; \ 105 | SpinLockAcquire(&s->mutex); \ 106 | s->gc_count++; \ 107 | SpinLockRelease(&s->mutex); \ 108 | } while(0) 109 | 110 | 111 | 112 | /* In PostgreSQL 11, queryid becomes a uint64 internally. 
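 * The typedef below tracks that width, so the hashtable key and the on-disk
 * stats file always match the major version the module is compiled against.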
113 | */ 114 | #if PG_VERSION_NUM >= 110000 115 | typedef uint64 pgsrt_queryid; 116 | #else 117 | typedef uint32 pgsrt_queryid; 118 | #endif 119 | 120 | typedef struct pgsrtSharedState 121 | { 122 | LWLockId lock; /* protects hashtable search/modification */ 123 | double cur_median_usage; /* current median usage in hashtable */ 124 | Size mean_keys_len; /* current mean keys text length */ 125 | slock_t mutex; /* protects following fields only: */ 126 | Size extent; /* current extent of keys file */ 127 | int n_writers; /* number of active writers to keys file */ 128 | int gc_count; /* keys file garbage collection cycle count */ 129 | #if PG_VERSION_NUM >= 90600 130 | LWLockId queryids_lock; /* protects following array */ 131 | pgsrt_queryid queryids[FLEXIBLE_ARRAY_MEMBER]; /* queryid of non-worker processes */ 132 | #endif 133 | } pgsrtSharedState; 134 | 135 | typedef struct pgsrtHashKey 136 | { 137 | Oid userid; /* user OID */ 138 | Oid dbid; /* database OID */ 139 | pgsrt_queryid queryid; /* query identifier */ 140 | uint32 sortid; /* sort identifier within a query */ 141 | } pgsrtHashKey; 142 | 143 | typedef struct pgsrtCounters 144 | { 145 | double usage; /* usage factor */ 146 | int64 lines; /* total number of lines in input */ 147 | int64 lines_to_sort; /* total number of lines sorted */ 148 | int64 work_mems; /* total size of estimated work_mem */ 149 | int64 topn_sorts; /* number of top-N heapsorts */ 150 | int64 quicksorts; /* number of quicksorts */ 151 | int64 external_sorts; /* number of external sorts */ 152 | int64 external_merges; /* number of external merges */ 153 | int64 nbtapes; /* total number of tapes used */ 154 | int64 space_disk; /* total disk space consumed */ 155 | int64 space_memory; /* total memory space consumed */ 156 | int64 non_parallels; /* number of non-parallel sorts */ 157 | int64 nb_workers; /* total number of parallel workers (including gather node) */ 158 | } pgsrtCounters; 159 | 160 | typedef struct pgsrtEntry 161 | { 162 | pgsrtHashKey key; 163 | pgsrtCounters counters; /* statistics for this sort */ 164 | int nbkeys; /* # of columns in the sort */ 165 | Size keys_offset; /* deparsed keys text offset in external file */ 166 | int keys_len; /* # of valid bytes in deparsed keys string, or -1 */ 167 | int encoding; /* deparsed keys text encoding */ 168 | slock_t mutex; /* protects the counters only */ 169 | } pgsrtEntry; 170 | 171 | typedef struct pgsrtWalkerContext 172 | { 173 | QueryDesc *queryDesc; 174 | List *ancestors; 175 | List *rtable; 176 | List *rtable_names; 177 | List *deparse_cxt; 178 | } pgsrtWalkerContext; 179 | 180 | /*--- Function declarations ---*/ 181 | 182 | void _PG_init(void); 183 | 184 | 185 | extern PGDLLEXPORT Datum pg_sortstats(PG_FUNCTION_ARGS); 186 | extern PGDLLEXPORT Datum pg_sortstats_reset(PG_FUNCTION_ARGS); 187 | 188 | PG_FUNCTION_INFO_V1(pg_sortstats); 189 | PG_FUNCTION_INFO_V1(pg_sortstats_reset); 190 | 191 | #if PG_VERSION_NUM >= 150000 192 | static void pgsrt_shmem_request(void); 193 | #endif 194 | static void pgsrt_shmem_startup(void); 195 | static void pgsrt_shmem_shutdown(int code, Datum arg); 196 | static void pgsrt_ExecutorStart(QueryDesc *queryDesc, int eflags); 197 | static void pgsrt_ExecutorRun(QueryDesc *queryDesc, 198 | ScanDirection direction, 199 | #if PG_VERSION_NUM >= 90600 200 | uint64 count 201 | #else 202 | long count 203 | #endif 204 | #if PG_VERSION_NUM >= 100000 205 | ,bool execute_once 206 | #endif 207 | ); 208 | static void pgsrt_ExecutorFinish(QueryDesc *queryDesc); 209 | static void
pgsrt_ExecutorEnd(QueryDesc *queryDesc); 210 | 211 | #if PG_VERSION_NUM >= 150000 212 | static shmem_request_hook_type prev_shmem_request_hook = NULL; 213 | #endif 214 | static shmem_startup_hook_type prev_shmem_startup_hook = NULL; 215 | static ExecutorStart_hook_type prev_ExecutorStart = NULL; 216 | static ExecutorRun_hook_type prev_ExecutorRun = NULL; 217 | static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; 218 | static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; 219 | 220 | static Size pgsrt_memsize(void); 221 | #if PG_VERSION_NUM >= 90600 222 | static Size pgsrt_queryids_size(void); 223 | static pgsrt_queryid pgsrt_get_queryid(void); 224 | static void pgsrt_set_queryid(pgsrt_queryid); 225 | #endif 226 | 227 | static pgsrtEntry *pgsrt_entry_alloc(pgsrtHashKey *key, Size keys_offset, 228 | int keys_len, int encoding, int nbkeys); 229 | static void pgsrt_entry_dealloc(void); 230 | static void pgsrt_entry_reset(void); 231 | static void pgsrt_store(pgsrt_queryid queryId, int nbkeys, char *keys, 232 | pgsrtCounters *counters); 233 | static uint32 pgsrt_hash_fn(const void *key, Size keysize); 234 | static int pgsrt_match_fn(const void *key1, const void *key2, Size keysize); 235 | 236 | static bool ktext_store(const char *keys, int keys_len, Size *keys_offset, 237 | int *gc_count); 238 | static char *ktext_load_file(Size *buffer_size); 239 | static char *ktext_fetch(Size keys_offset, int keys_len, char *buffer, 240 | Size buffer_size); 241 | static bool need_gc_ktexts(void); 242 | static void gc_ktexts(void); 243 | 244 | static void pgsrt_process_sortstate(SortState *srtstate, pgsrtWalkerContext *context); 245 | static bool pgsrt_planstate_walker(PlanState *ps, pgsrtWalkerContext *context); 246 | static char * pgsrt_get_sort_group_keys(SortState *srtstate, 247 | int nkeys, AttrNumber *keycols, 248 | Oid *sortOperators, Oid *collations, bool *nullsFirst, 249 | pgsrtWalkerContext *context); 250 | static void pgsrt_setup_walker_context(pgsrtWalkerContext *context); 251 | 252 | static unsigned long round_up_pow2(int64 val); 253 | static int get_alignment_overhead(TupleDesc tupdesc); 254 | 255 | /*--- Local variables ---*/ 256 | static int nesting_level = 0; 257 | 258 | static bool pgsrt_enabled; 259 | static int pgsrt_max; /* max #of sorts to track */ 260 | static bool pgsrt_save; /* whether to save stats across shutdown */ 261 | 262 | static HTAB *pgsrt_hash = NULL; 263 | static pgsrtSharedState *pgsrt = NULL; 264 | 265 | 266 | void 267 | _PG_init(void) 268 | { 269 | if (!process_shared_preload_libraries_in_progress) 270 | { 271 | elog(ERROR, "This module can only be loaded via shared_preload_libraries"); 272 | return; 273 | } 274 | 275 | DefineCustomBoolVariable("pg_sortstats.enabled", 276 | "Enable / Disable pg_sortstats", 277 | NULL, 278 | &pgsrt_enabled, 279 | true, 280 | PGC_USERSET, 281 | 0, 282 | NULL, 283 | NULL, 284 | NULL); 285 | 286 | DefineCustomIntVariable("pg_sortstats.max", 287 | "Sets the maximum number of statements tracked by pg_sortstats.", 288 | NULL, 289 | &pgsrt_max, 290 | 10000, 291 | 100, 292 | INT_MAX, 293 | PGC_POSTMASTER, 294 | 0, 295 | NULL, 296 | NULL, 297 | NULL); 298 | 299 | DefineCustomBoolVariable("pg_sortstats.save", 300 | "Save pg_sortstats statistics across server shutdowns.", 301 | NULL, 302 | &pgsrt_save, 303 | true, 304 | PGC_SIGHUP, 305 | 0, 306 | NULL, 307 | NULL, 308 | NULL); 309 | 310 | EmitWarningsOnPlaceholders("pg_sortstats"); 311 | 312 | #if PG_VERSION_NUM < 150000 313 | /* 314 | * Request additional shared resources. 
(These are no-ops if we're not in 315 | * the postmaster process.) We'll allocate or attach to the shared 316 | * resources in pgsrt_shmem_startup(). 317 | * If you change code here, don't forget to also apply the modifications 318 | * in pgsrt_shmem_request() for pg15 and later. 319 | */ 320 | RequestAddinShmemSpace(pgsrt_memsize()); 321 | #if PG_VERSION_NUM >= 90600 322 | RequestNamedLWLockTranche("pg_sortstats", 2); 323 | #else 324 | RequestAddinLWLocks(1); 325 | #endif /* pg 9.6+ */ 326 | #endif /* pg 15- */ 327 | 328 | /* install hooks */ 329 | prev_ExecutorStart = ExecutorStart_hook; 330 | ExecutorStart_hook = pgsrt_ExecutorStart; 331 | prev_ExecutorRun = ExecutorRun_hook; 332 | ExecutorRun_hook = pgsrt_ExecutorRun; 333 | prev_ExecutorFinish = ExecutorFinish_hook; 334 | ExecutorFinish_hook = pgsrt_ExecutorFinish; 335 | prev_ExecutorEnd = ExecutorEnd_hook; 336 | ExecutorEnd_hook = pgsrt_ExecutorEnd; 337 | #if PG_VERSION_NUM >= 150000 338 | prev_shmem_request_hook = shmem_request_hook; 339 | shmem_request_hook = pgsrt_shmem_request; 340 | #endif 341 | prev_shmem_startup_hook = shmem_startup_hook; 342 | shmem_startup_hook = pgsrt_shmem_startup; 343 | } 344 | 345 | #if PG_VERSION_NUM >= 150000 346 | /* 347 | * Request additional shared memory resources. 348 | * 349 | * If you change code here, don't forget to also apply the modifications in 350 | * _PG_init() for pg14 and below. 351 | */ 352 | static void 353 | pgsrt_shmem_request(void) 354 | { 355 | if (prev_shmem_request_hook) 356 | prev_shmem_request_hook(); 357 | 358 | RequestAddinShmemSpace(pgsrt_memsize()); 359 | RequestNamedLWLockTranche("pg_sortstats", 2); 360 | } 361 | #endif 362 | 363 | static void 364 | pgsrt_shmem_startup(void) 365 | { 366 | bool found; 367 | HASHCTL info; 368 | FILE *file = NULL; 369 | FILE *kfile = NULL; 370 | uint32 header; 371 | int32 num; 372 | int32 pgver; 373 | int32 i; 374 | int buffer_size; 375 | char *buffer = NULL; 376 | Size tottextlen; 377 | int nvalidtexts; 378 | 379 | if (prev_shmem_startup_hook) 380 | prev_shmem_startup_hook(); 381 | 382 | /* reset in case this is a restart within the postmaster */ 383 | pgsrt = NULL; 384 | 385 | /* Create or attach to the shared memory state */ 386 | LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); 387 | 388 | /* global access lock */ 389 | pgsrt = ShmemInitStruct("pg_sortstats", 390 | (sizeof(pgsrtSharedState) 391 | #if PG_VERSION_NUM >= 90600 392 | + pgsrt_queryids_size() 393 | #endif 394 | ), 395 | &found); 396 | 397 | if (!found) 398 | { 399 | /* First time through ...
*/ 400 | #if PG_VERSION_NUM >= 90600 401 | LWLockPadded *locks = GetNamedLWLockTranche("pg_sortstats"); 402 | pgsrt->lock = &(locks[0]).lock; 403 | pgsrt->queryids_lock = &(locks[1]).lock; 404 | memset(pgsrt->queryids, 0, pgsrt_queryids_size()); 405 | #else 406 | pgsrt->lock = LWLockAssign(); 407 | #endif 408 | pgsrt->cur_median_usage = ASSUMED_MEDIAN_INIT; 409 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 410 | SpinLockInit(&pgsrt->mutex); 411 | pgsrt->extent = 0; 412 | pgsrt->n_writers = 0; 413 | pgsrt->gc_count = 0; 414 | } 415 | 416 | memset(&info, 0, sizeof(info)); 417 | info.keysize = sizeof(pgsrtHashKey); 418 | info.entrysize = sizeof(pgsrtEntry); 419 | info.hash = pgsrt_hash_fn; 420 | info.match = pgsrt_match_fn; 421 | 422 | /* allocate stats shared memory hash */ 423 | pgsrt_hash = ShmemInitHash("pg_sortstats hash", 424 | pgsrt_max, pgsrt_max, 425 | &info, 426 | HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); 427 | 428 | LWLockRelease(AddinShmemInitLock); 429 | 430 | if (!IsUnderPostmaster) 431 | on_shmem_exit(pgsrt_shmem_shutdown, (Datum) 0); 432 | 433 | /* 434 | * Done if some other process already completed our initialization. 435 | */ 436 | if (found) 437 | return; 438 | 439 | /* 440 | * Note: we don't bother with locks here, because there should be no other 441 | * processes running when this code is reached. 442 | */ 443 | 444 | /* Unlink keys text file possibly left over from crash */ 445 | unlink(PGSRT_TEXT_FILE); 446 | 447 | /* Allocate new keys text temp file */ 448 | kfile = AllocateFile(PGSRT_TEXT_FILE, PG_BINARY_W); 449 | if (kfile == NULL) 450 | goto write_error; 451 | 452 | /* 453 | * If we were told not to load old statistics, we're done. (Note we do 454 | * not try to unlink any old dump file in this case. This seems a bit 455 | * questionable but it's the historical behavior.) 456 | */ 457 | if (!pgsrt_save) 458 | { 459 | FreeFile(kfile); 460 | return; 461 | } 462 | 463 | /* 464 | * Attempt to load old statistics from the dump file. 
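 * The layout, as written by pgsrt_shmem_shutdown(), is: the file header
 * magic, the PostgreSQL major version, the number of entries, then each
 * pgsrtEntry immediately followed by its NUL-terminated sort-keys text.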
465 | */ 466 | file = AllocateFile(PGSRT_DUMP_FILE, PG_BINARY_R); 467 | if (file == NULL) 468 | { 469 | if (errno != ENOENT) 470 | goto read_error; 471 | /* No existing persisted stats file, so we're done */ 472 | FreeFile(kfile); 473 | return; 474 | } 475 | 476 | buffer_size = 2048; 477 | buffer = (char *) palloc(buffer_size); 478 | 479 | if (fread(&header, sizeof(uint32), 1, file) != 1 || 480 | fread(&pgver, sizeof(uint32), 1, file) != 1 || 481 | fread(&num, sizeof(int32), 1, file) != 1) 482 | goto read_error; 483 | 484 | if (header != PGSRT_FILE_HEADER || 485 | pgver != PGSRT_PG_MAJOR_VERSION) 486 | goto data_error; 487 | 488 | tottextlen = 0; 489 | nvalidtexts = 0; 490 | 491 | for (i = 0; i < num; i++) 492 | { 493 | pgsrtEntry temp; 494 | pgsrtEntry *entry; 495 | Size keys_offset; 496 | 497 | if (fread(&temp, sizeof(pgsrtEntry), 1, file) != 1) 498 | goto read_error; 499 | 500 | /* Encoding is the only field we can easily sanity-check */ 501 | if (!PG_VALID_BE_ENCODING(temp.encoding)) 502 | goto data_error; 503 | 504 | /* Resize buffer as needed */ 505 | if (temp.keys_len >= buffer_size) 506 | { 507 | buffer_size = Max(buffer_size * 2, temp.keys_len + 1); 508 | buffer = repalloc(buffer, buffer_size); 509 | } 510 | 511 | if (fread(buffer, 1, temp.keys_len + 1, file) != temp.keys_len + 1) 512 | goto read_error; 513 | 514 | /* Should have a trailing null, but let's make sure */ 515 | buffer[temp.keys_len] = '\0'; 516 | 517 | /* Store the keys text */ 518 | keys_offset = pgsrt->extent; 519 | if (fwrite(buffer, 1, temp.keys_len + 1, kfile) != temp.keys_len + 1) 520 | goto write_error; 521 | pgsrt->extent += temp.keys_len + 1; 522 | 523 | /* make the hashtable entry (discards old entries if too many) */ 524 | entry = pgsrt_entry_alloc(&temp.key, keys_offset, temp.keys_len, 525 | temp.encoding, temp.nbkeys); 526 | 527 | /* In the mean length computation, ignore dropped texts. */ 528 | if (entry->keys_len >= 0) 529 | { 530 | tottextlen += entry->keys_len + 1; 531 | nvalidtexts++; 532 | } 533 | 534 | /* copy in the actual stats */ 535 | entry->counters = temp.counters; 536 | } 537 | 538 | if (nvalidtexts > 0) 539 | pgsrt->mean_keys_len = tottextlen / nvalidtexts; 540 | else 541 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 542 | 543 | pfree(buffer); 544 | FreeFile(file); 545 | FreeFile(kfile); 546 | 547 | /* 548 | * Remove the persisted stats file so it's not included in 549 | * backups/replication slaves, etc. A new file will be written on next 550 | * shutdown. 551 | * 552 | * Note: it's okay if the PGSRT_TEXT_FILE is included in a basebackup, 553 | * because we remove that file on startup; it acts inversely to 554 | * PGSRT_DUMP_FILE, in that it is only supposed to be around when the 555 | * server is running, whereas PGSRT_DUMP_FILE is only supposed to be around 556 | * when the server is not running. Leaving the file creates no danger of 557 | * a newly restored database having a spurious record of execution costs, 558 | * which is what we're really concerned about here. 
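 * (The keys-text file only ever contains the deparsed sort-keys strings,
 * never any counters.)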
559 | */ 560 | unlink(PGSRT_DUMP_FILE); 561 | 562 | return; 563 | 564 | read_error: 565 | ereport(LOG, 566 | (errcode_for_file_access(), 567 | errmsg("could not read file \"%s\": %m", 568 | PGSRT_DUMP_FILE))); 569 | goto fail; 570 | data_error: 571 | ereport(LOG, 572 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 573 | errmsg("ignoring invalid data in file \"%s\"", 574 | PGSRT_DUMP_FILE))); 575 | goto fail; 576 | write_error: 577 | ereport(LOG, 578 | (errcode_for_file_access(), 579 | errmsg("could not write file \"%s\": %m", 580 | PGSRT_TEXT_FILE))); 581 | fail: 582 | if (buffer) 583 | pfree(buffer); 584 | if (file) 585 | FreeFile(file); 586 | if (kfile) 587 | FreeFile(kfile); 588 | /* If possible, throw away the bogus file; ignore any error */ 589 | unlink(PGSRT_DUMP_FILE); 590 | 591 | /* 592 | * Don't unlink PGSRT_TEXT_FILE here; it should always be around while the 593 | * server is running with pg_sortstats enabled 594 | */ 595 | } 596 | 597 | /* Save the statistics into a file at shutdown */ 598 | static void 599 | pgsrt_shmem_shutdown(int code, Datum arg) 600 | { 601 | FILE *file; 602 | char *kbuffer = NULL; 603 | Size kbuffer_size = 0; 604 | HASH_SEQ_STATUS hash_seq; 605 | int32 num_entries; 606 | pgsrtEntry *entry; 607 | 608 | /* Don't try to dump during a crash. */ 609 | if (code) 610 | return; 611 | 612 | /* Safety check ... shouldn't get here unless shmem is set up. */ 613 | if (!pgsrt || !pgsrt_hash) 614 | return; 615 | 616 | /* Don't dump if told not to. */ 617 | if (!pgsrt_save) 618 | return; 619 | 620 | file = AllocateFile(PGSRT_DUMP_FILE ".tmp", PG_BINARY_W); 621 | if (file == NULL) 622 | goto error; 623 | 624 | if (fwrite(&PGSRT_FILE_HEADER, sizeof(uint32), 1, file) != 1) 625 | goto error; 626 | if (fwrite(&PGSRT_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1) 627 | goto error; 628 | num_entries = hash_get_num_entries(pgsrt_hash); 629 | if (fwrite(&num_entries, sizeof(int32), 1, file) != 1) 630 | goto error; 631 | 632 | kbuffer = ktext_load_file(&kbuffer_size); 633 | if (kbuffer == NULL) 634 | goto error; 635 | 636 | /* 637 | * When serializing to disk, we store keys texts immediately after their 638 | * entry data. Any orphaned keys texts are thereby excluded. 639 | */ 640 | hash_seq_init(&hash_seq, pgsrt_hash); 641 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 642 | { 643 | int len = entry->keys_len; 644 | char *kstr = ktext_fetch(entry->keys_offset, len, 645 | kbuffer, kbuffer_size); 646 | 647 | if (kstr == NULL) 648 | continue; /* Ignore any entries with bogus texts */ 649 | 650 | if (fwrite(entry, sizeof(pgsrtEntry), 1, file) != 1 || 651 | fwrite(kstr, 1, len + 1, file) != len + 1) 652 | { 653 | /* note: we assume hash_seq_term won't change errno */ 654 | hash_seq_term(&hash_seq); 655 | goto error; 656 | } 657 | } 658 | 659 | free(kbuffer); 660 | kbuffer = NULL; 661 | 662 | if (FreeFile(file)) 663 | { 664 | file = NULL; 665 | goto error; 666 | } 667 | 668 | /* 669 | * Rename file into place, so we atomically replace any old one. 
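 * durable_rename() also fsyncs the file and its containing directory, so a
 * crash during shutdown can't leave a torn stats file behind.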
670 | */ 671 | (void) durable_rename(PGSRT_DUMP_FILE ".tmp", PGSRT_DUMP_FILE, LOG); 672 | 673 | /* Unlink keys-texts file; it's not needed while shutdown */ 674 | unlink(PGSRT_TEXT_FILE); 675 | 676 | return; 677 | 678 | error: 679 | ereport(LOG, 680 | (errcode_for_file_access(), 681 | errmsg("could not write file \"%s\": %m", 682 | PGSRT_DUMP_FILE ".tmp"))); 683 | if (kbuffer) 684 | free(kbuffer); 685 | if (file) 686 | FreeFile(file); 687 | unlink(PGSRT_DUMP_FILE ".tmp"); 688 | unlink(PGSRT_TEXT_FILE); 689 | } 690 | 691 | /* 692 | * Save this query's queryId if it's not a parallel worker 693 | */ 694 | static void 695 | pgsrt_ExecutorStart(QueryDesc *queryDesc, int eflags) 696 | { 697 | #if PG_VERSION_NUM >= 90600 698 | if (pgsrt_enabled && !IsParallelWorker()) 699 | pgsrt_set_queryid(queryDesc->plannedstmt->queryId); 700 | #endif 701 | 702 | if (prev_ExecutorStart) 703 | prev_ExecutorStart(queryDesc, eflags); 704 | else 705 | standard_ExecutorStart(queryDesc, eflags); 706 | 707 | } 708 | 709 | /* 710 | * ExecutorRun hook: all we need do is track nesting depth 711 | */ 712 | static void 713 | pgsrt_ExecutorRun(QueryDesc *queryDesc, 714 | ScanDirection direction, 715 | #if PG_VERSION_NUM >= 90600 716 | uint64 count 717 | #else 718 | long count 719 | #endif 720 | #if PG_VERSION_NUM >= 100000 721 | ,bool execute_once 722 | #endif 723 | ) 724 | { 725 | nesting_level++; 726 | PG_TRY(); 727 | { 728 | if (prev_ExecutorRun) 729 | #if PG_VERSION_NUM >= 100000 730 | prev_ExecutorRun(queryDesc, direction, count, execute_once); 731 | #else 732 | prev_ExecutorRun(queryDesc, direction, count); 733 | #endif 734 | else 735 | #if PG_VERSION_NUM >= 100000 736 | standard_ExecutorRun(queryDesc, direction, count, execute_once); 737 | #else 738 | standard_ExecutorRun(queryDesc, direction, count); 739 | #endif 740 | nesting_level--; 741 | } 742 | PG_CATCH(); 743 | { 744 | nesting_level--; 745 | PG_RE_THROW(); 746 | } 747 | PG_END_TRY(); 748 | } 749 | 750 | /* 751 | * ExecutorFinish hook: all we need do is track nesting depth 752 | */ 753 | static void 754 | pgsrt_ExecutorFinish(QueryDesc *queryDesc) 755 | { 756 | nesting_level++; 757 | PG_TRY(); 758 | { 759 | if (prev_ExecutorFinish) 760 | prev_ExecutorFinish(queryDesc); 761 | else 762 | standard_ExecutorFinish(queryDesc); 763 | nesting_level--; 764 | } 765 | PG_CATCH(); 766 | { 767 | nesting_level--; 768 | PG_RE_THROW(); 769 | } 770 | PG_END_TRY(); 771 | } 772 | 773 | /* 774 | * Walk the planstates, find any sorts and gather their statistics. 
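 * This runs in the ExecutorEnd hook: execution is complete, so the counters
 * are final, but the planstate tree (including each Sort node's tuplesort
 * state) is still alive.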
*/ 776 | static void 777 | pgsrt_ExecutorEnd(QueryDesc *queryDesc) 778 | { 779 | /* retrieve sort information; the main work starts from here */ 780 | if (pgsrt_enabled) 781 | { 782 | pgsrtWalkerContext context; 783 | 784 | context.queryDesc = queryDesc; 785 | context.ancestors = NIL; 786 | 787 | pgsrt_planstate_walker(queryDesc->planstate, &context); 788 | 789 | #if PG_VERSION_NUM >= 90600 790 | /* Remove the saved queryid for safety */ 791 | if (!IsParallelWorker()) 792 | pgsrt_set_queryid(0); 793 | #endif 794 | } 795 | 796 | if (prev_ExecutorEnd) 797 | prev_ExecutorEnd(queryDesc); 798 | else 799 | standard_ExecutorEnd(queryDesc); 800 | } 801 | 802 | static Size 803 | pgsrt_memsize(void) 804 | { 805 | Size size; 806 | 807 | size = MAXALIGN(sizeof(pgsrtSharedState)); 808 | size = add_size(size, hash_estimate_size(pgsrt_max, sizeof(pgsrtEntry))); 809 | #if PG_VERSION_NUM >= 90600 810 | size = add_size(size, pgsrt_queryids_size()); 811 | #endif 812 | 813 | return size; 814 | } 815 | 816 | #if PG_VERSION_NUM >= 90600 817 | /* Parallel workers won't have their queryid set up. We store the leader 818 | * process' queryid in shared memory so that workers can find which queryid 819 | * they're actually executing. 820 | */ 821 | static Size 822 | pgsrt_queryids_size(void) 823 | { 824 | #if PG_VERSION_NUM >= 150000 825 | Assert(MaxBackends > 0); 826 | /* We need an extra slot since BackendId numbering starts at 1. */ 827 | #define PGSRT_NB_BACKEND_SLOT (MaxBackends + 1) 828 | #elif PG_VERSION_NUM >= 120000 829 | /* We need room for all possible backends, plus the autovacuum launcher 830 | * and workers, plus the background workers, and an extra one since 831 | * BackendId numbering starts at 1. 832 | * Starting with pg12, wal senders aren't part of MaxConnections anymore, 833 | * so they need to be accounted for. 834 | */ 835 | #define PGSRT_NB_BACKEND_SLOT (MaxConnections \ 836 | + autovacuum_max_workers + 1 \ 837 | + max_worker_processes \ 838 | + max_wal_senders + 1) 839 | #else 840 | #define PGSRT_NB_BACKEND_SLOT (MaxConnections \ 841 | + autovacuum_max_workers + 1 \ 842 | + max_worker_processes + 1) 843 | #endif 844 | 845 | return MAXALIGN(sizeof(pgsrt_queryid) * PGSRT_NB_BACKEND_SLOT); 846 | } 847 | 848 | static pgsrt_queryid 849 | pgsrt_get_queryid(void) 850 | { 851 | pgsrt_queryid queryId; 852 | 853 | Assert(IsParallelWorker()); 854 | Assert(MyBackendId <= PGSRT_NB_BACKEND_SLOT); 855 | 856 | LWLockAcquire(pgsrt->queryids_lock, LW_SHARED); 857 | queryId = pgsrt->queryids[ParallelLeaderBackendId]; 858 | LWLockRelease(pgsrt->queryids_lock); 859 | 860 | return queryId; 861 | } 862 | 863 | static void 864 | pgsrt_set_queryid(pgsrt_queryid queryId) 865 | { 866 | Assert(!IsParallelWorker()); 867 | Assert(MyBackendId <= PGSRT_NB_BACKEND_SLOT); 868 | 869 | LWLockAcquire(pgsrt->queryids_lock, LW_EXCLUSIVE); 870 | pgsrt->queryids[MyBackendId] = queryId; 871 | LWLockRelease(pgsrt->queryids_lock); 872 | } 873 | #endif 874 | 875 | /* 876 | * Allocate a new hashtable entry.
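 * (making room by discarding the least-used entries when the table is full)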
877 | * The caller must hold an exclusive lock on pgsrt->lock. 878 | */ 879 | static pgsrtEntry * 880 | pgsrt_entry_alloc(pgsrtHashKey *key, Size keys_offset, int keys_len, 881 | int encoding, int nbkeys) 882 | { 883 | pgsrtEntry *entry; 884 | bool found; 885 | 886 | /* Make space if needed */ 887 | while (hash_get_num_entries(pgsrt_hash) >= pgsrt_max) 888 | pgsrt_entry_dealloc(); 889 | 890 | /* Find or create an entry with desired hash code */ 891 | entry = (pgsrtEntry *) hash_search(pgsrt_hash, key, HASH_ENTER, &found); 892 | 893 | if (!found) 894 | { 895 | /* New entry, initialize it */ 896 | 897 | /* reset the statistics */ 898 | memset(&entry->counters, 0, sizeof(pgsrtCounters)); 899 | /* set the appropriate initial usage count */ 900 | entry->counters.usage = USAGE_INIT; 901 | /* re-initialize the mutex each time ... we assume no one is using it */ 902 | SpinLockInit(&entry->mutex); 903 | /* set non counters fields */ 904 | Assert(keys_len >= 0); 905 | entry->nbkeys = nbkeys; 906 | entry->keys_offset = keys_offset; 907 | entry->keys_len = keys_len; 908 | entry->encoding = encoding; 909 | } 910 | 911 | return entry; 912 | } 913 | 914 | /* 915 | * qsort comparator for sorting into increasing usage order 916 | */ 917 | static int 918 | entry_cmp(const void *lhs, const void *rhs) 919 | { 920 | double l_usage = (*(pgsrtEntry *const *) lhs)->counters.usage; 921 | double r_usage = (*(pgsrtEntry *const *) rhs)->counters.usage; 922 | 923 | if (l_usage < r_usage) 924 | return -1; 925 | else if (l_usage > r_usage) 926 | return +1; 927 | else 928 | return 0; 929 | } 930 | 931 | /* 932 | * Deallocate least used entries. 933 | * 934 | * Caller must hold an exclusive lock on pgsrt->lock. 935 | */ 936 | static void 937 | pgsrt_entry_dealloc(void) 938 | { 939 | HASH_SEQ_STATUS hash_seq; 940 | pgsrtEntry **entries; 941 | pgsrtEntry *entry; 942 | int nvictims; 943 | int i; 944 | Size tottextlen; 945 | int nvalidtexts; 946 | 947 | /* 948 | * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. 949 | * While we're scanning the table, apply the decay factor to the usage 950 | * values. 951 | * 952 | * Note that the mean keys length is almost immediately obsolete, since 953 | * we compute it before, not after, discarding the least-used entries. 954 | * Hopefully, that doesn't affect the mean too much; it doesn't seem worth 955 | * making two passes to get a more current result. Likewise, the new 956 | * cur_median_usage includes the entries we're about to zap. 957 | */ 958 | entries = palloc(hash_get_num_entries(pgsrt_hash) * sizeof(pgsrtEntry *)); 959 | 960 | i = 0; 961 | tottextlen = 0; 962 | nvalidtexts = 0; 963 | 964 | hash_seq_init(&hash_seq, pgsrt_hash); 965 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 966 | { 967 | entries[i++] = entry; 968 | entry->counters.usage *= USAGE_DECREASE_FACTOR; 969 | /* In the mean length computation, ignore dropped texts.
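 * For example (illustrative numbers): three valid texts with keys_len of
 * 10, 20 and 30 yield tottextlen = 63, as each text is counted with its
 * trailing NUL, so mean_keys_len becomes 63 / 3 = 21, which
 * need_gc_ktexts() later compares against the file extent.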
*/ 970 | if (entry->keys_len >= 0) 971 | { 972 | tottextlen += entry->keys_len + 1; 973 | nvalidtexts++; 974 | } 975 | } 976 | 977 | /* Sort into increasing order by usage */ 978 | qsort(entries, i, sizeof(pgsrtEntry *), entry_cmp); 979 | 980 | /* Record the (approximate) median usage */ 981 | if (i > 0) 982 | pgsrt->cur_median_usage = entries[i / 2]->counters.usage; 983 | /* Record the mean query length */ 984 | if (nvalidtexts > 0) 985 | pgsrt->mean_keys_len = tottextlen / nvalidtexts; 986 | else 987 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 988 | 989 | /* Now zap an appropriate fraction of lowest-usage entries */ 990 | nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100); 991 | nvictims = Min(nvictims, i); 992 | 993 | for (i = 0; i < nvictims; i++) 994 | { 995 | hash_search(pgsrt_hash, &entries[i]->key, HASH_REMOVE, NULL); 996 | } 997 | 998 | pfree(entries); 999 | } 1000 | 1001 | /* Remove all saved entries in shmem */ 1002 | static void 1003 | pgsrt_entry_reset(void) 1004 | { 1005 | HASH_SEQ_STATUS hash_seq; 1006 | pgsrtEntry *entry; 1007 | 1008 | LWLockAcquire(pgsrt->lock, LW_EXCLUSIVE); 1009 | 1010 | hash_seq_init(&hash_seq, pgsrt_hash); 1011 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 1012 | { 1013 | hash_search(pgsrt_hash, &entry->key, HASH_REMOVE, NULL); 1014 | } 1015 | 1016 | LWLockRelease(pgsrt->lock); 1017 | } 1018 | 1019 | 1020 | /* 1021 | * Store some statistics for a sort. 1022 | */ 1023 | static void 1024 | pgsrt_store(pgsrt_queryid queryId, int nbkeys, char *keys, 1025 | pgsrtCounters *counters) 1026 | { 1027 | volatile pgsrtEntry *e; 1028 | pgsrtHashKey key; 1029 | pgsrtEntry *entry; 1030 | 1031 | Assert(keys != NULL); 1032 | 1033 | /* Safety check... */ 1034 | if (!pgsrt || !pgsrt_hash) 1035 | return; 1036 | 1037 | /* Set up key for hashtable search */ 1038 | key.userid = GetUserId(); 1039 | key.dbid = MyDatabaseId; 1040 | key.queryid = queryId; 1041 | key.sortid = (uint32) hash_any((unsigned char *) keys, 1042 | strlen(keys)); 1043 | 1044 | /* Lookup the hash table entry with shared lock. */ 1045 | LWLockAcquire(pgsrt->lock, LW_SHARED); 1046 | 1047 | entry = (pgsrtEntry *) hash_search(pgsrt_hash, &key, HASH_FIND, NULL); 1048 | 1049 | /* Create new entry, if not present */ 1050 | if (!entry) 1051 | { 1052 | Size keys_offset; 1053 | int keys_len = strlen(keys); 1054 | int gc_count; 1055 | bool stored; 1056 | bool do_gc; 1057 | 1058 | /* Append new keys text to file with only shared lock held */ 1059 | stored = ktext_store(keys, keys_len, &keys_offset, &gc_count); 1060 | 1061 | /* 1062 | * Determine whether we need to garbage collect external keys texts 1063 | * while the shared lock is still held. This micro-optimization 1064 | * avoids taking the time to decide this while holding exclusive lock. 1065 | */ 1066 | do_gc = need_gc_ktexts(); 1067 | 1068 | /* Need exclusive lock to make a new hashtable entry - promote */ 1069 | LWLockRelease(pgsrt->lock); 1070 | LWLockAcquire(pgsrt->lock, LW_EXCLUSIVE); 1071 | 1072 | /* 1073 | * A garbage collection may have occurred while we weren't holding the 1074 | * lock. In the unlikely event that this happens, the keys text we 1075 | * stored above will have been garbage collected, so write it again. 1076 | * This should be infrequent enough that doing it while holding 1077 | * exclusive lock isn't a performance problem. 
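 *
 * To summarize the locking dance (a sketch of the surrounding code, not
 * additional logic):
 *
 *     LWLockAcquire(pgsrt->lock, LW_SHARED);
 *     stored = ktext_store(keys, keys_len, &keys_offset, &gc_count);
 *     do_gc = need_gc_ktexts();
 *     LWLockRelease(pgsrt->lock);                      -- promote the lock
 *     LWLockAcquire(pgsrt->lock, LW_EXCLUSIVE);
 *     if (!stored || pgsrt->gc_count != gc_count)      -- GC ran in between?
 *         stored = ktext_store(keys, keys_len, &keys_offset, NULL);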
1078 | */ 1079 | if (!stored || pgsrt->gc_count != gc_count) 1080 | stored = ktext_store(keys, keys_len, &keys_offset, NULL); 1081 | 1082 | /* If we failed to write to the text file, give up */ 1083 | if (!stored) 1084 | goto done; 1085 | 1086 | /* OK to create a new hashtable entry */ 1087 | entry = pgsrt_entry_alloc(&key, keys_offset, keys_len, 1088 | GetDatabaseEncoding(), nbkeys); 1089 | 1090 | /* If needed, perform garbage collection while exclusive lock held */ 1091 | if (do_gc) 1092 | gc_ktexts(); 1093 | } 1094 | 1095 | /* 1096 | * Grab the spinlock while updating the counters */ 1097 | e = (volatile pgsrtEntry *) entry; 1098 | 1099 | SpinLockAcquire(&e->mutex); 1100 | 1101 | e->counters.usage += 1; 1102 | e->counters.lines += counters->lines; 1103 | e->counters.lines_to_sort += counters->lines_to_sort; 1104 | e->counters.work_mems += counters->work_mems; 1105 | e->counters.topn_sorts += counters->topn_sorts; 1106 | e->counters.quicksorts += counters->quicksorts; 1107 | e->counters.external_sorts += counters->external_sorts; 1108 | e->counters.external_merges += counters->external_merges; 1109 | e->counters.nbtapes += counters->nbtapes; 1110 | e->counters.space_disk += counters->space_disk; 1111 | e->counters.space_memory += counters->space_memory; 1112 | e->counters.non_parallels += counters->non_parallels; 1113 | e->counters.nb_workers += counters->nb_workers; 1114 | 1115 | SpinLockRelease(&e->mutex); 1116 | 1117 | done: 1118 | LWLockRelease(pgsrt->lock); 1119 | } 1120 | 1121 | /* Compute hash value for a pgsrtHashKey. sortid is already hashed */ 1122 | static uint32 1123 | pgsrt_hash_fn(const void *key, Size keysize) 1124 | { 1125 | const pgsrtHashKey *k = (const pgsrtHashKey *) key; 1126 | 1127 | return hash_uint32((uint32) k->userid) ^ 1128 | hash_uint32((uint32) k->dbid) ^ 1129 | hash_uint32((uint32) k->queryid) ^ 1130 | k->sortid; 1131 | } 1132 | 1133 | /* Compare two pgsrtHashKey keys. Zero means match */ 1134 | static int 1135 | pgsrt_match_fn(const void *key1, const void *key2, Size keysize) 1136 | { 1137 | const pgsrtHashKey *k1 = (const pgsrtHashKey *) key1; 1138 | const pgsrtHashKey *k2 = (const pgsrtHashKey *) key2; 1139 | 1140 | if (k1->userid == k2->userid && 1141 | k1->dbid == k2->dbid && 1142 | k1->queryid == k2->queryid && 1143 | k1->sortid == k2->sortid) 1144 | return 0; 1145 | else 1146 | return 1; 1147 | } 1148 | 1149 | /* 1150 | * Given a keys string (not necessarily null-terminated), allocate a new 1151 | * entry in the external keys text file and store the string there. 1152 | * 1153 | * If successful, returns true, and stores the new entry's offset in the file 1154 | * into *keys_offset. Also, if gc_count isn't NULL, *gc_count is set to the 1155 | * number of garbage collections that have occurred so far. 1156 | * 1157 | * On failure, returns false. 1158 | * 1159 | * At least a shared lock on pgsrt->lock must be held by the caller, so as 1160 | * to prevent a concurrent garbage collection. Share-lock-holding callers 1161 | * should pass a gc_count pointer to obtain the number of garbage collections, 1162 | * so that they can recheck the count after obtaining exclusive lock to 1163 | * detect whether a garbage collection occurred (and removed this entry). 
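 *
 * Note that the spinlock-guarded extent reservation gives concurrent
 * writers disjoint file ranges.  For example (illustrative numbers), two
 * backends storing 10-byte and 20-byte keys texts starting from extent 0
 * are handed offsets 0 and 11 respectively, since each reservation is
 * keys_len + 1 bytes to account for the trailing NUL.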
1164 | */ 1165 | static bool 1166 | ktext_store(const char *keys, int keys_len, 1167 | Size *keys_offset, int *gc_count) 1168 | { 1169 | Size off; 1170 | int fd; 1171 | 1172 | /* 1173 | * We use a spinlock to protect extent/n_writers/gc_count, so that 1174 | * multiple processes may execute this function concurrently. 1175 | */ 1176 | { 1177 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1178 | 1179 | SpinLockAcquire(&s->mutex); 1180 | off = s->extent; 1181 | s->extent += keys_len + 1; 1182 | s->n_writers++; 1183 | if (gc_count) 1184 | *gc_count = s->gc_count; 1185 | SpinLockRelease(&s->mutex); 1186 | } 1187 | 1188 | *keys_offset = off; 1189 | 1190 | /* Now write the data into the successfully-reserved part of the file */ 1191 | #if PG_VERSION_NUM < 110000 1192 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY, 1193 | S_IRUSR | S_IWUSR); 1194 | #else 1195 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY); 1196 | #endif 1197 | if (fd < 0) 1198 | goto error; 1199 | 1200 | if (lseek(fd, off, SEEK_SET) != off) 1201 | goto error; 1202 | 1203 | if (write(fd, keys, keys_len) != keys_len) 1204 | goto error; 1205 | if (write(fd, "\0", 1) != 1) 1206 | goto error; 1207 | 1208 | CloseTransientFile(fd); 1209 | 1210 | /* Mark our write complete */ 1211 | { 1212 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1213 | 1214 | SpinLockAcquire(&s->mutex); 1215 | s->n_writers--; 1216 | SpinLockRelease(&s->mutex); 1217 | } 1218 | 1219 | return true; 1220 | 1221 | error: 1222 | ereport(LOG, 1223 | (errcode_for_file_access(), 1224 | errmsg("could not write file \"%s\": %m", 1225 | PGSRT_TEXT_FILE))); 1226 | 1227 | if (fd >= 0) 1228 | CloseTransientFile(fd); 1229 | 1230 | /* Mark our write complete */ 1231 | { 1232 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1233 | 1234 | SpinLockAcquire(&s->mutex); 1235 | s->n_writers--; 1236 | SpinLockRelease(&s->mutex); 1237 | } 1238 | 1239 | return false; 1240 | } 1241 | 1242 | /* 1243 | * Read the external keys text file into a malloc'd buffer. 1244 | * 1245 | * Returns NULL (without throwing an error) if unable to read, eg 1246 | * file not there or insufficient memory. 1247 | * 1248 | * On success, the buffer size is also returned into *buffer_size. 1249 | * 1250 | * This can be called without any lock on pgsrt->lock, but in that case 1251 | * the caller is responsible for verifying that the result is sane. 
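 *
 * Typical usage (a sketch mirroring pg_sortstats() below):
 *
 *     Size  kbuffer_size;
 *     char *kbuffer = ktext_load_file(&kbuffer_size);
 *
 *     if (kbuffer != NULL)
 *     {
 *         char *kstr = ktext_fetch(keys_offset, keys_len,
 *                                  kbuffer, kbuffer_size);
 *
 *         ... kstr is NULL if the offset/length don't validate ...
 *         free(kbuffer);
 *     }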
1252 | */ 1253 | static char * 1254 | ktext_load_file(Size *buffer_size) 1255 | { 1256 | char *buf; 1257 | int fd; 1258 | struct stat stat; 1259 | 1260 | #if PG_VERSION_NUM < 110000 1261 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDONLY | PG_BINARY, 0); 1262 | #else 1263 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDONLY | PG_BINARY); 1264 | #endif 1265 | if (fd < 0) 1266 | { 1267 | if (errno != ENOENT) 1268 | ereport(LOG, 1269 | (errcode_for_file_access(), 1270 | errmsg("could not read file \"%s\": %m", 1271 | PGSRT_TEXT_FILE))); 1272 | return NULL; 1273 | } 1274 | 1275 | /* Get file length */ 1276 | if (fstat(fd, &stat)) 1277 | { 1278 | ereport(LOG, 1279 | (errcode_for_file_access(), 1280 | errmsg("could not stat file \"%s\": %m", 1281 | PGSRT_TEXT_FILE))); 1282 | CloseTransientFile(fd); 1283 | return NULL; 1284 | } 1285 | 1286 | /* Allocate buffer; beware that off_t might be wider than size_t */ 1287 | if (stat.st_size <= MaxAllocHugeSize) 1288 | buf = (char *) malloc(stat.st_size); 1289 | else 1290 | buf = NULL; 1291 | if (buf == NULL) 1292 | { 1293 | ereport(LOG, 1294 | (errcode(ERRCODE_OUT_OF_MEMORY), 1295 | errmsg("out of memory"), 1296 | errdetail("Could not allocate enough memory to read file \"%s\".", 1297 | PGSRT_TEXT_FILE))); 1298 | CloseTransientFile(fd); 1299 | return NULL; 1300 | } 1301 | 1302 | /* 1303 | * OK, slurp in the file. If we get a short read and errno doesn't get 1304 | * set, the reason is probably that garbage collection truncated the file 1305 | * since we did the fstat(), so we don't log a complaint --- but we don't 1306 | * return the data, either, since it's most likely corrupt due to 1307 | * concurrent writes from garbage collection. 1308 | */ 1309 | errno = 0; 1310 | if (read(fd, buf, stat.st_size) != stat.st_size) 1311 | { 1312 | if (errno) 1313 | ereport(LOG, 1314 | (errcode_for_file_access(), 1315 | errmsg("could not read file \"%s\": %m", 1316 | PGSRT_TEXT_FILE))); 1317 | free(buf); 1318 | CloseTransientFile(fd); 1319 | return NULL; 1320 | } 1321 | 1322 | CloseTransientFile(fd); 1323 | 1324 | *buffer_size = stat.st_size; 1325 | return buf; 1326 | } 1327 | 1328 | /* 1329 | * Locate a keys text in the file image previously read by ktext_load_file(). 1330 | * 1331 | * We validate the given offset/length, and return NULL if bogus. Otherwise, 1332 | * the result points to a null-terminated string within the buffer. 1333 | */ 1334 | static char * 1335 | ktext_fetch(Size keys_offset, int keys_len, 1336 | char *buffer, Size buffer_size) 1337 | { 1338 | /* File read failed? */ 1339 | if (buffer == NULL) 1340 | return NULL; 1341 | /* Bogus offset/length? */ 1342 | if (keys_len < 0 || 1343 | keys_offset + keys_len >= buffer_size) 1344 | return NULL; 1345 | /* As a further sanity check, make sure there's a trailing null */ 1346 | if (buffer[keys_offset + keys_len] != '\0') 1347 | return NULL; 1348 | /* Looks OK */ 1349 | return buffer + keys_offset; 1350 | } 1351 | 1352 | /* 1353 | * Do we need to garbage-collect the external keys text file? 1354 | * 1355 | * Caller should hold at least a shared lock on pgsrt->lock. 
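 *
 * As a worked example (illustrative numbers): with pgsrt_max = 1000 and
 * mean_keys_len = 64, both tests below only pass once the file extent
 * reaches 100 * 1000 = 100000 bytes and 2 * 64 * 1000 = 128000 bytes, so
 * garbage collection starts being considered at 128000 bytes here.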
1356 | */ 1357 | static bool 1358 | need_gc_ktexts(void) 1359 | { 1360 | Size extent; 1361 | 1362 | /* Read shared extent pointer */ 1363 | { 1364 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1365 | 1366 | SpinLockAcquire(&s->mutex); 1367 | extent = s->extent; 1368 | SpinLockRelease(&s->mutex); 1369 | } 1370 | 1371 | /* Don't proceed if file does not exceed 100 bytes per possible entry */ 1372 | if (extent < 100 * pgsrt_max) 1373 | return false; 1374 | 1375 | /* 1376 | * Don't proceed if file is less than about 50% bloat. Nothing can or 1377 | * should be done in the event of unusually large keys texts accounting 1378 | * for file's large size. We go to the trouble of maintaining the mean 1379 | * keys length in order to prevent garbage collection from thrashing 1380 | * uselessly. 1381 | */ 1382 | if (extent < pgsrt->mean_keys_len * pgsrt_max * 2) 1383 | return false; 1384 | 1385 | return true; 1386 | } 1387 | 1388 | /* 1389 | * Garbage-collect orphaned keys texts in external file. 1390 | * 1391 | * This won't be called often in the typical case, since it's likely that 1392 | * there won't be too much churn, and besides, a similar compaction process 1393 | * occurs when serializing to disk at shutdown or as part of resetting. 1394 | * Despite this, it seems prudent to plan for the edge case where the file 1395 | * becomes unreasonably large, with no other method of compaction likely to 1396 | * occur in the foreseeable future. 1397 | * 1398 | * The caller must hold an exclusive lock on pgsrt->lock. 1399 | * 1400 | * At the first sign of trouble we unlink the keys text file to get a clean 1401 | * slate (although existing statistics are retained), rather than risk 1402 | * thrashing by allowing the same problem case to recur indefinitely. 1403 | */ 1404 | static void 1405 | gc_ktexts(void) 1406 | { 1407 | char *kbuffer; 1408 | Size kbuffer_size; 1409 | FILE *kfile = NULL; 1410 | HASH_SEQ_STATUS hash_seq; 1411 | pgsrtEntry *entry; 1412 | Size extent; 1413 | int nentries; 1414 | 1415 | /* 1416 | * When called from pgsrt_store, some other session might have proceeded 1417 | * with garbage collection in the no-lock-held interim of lock strength 1418 | * escalation. Check once more that this is actually necessary. 1419 | */ 1420 | if (!need_gc_ktexts()) 1421 | return; 1422 | 1423 | /* 1424 | * Load the old texts file. If we fail (out of memory, for instance), 1425 | * invalidate keys texts. Hopefully this is rare. It might seem better 1426 | * to leave things alone on an OOM failure, but the problem is that the 1427 | * file is only going to get bigger; hoping for a future non-OOM result is 1428 | * risky and can easily lead to complete denial of service. 1429 | */ 1430 | kbuffer = ktext_load_file(&kbuffer_size); 1431 | if (kbuffer == NULL) 1432 | goto gc_fail; 1433 | 1434 | /* 1435 | * We overwrite the keys texts file in place, so as to reduce the risk of 1436 | * an out-of-disk-space failure. Since the file is guaranteed not to get 1437 | * larger, this should always work on traditional filesystems; though we 1438 | * could still lose on copy-on-write filesystems. 
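 *
 * In short, the compaction loop below does the following (a sketch of the
 * code that follows, not additional logic):
 *
 *     extent = 0;
 *     for each hashtable entry:
 *         copy its keys text from the in-memory image into the file;
 *         entry->keys_offset = extent;
 *         extent += entry->keys_len + 1;
 *     ftruncate(file, extent);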
1439 | */ 1440 | kfile = AllocateFile(PGSRT_TEXT_FILE, PG_BINARY_W); 1441 | if (kfile == NULL) 1442 | { 1443 | ereport(LOG, 1444 | (errcode_for_file_access(), 1445 | errmsg("could not write file \"%s\": %m", 1446 | PGSRT_TEXT_FILE))); 1447 | goto gc_fail; 1448 | } 1449 | 1450 | extent = 0; 1451 | nentries = 0; 1452 | 1453 | hash_seq_init(&hash_seq, pgsrt_hash); 1454 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 1455 | { 1456 | int keys_len = entry->keys_len; 1457 | char *qry = ktext_fetch(entry->keys_offset, 1458 | keys_len, 1459 | kbuffer, 1460 | kbuffer_size); 1461 | 1462 | if (qry == NULL) 1463 | { 1464 | /* Trouble ... drop the text */ 1465 | entry->keys_offset = 0; 1466 | entry->keys_len = -1; 1467 | /* entry will not be counted in mean keys length computation */ 1468 | continue; 1469 | } 1470 | 1471 | if (fwrite(qry, 1, keys_len + 1, kfile) != keys_len + 1) 1472 | { 1473 | ereport(LOG, 1474 | (errcode_for_file_access(), 1475 | errmsg("could not write file \"%s\": %m", 1476 | PGSRT_TEXT_FILE))); 1477 | hash_seq_term(&hash_seq); 1478 | goto gc_fail; 1479 | } 1480 | 1481 | entry->keys_offset = extent; 1482 | extent += keys_len + 1; 1483 | nentries++; 1484 | } 1485 | 1486 | /* 1487 | * Truncate away any now-unused space. If this fails for some odd reason, 1488 | * we log it, but there's no need to fail. 1489 | */ 1490 | if (ftruncate(fileno(kfile), extent) != 0) 1491 | ereport(LOG, 1492 | (errcode_for_file_access(), 1493 | errmsg("could not truncate file \"%s\": %m", 1494 | PGSRT_TEXT_FILE))); 1495 | 1496 | if (FreeFile(kfile)) 1497 | { 1498 | ereport(LOG, 1499 | (errcode_for_file_access(), 1500 | errmsg("could not write file \"%s\": %m", 1501 | PGSRT_TEXT_FILE))); 1502 | kfile = NULL; 1503 | goto gc_fail; 1504 | } 1505 | 1506 | elog(DEBUG1, "pgsrt gc of keys file shrunk size from %zu to %zu", 1507 | pgsrt->extent, extent); 1508 | 1509 | /* Reset the shared extent pointer */ 1510 | pgsrt->extent = extent; 1511 | 1512 | /* 1513 | * Also update the mean keys length, to be sure that need_gc_ktexts() 1514 | * won't still think we have a problem. 1515 | */ 1516 | if (nentries > 0) 1517 | pgsrt->mean_keys_len = extent / nentries; 1518 | else 1519 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 1520 | 1521 | free(kbuffer); 1522 | 1523 | /* 1524 | * OK, count a garbage collection cycle. (Note: even though we have 1525 | * exclusive lock on pgsrt->lock, we must take pgsrt->mutex for this, since 1526 | * other processes may examine gc_count while holding only the mutex. 1527 | * Also, we have to advance the count *after* we've rewritten the file, 1528 | * else other processes might not realize they read a stale file.) 1529 | */ 1530 | record_gc_ktexts(); 1531 | 1532 | return; 1533 | 1534 | gc_fail: 1535 | /* clean up resources */ 1536 | if (kfile) 1537 | FreeFile(kfile); 1538 | if (kbuffer) 1539 | free(kbuffer); 1540 | 1541 | /* 1542 | * Since the contents of the external file are now uncertain, mark all 1543 | * hashtable entries as having invalid texts. 
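 * keys_len = -1 is the "dropped text" marker: ktext_fetch() rejects any
 * negative length, so pg_sortstats() will return a NULL sort_keys value
 * for such entries rather than bogus text.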
1544 | */ 1545 | hash_seq_init(&hash_seq, pgsrt_hash); 1546 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 1547 | { 1548 | entry->keys_offset = 0; 1549 | entry->keys_len = -1; 1550 | } 1551 | 1552 | /* 1553 | * Destroy the keys text file and create a new, empty one 1554 | */ 1555 | (void) unlink(PGSRT_TEXT_FILE); 1556 | kfile = AllocateFile(PGSRT_TEXT_FILE, PG_BINARY_W); 1557 | if (kfile == NULL) 1558 | ereport(LOG, 1559 | (errcode_for_file_access(), 1560 | errmsg("could not recreate file \"%s\": %m", 1561 | PGSRT_TEXT_FILE))); 1562 | else 1563 | FreeFile(kfile); 1564 | 1565 | /* Reset the shared extent pointer */ 1566 | pgsrt->extent = 0; 1567 | 1568 | /* Reset mean_keys_len to match the new state */ 1569 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 1570 | 1571 | /* 1572 | * Bump the GC count even though we failed. 1573 | * 1574 | * This is needed to make concurrent readers of file without any lock on 1575 | * pgsrt->lock notice existence of new version of file. Once readers 1576 | * subsequently observe a change in GC count with pgsrt->lock held, that 1577 | * forces a safe reopen of file. Writers also require that we bump here, 1578 | * of course. (As required by locking protocol, readers and writers don't 1579 | * trust earlier file contents until gc_count is found unchanged after 1580 | * pgsrt->lock acquired in shared or exclusive mode respectively.) 1581 | */ 1582 | record_gc_ktexts(); 1583 | } 1584 | 1585 | static void 1586 | pgsrt_process_sortstate(SortState *srtstate, pgsrtWalkerContext *context) 1587 | { 1588 | Plan *plan = srtstate->ss.ps.plan; 1589 | Tuplesortstate *state = (Tuplesortstate *) srtstate->tuplesortstate; 1590 | #if PG_VERSION_NUM >= 110000 1591 | TuplesortInstrumentation stats; 1592 | #endif 1593 | Sort *sort = (Sort *) plan; 1594 | pgsrt_queryid queryId; 1595 | pgsrtCounters counters; 1596 | char *deparsed; 1597 | int nbtapes = 0; 1598 | #if PG_VERSION_NUM < 110000 1599 | const char *sortMethod; 1600 | const char *spaceType; 1601 | #endif 1602 | long spaceUsed; 1603 | bool found; 1604 | int memtupsize_palloc; /* tuplesort's main storage total size, 1605 | including palloc overhead */ 1606 | int tuple_palloc; /* average tuple size, including palloc overhead */ 1607 | int64 lines, /* number of lines underlying node returned */ 1608 | lines_to_sort, /* number of lines the sort will actually 1609 | process (may differ when bounded) */ 1610 | memtupsize_length, /* tuplesort's main storage array size */ 1611 | w_m; /* estimated work_mem */ 1612 | 1613 | Assert(state); 1614 | 1615 | /* First estimate the size of the main array that stores the lines */ 1616 | lines = 0; 1617 | /* get effective number of lines fed to the sort if available */ 1618 | if (srtstate->ss.ps.instrument) 1619 | lines = srtstate->ss.ps.instrument->ntuples; 1620 | 1621 | /* fallback to estimated # of lines if no value */ 1622 | if (lines == 0) 1623 | lines = sort->plan.plan_rows; 1624 | 1625 | /* 1626 | * If the sort is bounded, set the number of lines to sort 1627 | * accordingly, otherwise use the Sort input lines count. 1628 | */ 1629 | if (srtstate->bounded) 1630 | lines_to_sort = srtstate->bound; 1631 | else 1632 | lines_to_sort = lines; 1633 | 1634 | /* The minimal memtupsize is 1024 */ 1635 | memtupsize_length = Max(1024, lines_to_sort); 1636 | /* 1637 | * growth is done by doubling the size each time with a minimum of 1024 1638 | * entries, so we'll have a power of 2. 
No need to deal with the last 1639 | * growth special rule, there's no way we can exhaust the work_mem for the 1640 | * main array and still put all the rows to sort in memory 1641 | */ 1642 | memtupsize_length = round_up_pow2(memtupsize_length); 1643 | 1644 | /* compute the memtupsize palloc size */ 1645 | memtupsize_palloc = sizeof(SortTuple) * memtupsize_length; 1646 | memtupsize_palloc += PGSRT_ALLOC_CHUNKHDRSZ; 1647 | 1648 | /* 1649 | * Then estimate the per-line space used. We use the average row width, 1650 | * and add the fixed MinimalTuple header overhead 1651 | * FIXME: take into account NULLs, OIDs and alignment lost bytes 1652 | */ 1653 | tuple_palloc = sort->plan.plan_width + MAXALIGN(SizeofMinimalTupleHeader); 1654 | 1655 | /* Add lost space due to alignment */ 1656 | tuple_palloc += get_alignment_overhead(srtstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor); 1657 | 1658 | /* 1659 | * Each tuple is palloced, and a palloced chunk uses a 2^N size unless size 1660 | * is more than PGSRT_ALLOC_CHUNK_LIMIT 1661 | */ 1662 | if (tuple_palloc < PGSRT_ALLOC_CHUNK_LIMIT) 1663 | tuple_palloc = round_up_pow2(tuple_palloc); 1664 | 1665 | /* Add the palloc overhead */ 1666 | tuple_palloc += PGSRT_ALLOC_CHUNKHDRSZ; 1667 | 1668 | /* 1669 | * Compute the estimated total work_mem that's needed to perform the 1670 | * sort in memory. First add the space needed for the lines 1671 | */ 1672 | w_m = lines_to_sort * tuple_palloc; 1673 | 1674 | /* 1675 | * If a bounded sort was requested, we'll try to sort only the bound limit 1676 | * number of lines, but a Top-N heapsort may need to be able to store twice 1677 | * that number of rows, so use twice the memory, assuming that the worst 1678 | * case always happens. 1679 | */ 1680 | if (srtstate->bounded) 1681 | w_m *= 2; 1682 | 1683 | /* add the tuplesort's main array storage and we're done */ 1684 | w_m += memtupsize_palloc; 1685 | 1686 | /* convert to kB, and add 1 kB as a quick round-up */ 1687 | w_m /= 1024; 1688 | w_m += 1; 1689 | 1690 | /* deparse the sort keys */ 1691 | deparsed = pgsrt_get_sort_group_keys(srtstate, sort->numCols, 1692 | sort->sortColIdx, sort->sortOperators, sort->collations, 1693 | sort->nullsFirst, context); 1694 | 1695 | #if PG_VERSION_NUM >= 110000 1696 | tuplesort_get_stats(state, &stats); 1697 | //sortMethod = tuplesort_method_name(stats.sortMethod); 1698 | //spaceType = tuplesort_space_type_name(stats.spaceType); 1699 | spaceUsed = stats.spaceUsed; 1700 | #else 1701 | tuplesort_get_stats(state, &sortMethod, &spaceType, &spaceUsed); 1702 | #endif 1703 | 1704 | counters.lines = lines; 1705 | counters.lines_to_sort = lines_to_sort; 1706 | counters.work_mems = w_m; 1707 | found = false; 1708 | #if PG_VERSION_NUM >= 110000 1709 | if (stats.sortMethod == SORT_TYPE_TOP_N_HEAPSORT) 1710 | #else 1711 | if (strcmp(sortMethod, "top-N heapsort") == 0) 1712 | #endif 1713 | { 1714 | counters.topn_sorts = 1; 1715 | found = true; 1716 | } 1717 | else 1718 | counters.topn_sorts = 0; 1719 | 1720 | #if PG_VERSION_NUM >= 110000 1721 | if (stats.sortMethod == SORT_TYPE_QUICKSORT) 1722 | #else 1723 | if (!found && strcmp(sortMethod, "quicksort") == 0) 1724 | #endif 1725 | { 1726 | counters.quicksorts = 1; 1727 | found = true; 1728 | } 1729 | else 1730 | counters.quicksorts = 0; 1731 | 1732 | #if PG_VERSION_NUM >= 110000 1733 | if (stats.sortMethod == SORT_TYPE_EXTERNAL_SORT) 1734 | #else 1735 | if (!found && strcmp(sortMethod, "external sort") == 0) 1736 | #endif 1737 | { 1738 | counters.external_sorts = 1; 1739 | found = true; 1740 | } 1741 |
else 1742 | counters.external_sorts = 0; 1743 | 1744 | #if PG_VERSION_NUM >= 110000 1745 | if (stats.sortMethod == SORT_TYPE_EXTERNAL_MERGE) 1746 | #else 1747 | if (!found && strcmp(sortMethod, "external merge") == 0) 1748 | #endif 1749 | { 1750 | counters.external_merges = 1; 1751 | nbtapes = ((pgsrt_Tuplesortstate *) state)->currentRun + 1; 1752 | found = true; 1753 | } 1754 | else 1755 | counters.external_merges = 0; 1756 | 1757 | if (!found) 1758 | #if PG_VERSION_NUM >= 110000 1759 | elog(ERROR, "unexpected sort method: %d", stats.sortMethod); 1760 | #else 1761 | elog(ERROR, "unexpected sort method: %s", sortMethod); 1762 | #endif 1763 | 1764 | counters.nbtapes = nbtapes; 1765 | 1766 | #if PG_VERSION_NUM >= 110000 1767 | if (stats.spaceType == SORT_SPACE_TYPE_DISK) 1768 | #else 1769 | if (strcmp(spaceType, "Disk") == 0) 1770 | #endif 1771 | { 1772 | counters.space_disk = spaceUsed; 1773 | counters.space_memory = 0; 1774 | } 1775 | else 1776 | { 1777 | counters.space_disk = 0; 1778 | counters.space_memory = spaceUsed; 1779 | } 1780 | 1781 | #if PG_VERSION_NUM >= 110000 1782 | if (srtstate->shared_info) { 1783 | counters.non_parallels = 0; 1784 | /* 1785 | * We compute the total number of processes participating in the sort, 1786 | * so we have to increment the number of workers to take the gather 1787 | * node into account. 1788 | */ 1789 | counters.nb_workers = srtstate->shared_info->num_workers + 1; 1790 | } 1791 | else 1792 | { 1793 | counters.non_parallels = 1; 1794 | counters.nb_workers = 0; 1795 | } 1796 | #else 1797 | counters.non_parallels = 1; 1798 | counters.nb_workers = 0; 1799 | #endif 1800 | 1801 | #if PG_VERSION_NUM >= 90600 1802 | if (IsParallelWorker()) 1803 | queryId = pgsrt_get_queryid(); 1804 | else 1805 | queryId = context->queryDesc->plannedstmt->queryId; 1806 | #else 1807 | queryId = context->queryDesc->plannedstmt->queryId; 1808 | #endif 1809 | 1810 | pgsrt_store(queryId, sort->numCols, deparsed, &counters); 1811 | 1812 | //elog(WARNING, "sort info:\n" 1813 | // "keys: %s\n" 1814 | // "type: %s\n" 1815 | // "space type: %s\n" 1816 | // "space: %ld kB\n" 1817 | // "lines to sort: %ld\n" 1818 | // "w_m estimated: %ld kB\n" 1819 | // "nbTapes: %d\n" 1820 | #if PG_VERSION_NUM >= 110000 1821 | // "parallel: %s (%d)\n" 1822 | #endif 1823 | // "bounded? %s - %s , bound %ld - %ld", 1824 | // deparsed, 1825 | // sortMethod, 1826 | // spaceType, 1827 | // spaceUsed, 1828 | // lines_to_sort, 1829 | // w_m, 1830 | // nbtapes, 1831 | #if PG_VERSION_NUM >= 110000 1832 | // (srtstate->shared_info ? "yes" : "no"),(srtstate->shared_info ? srtstate->shared_info->num_workers : -1), 1833 | #endif 1834 | // (srtstate->bounded ? "yes":"no"),(srtstate->bounded_Done ? "yes":"no"), srtstate->bound, srtstate->bound_Done); 1835 | } 1836 | 1837 | /* 1838 | * Walker function that recurses into the planstate tree, looking for Sort nodes.
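 *
 * planstate_tree_walker() follows the usual PostgreSQL walker convention:
 * returning false continues the recursion, returning true aborts the walk
 * early.  The general shape of such a walker (illustrative name) is:
 *
 *     static bool
 *     my_walker(PlanState *ps, void *context)
 *     {
 *         if (IsA(ps, SortState))
 *             ... handle the sort node ...
 *         return planstate_tree_walker(ps, my_walker, context);
 *     }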
1839 | */ 1840 | static bool pgsrt_planstate_walker(PlanState *ps, pgsrtWalkerContext *context) 1841 | { 1842 | if (IsA(ps, SortState)) 1843 | { 1844 | SortState *srtstate = (SortState *) ps; 1845 | 1846 | if (srtstate->tuplesortstate) 1847 | pgsrt_process_sortstate(srtstate, context); 1848 | } 1849 | 1850 | context->ancestors = lcons(ps, context->ancestors); 1851 | 1852 | return planstate_tree_walker(ps, pgsrt_planstate_walker, context); 1853 | } 1854 | 1855 | /* Adapted from ExplainPrintPlan */ 1856 | static void 1857 | pgsrt_setup_walker_context(pgsrtWalkerContext *context) 1858 | { 1859 | Bitmapset *rels_used = NULL; 1860 | 1861 | /* Set up ExplainState fields associated with this plan tree */ 1862 | Assert(context->queryDesc->plannedstmt != NULL); 1863 | 1864 | context->rtable = context->queryDesc->plannedstmt->rtable; 1865 | pgsrt_PreScanNode(context->queryDesc->planstate, &rels_used); 1866 | context->rtable_names = select_rtable_names_for_explain(context->rtable, 1867 | rels_used); 1868 | #if PG_VERSION_NUM < 130000 1869 | context->deparse_cxt = deparse_context_for_plan_rtable(context->rtable, 1870 | context->rtable_names); 1871 | #else 1872 | context->deparse_cxt = deparse_context_for_plan_tree( 1873 | context->queryDesc->plannedstmt, 1874 | context->rtable_names); 1875 | #endif 1876 | } 1877 | 1878 | /* Adapted from show_sort_group_keys */ 1879 | static char * 1880 | pgsrt_get_sort_group_keys(SortState *srtstate, 1881 | int nkeys, AttrNumber *keycols, 1882 | Oid *sortOperators, Oid *collations, bool *nullsFirst, 1883 | pgsrtWalkerContext *context) 1884 | { 1885 | Plan *plan = srtstate->ss.ps.plan; 1886 | List *dp_context = NIL; 1887 | StringInfoData sortkeybuf; 1888 | bool useprefix; 1889 | int keyno; 1890 | 1891 | if (nkeys <= 0) 1892 | return "nothing?"; 1893 | 1894 | pgsrt_setup_walker_context(context); 1895 | 1896 | initStringInfo(&sortkeybuf); 1897 | 1898 | /* Set up deparsing context */ 1899 | #if PG_VERSION_NUM < 130000 1900 | dp_context = set_deparse_context_planstate(context->deparse_cxt, 1901 | (Node *) srtstate, 1902 | context->ancestors); 1903 | #else 1904 | dp_context = set_deparse_context_plan(context->deparse_cxt, 1905 | plan, 1906 | context->ancestors); 1907 | #endif 1908 | useprefix = (list_length(context->rtable) > 1); 1909 | 1910 | for (keyno = 0; keyno < nkeys; keyno++) 1911 | { 1912 | /* find key expression in tlist */ 1913 | AttrNumber keyresno = keycols[keyno]; 1914 | TargetEntry *target = get_tle_by_resno(plan->targetlist, 1915 | keyresno); 1916 | char *exprstr; 1917 | 1918 | if (keyno != 0) 1919 | appendStringInfoString(&sortkeybuf, ", "); 1920 | 1921 | if (!target) 1922 | elog(ERROR, "no tlist entry for key %d", keyresno); 1923 | /* Deparse the expression, showing any top-level cast */ 1924 | exprstr = deparse_expression((Node *) target->expr, dp_context, 1925 | useprefix, true); 1926 | appendStringInfoString(&sortkeybuf, exprstr); 1927 | 1928 | /* Append sort order information, if relevant */ 1929 | if (sortOperators != NULL) 1930 | pgsrt_show_sortorder_options(&sortkeybuf, 1931 | (Node *) target->expr, 1932 | sortOperators[keyno], 1933 | collations[keyno], 1934 | nullsFirst[keyno]); 1935 | } 1936 | 1937 | return sortkeybuf.data; 1938 | } 1939 | 1940 | /* 1941 | * Reset statistics. 
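 *
 * Exposed at the SQL level; the regression tests call it as:
 *
 *     SELECT pg_sortstats_reset();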
1942 | */ 1943 | PGDLLEXPORT Datum 1944 | pg_sortstats_reset(PG_FUNCTION_ARGS) 1945 | { 1946 | if (!pgsrt) 1947 | ereport(ERROR, 1948 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), 1949 | errmsg("pg_sortstats must be loaded via shared_preload_libraries"))); 1950 | 1951 | pgsrt_entry_reset(); 1952 | PG_RETURN_VOID(); 1953 | } 1954 | 1955 | Datum 1956 | pg_sortstats(PG_FUNCTION_ARGS) 1957 | { 1958 | bool showtext = PG_GETARG_BOOL(0); 1959 | ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; 1960 | MemoryContext per_query_ctx; 1961 | MemoryContext oldcontext; 1962 | TupleDesc tupdesc; 1963 | Tuplestorestate *tupstore; 1964 | char *kbuffer = NULL; 1965 | Size kbuffer_size = 0; 1966 | Size extent = 0; 1967 | int gc_count = 0; 1968 | HASH_SEQ_STATUS hash_seq; 1969 | pgsrtEntry *entry; 1970 | 1971 | 1972 | if (!pgsrt) 1973 | ereport(ERROR, 1974 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), 1975 | errmsg("pg_sortstats must be loaded via shared_preload_libraries"))); 1976 | /* check to see if caller supports us returning a tuplestore */ 1977 | if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) 1978 | ereport(ERROR, 1979 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 1980 | errmsg("set-valued function called in context that cannot accept a set"))); 1981 | if (!(rsinfo->allowedModes & SFRM_Materialize)) 1982 | ereport(ERROR, 1983 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 1984 | errmsg("materialize mode required, but it is not " \ 1985 | "allowed in this context"))); 1986 | 1987 | /* Switch into long-lived context to construct returned data structures */ 1988 | per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; 1989 | oldcontext = MemoryContextSwitchTo(per_query_ctx); 1990 | 1991 | /* Build a tuple descriptor for our result type */ 1992 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) 1993 | elog(ERROR, "return type must be a row type"); 1994 | 1995 | tupstore = tuplestore_begin_heap(true, false, work_mem); 1996 | rsinfo->returnMode = SFRM_Materialize; 1997 | rsinfo->setResult = tupstore; 1998 | rsinfo->setDesc = tupdesc; 1999 | 2000 | MemoryContextSwitchTo(oldcontext); 2001 | 2002 | /* 2003 | * We'd like to load the keys text file (if needed) while not holding any 2004 | * lock on pgsrt->lock. In the worst case we'll have to do this again 2005 | * after we have the lock, but it's unlikely enough to make this a win 2006 | * despite occasional duplicated work. We need to reload if anybody 2007 | * writes to the file (either a retail ktext_store(), or a garbage 2008 | * collection) between this point and where we've gotten shared lock. If 2009 | * a ktext_store is actually in progress when we look, we might as well 2010 | * skip the speculative load entirely. 2011 | */ 2012 | if (showtext) 2013 | { 2014 | int n_writers; 2015 | 2016 | /* Take the mutex so we can examine variables */ 2017 | { 2018 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 2019 | 2020 | SpinLockAcquire(&s->mutex); 2021 | extent = s->extent; 2022 | n_writers = s->n_writers; 2023 | gc_count = s->gc_count; 2024 | SpinLockRelease(&s->mutex); 2025 | } 2026 | 2027 | /* No point in loading file now if there are active writers */ 2028 | if (n_writers == 0) 2029 | kbuffer = ktext_load_file(&kbuffer_size); 2030 | } 2031 | 2032 | /* 2033 | * Get shared lock, load or reload the keys text file if we must, and 2034 | * iterate over the hashtable entries. 2035 | * 2036 | * With a large hash table, we might be holding the lock rather longer 2037 | * than one could wish. 
However, this only blocks creation of new hash 2038 | * table entries, and the larger the hash table the less likely that is to 2039 | * be needed. So we can hope this is okay. Perhaps someday we'll decide 2040 | * we need to partition the hash table to limit the time spent holding any 2041 | * one lock. 2042 | */ 2043 | LWLockAcquire(pgsrt->lock, LW_SHARED); 2044 | 2045 | if (showtext) 2046 | { 2047 | /* 2048 | * Here it is safe to examine extent and gc_count without taking the 2049 | * mutex. Note that although other processes might change 2050 | * pgsrt->extent just after we look at it, the strings they then write 2051 | * into the file cannot yet be referenced in the hashtable, so we 2052 | * don't care whether we see them or not. 2053 | * 2054 | * If ktext_load_file fails, we just press on; we'll return NULL for 2055 | * every keys text. 2056 | */ 2057 | if (kbuffer == NULL || 2058 | pgsrt->extent != extent || 2059 | pgsrt->gc_count != gc_count) 2060 | { 2061 | if (kbuffer) 2062 | free(kbuffer); 2063 | kbuffer = ktext_load_file(&kbuffer_size); 2064 | } 2065 | } 2066 | 2067 | hash_seq_init(&hash_seq, pgsrt_hash); 2068 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 2069 | { 2070 | Datum values[PGSRT_COLUMNS]; 2071 | bool nulls[PGSRT_COLUMNS]; 2072 | pgsrtCounters tmp; 2073 | int i = 0; 2074 | 2075 | memset(values, 0, sizeof(values)); 2076 | memset(nulls, 0, sizeof(nulls)); 2077 | 2078 | /* copy counters to a local variable to keep locking time short */ 2079 | { 2080 | volatile pgsrtEntry *e = (volatile pgsrtEntry *) entry; 2081 | 2082 | SpinLockAcquire(&e->mutex); 2083 | tmp = e->counters; 2084 | SpinLockRelease(&e->mutex); 2085 | } 2086 | 2087 | values[i++] = Int64GetDatumFast(entry->key.queryid); 2088 | values[i++] = ObjectIdGetDatum(entry->key.userid); 2089 | values[i++] = ObjectIdGetDatum(entry->key.dbid); 2090 | values[i++] = Int32GetDatum(entry->nbkeys); 2091 | if (showtext) 2092 | { 2093 | char *kstr = ktext_fetch(entry->keys_offset, 2094 | entry->keys_len, 2095 | kbuffer, 2096 | kbuffer_size); 2097 | 2098 | if (kstr) 2099 | { 2100 | char *enc; 2101 | 2102 | enc = pg_any_to_server(kstr, 2103 | entry->keys_len, 2104 | entry->encoding); 2105 | 2106 | values[i++] = CStringGetTextDatum(enc); 2107 | 2108 | if (enc != kstr) 2109 | pfree(enc); 2110 | } 2111 | else 2112 | { 2113 | /* Just return a null if we fail to find the text */ 2114 | nulls[i++] = true; 2115 | } 2116 | } 2117 | else 2118 | { 2119 | /* keys text not requested */ 2120 | nulls[i++] = true; 2121 | } 2122 | values[i++] = Int64GetDatumFast(tmp.lines); 2123 | values[i++] = Int64GetDatumFast(tmp.lines_to_sort); 2124 | values[i++] = Int64GetDatumFast(tmp.work_mems); 2125 | values[i++] = Int64GetDatumFast(tmp.topn_sorts); 2126 | values[i++] = Int64GetDatumFast(tmp.quicksorts); 2127 | values[i++] = Int64GetDatumFast(tmp.external_sorts); 2128 | values[i++] = Int64GetDatumFast(tmp.external_merges); 2129 | values[i++] = Int64GetDatumFast(tmp.nbtapes); 2130 | values[i++] = Int64GetDatumFast(tmp.space_disk); 2131 | values[i++] = Int64GetDatumFast(tmp.space_memory); 2132 | values[i++] = Int64GetDatumFast(tmp.non_parallels); 2133 | #if PG_VERSION_NUM >= 110000 2134 | values[i++] = Int64GetDatumFast(tmp.nb_workers); 2135 | #else 2136 | nulls[i++] = true; 2137 | #endif 2138 | 2139 | Assert(i == PGSRT_COLUMNS); 2140 | 2141 | tuplestore_putvalues(tupstore, tupdesc, values, nulls); 2142 | } 2143 | 2144 | /* clean up and return the tuplestore */ 2145 | LWLockRelease(pgsrt->lock); 2146 | 2147 | if (kbuffer) 2148 | free(kbuffer); 
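	/*
	 * The materialized tuplestore is what the SQL level sees; the
	 * regression tests consume it as, e.g.:
	 *
	 *     SELECT nb_keys, sort_keys, lines, lines_to_sort, ...
	 *     FROM pg_sortstats(true) ORDER BY nb_keys;
	 *
	 * Calling pg_sortstats(false) skips the keys text file entirely and
	 * returns NULL for sort_keys.
	 */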
2149 | 2150 | tuplestore_donestoring(tupstore); 2151 | return (Datum) 0; 2152 | } 2153 | 2154 | /* Round val up to the next power of two (bit-smearing trick) */ static unsigned long 2155 | round_up_pow2(int64 val) 2156 | { 2157 | val--; 2158 | val |= val >> 1; 2159 | val |= val >> 2; 2160 | val |= val >> 4; 2161 | val |= val >> 8; 2162 | val |= val >> 16; val |= val >> 32; /* val is 64 bits wide */ 2163 | val++; 2164 | return val; 2165 | } 2166 | 2167 | /* Estimate the bytes lost to per-attribute alignment padding in a tuple */ static int 2168 | get_alignment_overhead(TupleDesc tupdesc) 2169 | { 2170 | int align_overhead = 0; 2171 | int off = 0; 2172 | int i; 2173 | 2174 | for (i = 0; i < tupdesc->natts; i++) 2175 | { 2176 | Form_pg_attribute attr = TupleDescAttr(tupdesc, i); 2177 | int newoff; 2178 | 2179 | /* FIXME use better heuristic for varlena :) */ 2180 | if (attr->attlen > 0) 2181 | off += attr->attlen; 2182 | else 2183 | off += 1; 2184 | 2185 | newoff = att_align_nominal(off, attr->attalign); 2186 | if (newoff != off) 2187 | { 2188 | align_overhead += (newoff - off); 2189 | off = newoff; 2190 | } 2191 | } 2192 | 2193 | return align_overhead; 2194 | } 2195 | -------------------------------------------------------------------------------- /pg_sortstats.control: -------------------------------------------------------------------------------- 1 | comment = 'An extension collecting statistics about sorts' 2 | default_version = '0.0.1' 3 | module_pathname = '$libdir/pg_sortstats' 4 | relocatable = true 5 | -------------------------------------------------------------------------------- /pg_sortstats_import.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pg_sortstats_import.c 4 | * Functions imported from PostgreSQL core, needed by pg_sortstats. 5 | * 6 | *------------------------------------------------------------------------- 7 | */ 8 | 9 | #include "postgres.h" 10 | 11 | #if PG_VERSION_NUM >= 110000 12 | #include "catalog/pg_collation_d.h" 13 | #else 14 | #include "catalog/pg_collation.h" 15 | #endif 16 | #include "nodes/execnodes.h" 17 | #include "nodes/nodeFuncs.h" 18 | #include "utils/builtins.h" 19 | #include "utils/lsyscache.h" 20 | #include "utils/tuplesort.h" 21 | #include "utils/typcache.h" 22 | 23 | #include "include/pg_sortstats_import.h" 24 | 25 | /* 26 | * Imported from ExplainPreScanNode 27 | */ 28 | bool 29 | pgsrt_PreScanNode(PlanState *planstate, Bitmapset **rels_used) 30 | { 31 | Plan *plan = planstate->plan; 32 | 33 | switch (nodeTag(plan)) 34 | { 35 | case T_SeqScan: 36 | #if PG_VERSION_NUM >= 90500 37 | case T_SampleScan: 38 | #endif 39 | case T_IndexScan: 40 | case T_IndexOnlyScan: 41 | case T_BitmapHeapScan: 42 | case T_TidScan: 43 | case T_SubqueryScan: 44 | case T_FunctionScan: 45 | #if PG_VERSION_NUM >= 100000 46 | case T_TableFuncScan: 47 | #endif 48 | case T_ValuesScan: 49 | case T_CteScan: 50 | #if PG_VERSION_NUM >= 100000 51 | case T_NamedTuplestoreScan: 52 | #endif 53 | case T_WorkTableScan: 54 | *rels_used = bms_add_member(*rels_used, 55 | ((Scan *) plan)->scanrelid); 56 | break; 57 | case T_ForeignScan: 58 | #if PG_VERSION_NUM >= 90500 59 | *rels_used = bms_add_members(*rels_used, 60 | ((ForeignScan *) plan)->fs_relids); 61 | #else 62 | *rels_used = bms_add_member(*rels_used, 63 | ((Scan *) plan)->scanrelid); 64 | #endif 65 | break; 66 | #if PG_VERSION_NUM >= 90500 67 | case T_CustomScan: 68 | *rels_used = bms_add_members(*rels_used, 69 | ((CustomScan *) plan)->custom_relids); 70 | break; 71 | #endif 72 | case T_ModifyTable: 73 | #if PG_VERSION_NUM >= 90500 74 | *rels_used = bms_add_member(*rels_used, 75 | ((ModifyTable *)
plan)->nominalRelation); 76 | if (((ModifyTable *) plan)->exclRelRTI) 77 | *rels_used = bms_add_member(*rels_used, 78 | ((ModifyTable *) plan)->exclRelRTI); 79 | #else 80 | /* cf ExplainModifyTarget */ 81 | *rels_used = bms_add_member(*rels_used, 82 | linitial_int(((ModifyTable *) plan)->resultRelations)); 83 | #endif 84 | break; 85 | default: 86 | break; 87 | } 88 | 89 | return planstate_tree_walker(planstate, pgsrt_PreScanNode, rels_used); 90 | } 91 | 92 | /* Imported from show_sortorder_options */ 93 | void 94 | pgsrt_show_sortorder_options(StringInfo buf, Node *sortexpr, 95 | Oid sortOperator, Oid collation, bool nullsFirst) 96 | { 97 | Oid sortcoltype = exprType(sortexpr); 98 | bool reverse = false; 99 | TypeCacheEntry *typentry; 100 | 101 | typentry = lookup_type_cache(sortcoltype, 102 | TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); 103 | 104 | /* 105 | * Print COLLATE if it's not default. There are some cases where this is 106 | * redundant, eg if expression is a column whose declared collation is 107 | * that collation, but it's hard to distinguish that here. 108 | */ 109 | if (OidIsValid(collation) && collation != DEFAULT_COLLATION_OID) 110 | { 111 | char *collname = get_collation_name(collation); 112 | 113 | if (collname == NULL) 114 | elog(ERROR, "cache lookup failed for collation %u", collation); 115 | appendStringInfo(buf, " COLLATE %s", quote_identifier(collname)); 116 | } 117 | 118 | /* Print direction if not ASC, or USING if non-default sort operator */ 119 | if (sortOperator == typentry->gt_opr) 120 | { 121 | appendStringInfoString(buf, " DESC"); 122 | reverse = true; 123 | } 124 | else if (sortOperator != typentry->lt_opr) 125 | { 126 | char *opname = get_opname(sortOperator); 127 | 128 | if (opname == NULL) 129 | elog(ERROR, "cache lookup failed for operator %u", sortOperator); 130 | appendStringInfo(buf, " USING %s", opname); 131 | /* Determine whether operator would be considered ASC or DESC */ 132 | (void) get_equality_op_for_ordering_op(sortOperator, &reverse); 133 | } 134 | 135 | /* Add NULLS FIRST/LAST only if it wouldn't be default */ 136 | if (nullsFirst && !reverse) 137 | { 138 | appendStringInfoString(buf, " NULLS FIRST"); 139 | } 140 | else if (!nullsFirst && reverse) 141 | { 142 | appendStringInfoString(buf, " NULLS LAST"); 143 | } 144 | } 145 | 146 | #if PG_VERSION_NUM < 90600 147 | 148 | #include "nodes/nodes.h" 149 | #include "nodes/pg_list.h" 150 | #include "utils/logtape.h" 151 | #include "utils/tuplesort.h" 152 | 153 | #include "include/pg_sortstats_import.h" 154 | 155 | bool 156 | planstate_tree_walker(PlanState *planstate, 157 | bool (*walker) (), 158 | void *context) 159 | { 160 | Plan *plan = planstate->plan; 161 | #if PG_VERSION_NUM >= 90500 162 | ListCell *lc; 163 | #endif 164 | 165 | /* initPlan-s */ 166 | if (planstate_walk_subplans(planstate->initPlan, walker, context)) 167 | return true; 168 | 169 | /* lefttree */ 170 | if (outerPlanState(planstate)) 171 | { 172 | if (walker(outerPlanState(planstate), context)) 173 | return true; 174 | } 175 | 176 | /* righttree */ 177 | if (innerPlanState(planstate)) 178 | { 179 | if (walker(innerPlanState(planstate), context)) 180 | return true; 181 | } 182 | 183 | /* special child plans */ 184 | switch (nodeTag(plan)) 185 | { 186 | case T_ModifyTable: 187 | if (planstate_walk_members(((ModifyTableState *) planstate)->mt_plans, 188 | ((ModifyTableState *) planstate)->mt_nplans, 189 | walker, context)) 190 | return true; 191 | break; 192 | case T_Append: 193 | if (planstate_walk_members(((AppendState *) 
planstate)->appendplans, 194 | ((AppendState *) planstate)->as_nplans, 195 | walker, context)) 196 | return true; 197 | break; 198 | case T_MergeAppend: 199 | if (planstate_walk_members(((MergeAppendState *) planstate)->mergeplans, 200 | ((MergeAppendState *) planstate)->ms_nplans, 201 | walker, context)) 202 | return true; 203 | break; 204 | case T_BitmapAnd: 205 | if (planstate_walk_members(((BitmapAndState *) planstate)->bitmapplans, 206 | ((BitmapAndState *) planstate)->nplans, 207 | walker, context)) 208 | return true; 209 | break; 210 | case T_BitmapOr: 211 | if (planstate_walk_members(((BitmapOrState *) planstate)->bitmapplans, 212 | ((BitmapOrState *) planstate)->nplans, 213 | walker, context)) 214 | return true; 215 | break; 216 | case T_SubqueryScan: 217 | if (walker(((SubqueryScanState *) planstate)->subplan, context)) 218 | return true; 219 | break; 220 | #if PG_VERSION_NUM >= 90500 221 | case T_CustomScan: 222 | foreach(lc, ((CustomScanState *) planstate)->custom_ps) 223 | { 224 | if (walker((PlanState *) lfirst(lc), context)) 225 | return true; 226 | } 227 | break; 228 | #endif 229 | default: 230 | break; 231 | } 232 | 233 | /* subPlan-s */ 234 | if (planstate_walk_subplans(planstate->subPlan, walker, context)) 235 | return true; 236 | 237 | return false; 238 | } 239 | 240 | bool 241 | planstate_walk_subplans(List *plans, 242 | bool (*walker) (), 243 | void *context) 244 | { 245 | ListCell *lc; 246 | 247 | foreach(lc, plans) 248 | { 249 | SubPlanState *sps = lfirst_node(SubPlanState, lc); 250 | 251 | if (walker(sps->planstate, context)) 252 | return true; 253 | } 254 | 255 | return false; 256 | } 257 | 258 | bool 259 | planstate_walk_members(PlanState **planstates, int nplans, 260 | bool (*walker) (), void *context) 261 | { 262 | int j; 263 | 264 | for (j = 0; j < nplans; j++) 265 | { 266 | if (walker(planstates[j], context)) 267 | return true; 268 | } 269 | 270 | return false; 271 | } 272 | #endif 273 | -------------------------------------------------------------------------------- /sql/pg_sortstats.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | 3 | SELECT pg_sortstats_reset(); 4 | 5 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 6 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 7 | VACUUM ANALYZE sorts; 8 | 9 | SET work_mem = '64kB'; 10 | WITH src AS ( 11 | SELECT * FROM sorts ORDER BY val, id DESC 12 | ) 13 | SELECT * FROM src LIMIT 1; 14 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 15 | 16 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 17 | work_mems < (12 * 1024) AS "exp_less_12MB", 18 | topn_sorts, quicksorts, external_sorts, external_merges, 19 | nb_tapes > 2 AS multiple_tapes, 20 | space_disk > 1024 AS "disk_more_1MB", 21 | space_memory > 1024 AS "mem_more_1MB", 22 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 23 | FROM pg_sortstats(true) ORDER BY nb_keys; 24 | 25 | SELECT pg_sortstats_reset(); 26 | 27 | SET work_mem = '12MB'; 28 | WITH src AS ( 29 | SELECT * FROM sorts ORDER BY val, id DESC 30 | ) 31 | SELECT * FROM src LIMIT 1; 32 | 33 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 34 | work_mems < (12 * 1024) AS "exp_less_12MB", 35 | topn_sorts, quicksorts, external_sorts, external_merges, 36 | nb_tapes > 2 AS multiple_tapes, 37 | space_disk > 1024 AS "disk_more_1MB", 38 | space_memory > 1024 AS "mem_more_1MB", 39 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 40 | FROM pg_sortstats(true) ORDER BY nb_keys; 41 | 42 | 
SELECT pg_sortstats_reset(); 43 | -------------------------------------------------------------------------------- /sql/pg_sortstats_12.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | 3 | SELECT pg_sortstats_reset(); 4 | 5 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 6 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 7 | VACUUM ANALYZE sorts; 8 | 9 | SET work_mem = '64kB'; 10 | WITH src AS MATERIALIZED ( 11 | SELECT * FROM sorts ORDER BY val, id DESC 12 | ) 13 | SELECT * FROM src LIMIT 1; 14 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 15 | 16 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 17 | work_mems < (12 * 1024) AS "exp_less_12MB", 18 | topn_sorts, quicksorts, external_sorts, external_merges, 19 | nb_tapes > 2 AS multiple_tapes, 20 | space_disk > 1024 AS "disk_more_1MB", 21 | space_memory > 1024 AS "mem_more_1MB", 22 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 23 | FROM pg_sortstats(true) ORDER BY nb_keys; 24 | 25 | SELECT pg_sortstats_reset(); 26 | 27 | SET work_mem = '12MB'; 28 | WITH src AS MATERIALIZED ( 29 | SELECT * FROM sorts ORDER BY val, id DESC 30 | ) 31 | SELECT * FROM src LIMIT 1; 32 | 33 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 34 | work_mems < (12 * 1024) AS "exp_less_12MB", 35 | topn_sorts, quicksorts, external_sorts, external_merges, 36 | nb_tapes > 2 AS multiple_tapes, 37 | space_disk > 1024 AS "disk_more_1MB", 38 | space_memory > 1024 AS "mem_more_1MB", 39 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 40 | FROM pg_sortstats(true) ORDER BY nb_keys; 41 | 42 | SELECT pg_sortstats_reset(); 43 | --------------------------------------------------------------------------------