├── .gitignore ├── LICENSE ├── META.json ├── Makefile ├── README.md ├── expected ├── pg_sortstats.out └── pg_sortstats_12.out ├── include ├── pg_sortstats_import.h ├── pg_sortstats_import_pg10.h ├── pg_sortstats_import_pg11.h ├── pg_sortstats_import_pg12.h ├── pg_sortstats_import_pg13.h ├── pg_sortstats_import_pg14.h ├── pg_sortstats_import_pg9_4.h ├── pg_sortstats_import_pg9_5.h └── pg_sortstats_import_pg9_6.h ├── pg_sortstats--0.0.1.sql ├── pg_sortstats.c ├── pg_sortstats.control ├── pg_sortstats_import.c └── sql ├── pg_sortstats.sql └── pg_sortstats_12.sql /.gitignore: -------------------------------------------------------------------------------- 1 | .*.sw* 2 | *.o 3 | *.so 4 | *.zip 5 | *.bc 6 | *.gcno 7 | tags 8 | results/ 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018-2023, The PoWA-team 2 | 3 | Permission to use, copy, modify, and distribute this software and its 4 | documentation for any purpose, without fee, and without a written agreement is 5 | hereby granted, provided that the above copyright notice and this paragraph and 6 | the following two paragraphs appear in all copies. 7 | 8 | IN NO EVENT SHALL The PoWA-team BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 9 | SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING 10 | OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF The PoWA-team 11 | HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | 13 | The PoWA-team SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED 14 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 15 | PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND The 16 | PoWA-team HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 17 | ENHANCEMENTS, OR MODIFICATIONS.
18 | -------------------------------------------------------------------------------- /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pg_sortstats", 3 | "abstract": "An extension collecting statistics about sorts", 4 | "version": "__VERSION__", 5 | "maintainer": "Julien Rouhaud ", 6 | "license": "postgresql", 7 | "release_status": "stable", 8 | "provides": { 9 | "pg_sortstats": { 10 | "abstract": "An extension collecting statistics about sorts", 11 | "file": "pg_sortstats.sql", 12 | "docfile": "README.md", 13 | "version": "__VERSION__" 14 | } 15 | }, 16 | "meta-spec": { 17 | "version": "1.0.0", 18 | "url": "http://pgxn.org/meta/spec.txt" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = pg_sortstats 2 | EXTVERSION = $(shell grep default_version $(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/") 3 | REGRESS_OPTS = --inputdir=test 4 | REGRESS = pg_sortstats # Can be overridden later 5 | 6 | PG_CONFIG ?= pg_config 7 | 8 | MODULE_big = pg_sortstats 9 | OBJS = pg_sortstats.o pg_sortstats_import.o 10 | 11 | all: 12 | 13 | release-zip: all 14 | git archive --format zip --prefix=${EXTENSION}-$(EXTVERSION)/ --output ./${EXTENSION}-$(EXTVERSION).zip HEAD 15 | unzip ./${EXTENSION}-$(EXTVERSION).zip 16 | rm ./${EXTENSION}-$(EXTVERSION).zip 17 | sed -i -e "s/__VERSION__/$(EXTVERSION)/g" ./${EXTENSION}-$(EXTVERSION)/META.json 18 | zip -r ./${EXTENSION}-$(EXTVERSION).zip ./${EXTENSION}-$(EXTVERSION)/ 19 | rm ./${EXTENSION}-$(EXTVERSION) -rf 20 | 21 | 22 | DATA = $(wildcard *--*.sql) 23 | PGXS := $(shell $(PG_CONFIG) --pgxs) 24 | include $(PGXS) 25 | 26 | # Change the regression test for pg12+ 27 | ifneq ($(MAJORVERSION),$(filter $(MAJORVERSION), 9.2 9.3 9.4 9.5 9.6 10 11)) 28 | REGRESS = pg_sortstats_12 29 | endif 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pg_sortstats 2 | ============ 3 | 4 | **/!\ This extension is under development and not production ready. Use at 5 | your own risk.** 6 | 7 | PostgreSQL extension to gather and accumulate various statistics about sorts, and 8 | estimate how much work\_mem would be needed to perform the sort in memory. 9 | 10 | Statistics are aggregated per queryid (query identifier as computed by 11 | pg\_stat\_statements), userid, dbid and sort\_key (the textual representation 12 | of the sort being performed). 13 | 14 | pg\_stat\_statements is needed to provide the queryid field. 15 | 16 | Installation 17 | ============ 18 | 19 | Compiling 20 | --------- 21 | 22 | The module can be built using the standard PGXS infrastructure. For this to 23 | work, the ``pg_config`` program must be available in your $PATH. Installation 24 | instructions follow:: 25 | 26 | git clone https://github.com/powa-team/pg_sortstats.git 27 | cd pg_sortstats 28 | make 29 | make install 30 | 31 | NOTE: The "make install" part may require root privileges. 32 | 33 | PostgreSQL setup 34 | ---------------- 35 | 36 | The extension is now available. However, as it requires shared memory to hold 37 | its counters, the module must be loaded at PostgreSQL startup. Thus, you must 38 | add the module to ``shared_preload_libraries`` in your ``postgresql.conf``.
You 39 | need a server restart for the change to take effect. As this extension 40 | depends on pg_stat_statements, it also needs to be added to 41 | ``shared_preload_libraries``. 42 | 43 | Add the following parameter to your ``postgresql.conf``:: 44 | 45 | # postgresql.conf 46 | shared_preload_libraries = 'pg_stat_statements,pg_sortstats' 47 | 48 | Once your PostgreSQL cluster is restarted, you can install the extension in 49 | every database where you need to access the statistics:: 50 | 51 | mydb=# CREATE EXTENSION pg_sortstats; 52 | 53 | Usage 54 | ----- 55 | 56 | The `pg_sortstats` view provides the following fields: 57 | 58 | | fieldname | description | 59 | |-----------------|----------------------------------------------------------------------------------| 60 | | queryid | pg_stat_statements' queryid | 61 | | userid | user identifier | 62 | | dbid | database identifier | 63 | | sort_key | the textual sort expression | 64 | | lines | total number of lines the sort node received as input | 65 | | lines_to_sort | total number of lines the sort node had to sort (differs when there's a LIMIT) | 66 | | work_mems | total work_mem, in kB, estimated as needed to perform the sorts in memory | 67 | | topn_sorts | total number of sorts done using the Top-N heapsort algorithm | 68 | | quicksorts | total number of sorts done using the quicksort algorithm | 69 | | external_sorts | total number of sorts done using the external sort algorithm | 70 | | external_merges | total number of sorts done using the external merge algorithm | 71 | | nb_tapes | total number of tapes used for external merge sorts | 72 | | space_disk | total disk space, in kB, used to perform the sorts | 73 | | space_memory | total memory, in kB, used to perform the sorts | 74 | | non_parallels | total number of sorts not done in parallel | 75 | | nb_workers | total number of worker processes used to perform the sorts | 76 | 77 | The `pg_sortstats(showtext)` function can be used instead, passing **false** as 78 | parameter if you don't need the sort_key field. 79 | 80 | The `pg_sortstats_reset()` function can be used to remove all stored 81 | statistics.
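As an illustration, here is a minimal query against the view described above (column names are taken from the field list; adapt as needed), listing the queries whose sorts most often spilled to disk, together with the cumulated work_mem estimation::

    mydb=# SELECT queryid, sort_key,
                  external_sorts + external_merges AS disk_sorts,
                  work_mems
           FROM pg_sortstats
           ORDER BY disk_sorts DESC
           LIMIT 10;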
82 | -------------------------------------------------------------------------------- /expected/pg_sortstats.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | SELECT pg_sortstats_reset(); 3 | pg_sortstats_reset 4 | -------------------- 5 | 6 | (1 row) 7 | 8 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 9 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 10 | VACUUM ANALYZE sorts; 11 | SET work_mem = '64kB'; 12 | WITH src AS ( 13 | SELECT * FROM sorts ORDER BY val, id DESC 14 | ) 15 | SELECT * FROM src LIMIT 1; 16 | id | val 17 | ----+-------- 18 | 1 | line 1 19 | (1 row) 20 | 21 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 22 | id | val 23 | --------+------------- 24 | 100000 | line 100000 25 | (1 row) 26 | 27 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 28 | work_mems < (12 * 1024) AS "exp_less_12MB", 29 | topn_sorts, quicksorts, external_sorts, external_merges, 30 | nb_tapes > 2 AS multiple_tapes, 31 | space_disk > 1024 AS "disk_more_1MB", 32 | space_memory > 1024 AS "mem_more_1MB", 33 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 34 | FROM pg_sortstats(true) ORDER BY nb_keys; 35 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 36 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 37 | 1 | id DESC | 100000 | 1 | t | 1 | 0 | 0 | 0 | f | f | f | 1 | 0 38 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 0 | 0 | 1 | t | t | f | 1 | 0 39 | (2 rows) 40 | 41 | SELECT pg_sortstats_reset(); 42 | pg_sortstats_reset 43 | -------------------- 44 | 45 | (1 row) 46 | 47 | SET work_mem = '12MB'; 48 | WITH src AS ( 49 | SELECT * FROM sorts ORDER BY val, id DESC 50 | ) 51 | SELECT * FROM src LIMIT 1; 52 | id | val 53 | ----+-------- 54 | 1 | line 1 55 | (1 row) 56 | 57 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 58 | work_mems < (12 * 1024) AS "exp_less_12MB", 59 | topn_sorts, quicksorts, external_sorts, external_merges, 60 | nb_tapes > 2 AS multiple_tapes, 61 | space_disk > 1024 AS "disk_more_1MB", 62 | space_memory > 1024 AS "mem_more_1MB", 63 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 64 | FROM pg_sortstats(true) ORDER BY nb_keys; 65 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 66 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 67 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 1 | 0 | 0 | f | f | t | 1 | 0 68 | (1 row) 69 | 70 | SELECT pg_sortstats_reset(); 71 | pg_sortstats_reset 72 | -------------------- 73 | 74 | (1 row) 75 | 76 | -------------------------------------------------------------------------------- /expected/pg_sortstats_12.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | SELECT pg_sortstats_reset(); 3 | pg_sortstats_reset 4 | -------------------- 5 | 6 | (1 row) 7 
| 8 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 9 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 10 | VACUUM ANALYZE sorts; 11 | SET work_mem = '64kB'; 12 | WITH src AS MATERIALIZED ( 13 | SELECT * FROM sorts ORDER BY val, id DESC 14 | ) 15 | SELECT * FROM src LIMIT 1; 16 | id | val 17 | ----+-------- 18 | 1 | line 1 19 | (1 row) 20 | 21 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 22 | id | val 23 | --------+------------- 24 | 100000 | line 100000 25 | (1 row) 26 | 27 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 28 | work_mems < (12 * 1024) AS "exp_less_12MB", 29 | topn_sorts, quicksorts, external_sorts, external_merges, 30 | nb_tapes > 2 AS multiple_tapes, 31 | space_disk > 1024 AS "disk_more_1MB", 32 | space_memory > 1024 AS "mem_more_1MB", 33 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 34 | FROM pg_sortstats(true) ORDER BY nb_keys; 35 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 36 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 37 | 1 | id DESC | 100000 | 1 | t | 1 | 0 | 0 | 0 | f | f | f | 1 | 0 38 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 0 | 0 | 1 | t | t | f | 1 | 0 39 | (2 rows) 40 | 41 | SELECT pg_sortstats_reset(); 42 | pg_sortstats_reset 43 | -------------------- 44 | 45 | (1 row) 46 | 47 | SET work_mem = '12MB'; 48 | WITH src AS MATERIALIZED ( 49 | SELECT * FROM sorts ORDER BY val, id DESC 50 | ) 51 | SELECT * FROM src LIMIT 1; 52 | id | val 53 | ----+-------- 54 | 1 | line 1 55 | (1 row) 56 | 57 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 58 | work_mems < (12 * 1024) AS "exp_less_12MB", 59 | topn_sorts, quicksorts, external_sorts, external_merges, 60 | nb_tapes > 2 AS multiple_tapes, 61 | space_disk > 1024 AS "disk_more_1MB", 62 | space_memory > 1024 AS "mem_more_1MB", 63 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 64 | FROM pg_sortstats(true) ORDER BY nb_keys; 65 | nb_keys | sort_keys | lines | lines_to_sort | exp_less_12MB | topn_sorts | quicksorts | external_sorts | external_merges | multiple_tapes | disk_more_1MB | mem_more_1MB | non_parallels | nb_workers 66 | ---------+--------------------------------------+--------+---------------+---------------+------------+------------+----------------+-----------------+----------------+---------------+--------------+---------------+------------ 67 | 2 | sorts.val COLLATE "C", sorts.id DESC | 100000 | 100000 | t | 0 | 1 | 0 | 0 | f | f | t | 1 | 0 68 | (1 row) 69 | 70 | SELECT pg_sortstats_reset(); 71 | pg_sortstats_reset 72 | -------------------- 73 | 74 | (1 row) 75 | 76 | -------------------------------------------------------------------------------- /include/pg_sortstats_import.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_h 2 | #define PG_SORTSTATS_IMPORT_h 3 | 4 | #include "nodes/execnodes.h" 5 | #include "utils/logtape.h" 6 | #if PG_VERSION_NUM < 90500 7 | #include "lib/stringinfo.h" 8 | #endif 9 | 10 | #if PG_VERSION_NUM >= 90400 && PG_VERSION_NUM < 90500 11 | #include "include/pg_sortstats_import_pg9_4.h" 12 | #elif PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600 13 | #include 
"include/pg_sortstats_import_pg9_5.h" 14 | #elif PG_VERSION_NUM >= 90600 && PG_VERSION_NUM < 100000 15 | #include "include/pg_sortstats_import_pg9_6.h" 16 | #elif PG_VERSION_NUM >= 100000 && PG_VERSION_NUM < 110000 17 | #include "include/pg_sortstats_import_pg10.h" 18 | #elif PG_VERSION_NUM >= 110000 && PG_VERSION_NUM < 120000 19 | #include "include/pg_sortstats_import_pg11.h" 20 | #elif PG_VERSION_NUM >= 120000 && PG_VERSION_NUM < 130000 21 | #include "include/pg_sortstats_import_pg12.h" 22 | #elif PG_VERSION_NUM >= 130000 && PG_VERSION_NUM < 140000 23 | #include "include/pg_sortstats_import_pg13.h" 24 | #elif PG_VERSION_NUM >= 140000 && PG_VERSION_NUM < 150000 25 | #include "include/pg_sortstats_import_pg14.h" 26 | #else 27 | #error "PostgreSQL version not supported" 28 | #endif 29 | 30 | #if PG_VERSION_NUM < 140000 31 | #define ParallelLeaderBackendId ParallelMasterBackendId 32 | #endif 33 | 34 | /* 35 | * Import some define that are stable enough so that we don't need a 36 | * per-major-version definition 37 | */ 38 | #define PGSRT_ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */ 39 | #define PGSRT_ALLOCSET_NUM_FREELISTS 11 40 | #define PGSRT_ALLOC_CHUNK_LIMIT (1 << (PGSRT_ALLOCSET_NUM_FREELISTS-1+PGSRT_ALLOC_MINBITS)) 41 | 42 | bool pgsrt_PreScanNode(PlanState *planstate, Bitmapset **rels_used); 43 | void pgsrt_show_sortorder_options(StringInfo buf, Node *sortexpr, 44 | Oid sortOperator, Oid collation, bool nullsFirst); 45 | 46 | #if PG_VERSION_NUM < 90600 47 | 48 | bool planstate_tree_walker(PlanState *planstate, 49 | bool (*walker) (), 50 | void *context); 51 | 52 | bool planstate_walk_subplans(List *plans, 53 | bool (*walker) (), 54 | void *context); 55 | 56 | 57 | bool planstate_walk_members(PlanState **planstates, int nplans, 58 | bool (*walker) (), void *context); 59 | 60 | #endif /* PG_VERSION_NUM < 90600 */ 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg10.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG10_H 2 | #define PG_SORTSTATS_IMPORT_PG10_H 3 | 4 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 5 | 6 | /* 7 | * AllocChunk 8 | * The prefix of each piece of memory in an AllocBlock 9 | * 10 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 11 | */ 12 | typedef struct pgsrt_AllocChunkData 13 | { 14 | /* aset is the owning aset if allocated, or the freelist link if free */ 15 | void *aset; 16 | /* size is always the size of the usable space in the chunk */ 17 | Size size; 18 | #ifdef MEMORY_CONTEXT_CHECKING 19 | /* when debugging memory usage, also store actual requested size */ 20 | /* this is zero in a free chunk */ 21 | Size requested_size; 22 | #endif 23 | } pgsrt_AllocChunkData; 24 | 25 | 26 | #define SLAB_SLOT_SIZE 1024 27 | typedef union SlabSlot 28 | { 29 | union SlabSlot *nextfree; 30 | char buffer[SLAB_SLOT_SIZE]; 31 | } SlabSlot; 32 | 33 | typedef struct 34 | { 35 | void *tuple; /* the tuple itself */ 36 | Datum datum1; /* value of first key column */ 37 | bool isnull1; /* is first key column NULL? 
*/ 38 | int tupindex; /* see notes above */ 39 | } SortTuple; 40 | 41 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 42 | Tuplesortstate *state); 43 | 44 | typedef enum 45 | { 46 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 47 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 48 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 49 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 50 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 51 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 52 | } TupSortStatus; 53 | 54 | typedef struct pgsrt_Tuplesortstate 55 | { 56 | TupSortStatus status; /* enumerated value as shown above */ 57 | int nKeys; /* number of columns in sort key */ 58 | bool randomAccess; /* did caller request random access? */ 59 | bool bounded; /* did caller specify a maximum number of 60 | * tuples to return? */ 61 | bool boundUsed; /* true if we made use of a bounded heap */ 62 | int bound; /* if bounded, the maximum number of tuples */ 63 | bool tuples; /* Can SortTuple.tuple ever be set? */ 64 | int64 availMem; /* remaining memory available, in bytes */ 65 | int64 allowedMem; /* total memory allowed, in bytes */ 66 | int maxTapes; /* number of tapes (Knuth's T) */ 67 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 68 | MemoryContext sortcontext; /* memory context holding most sort data */ 69 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 70 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 71 | 72 | /* 73 | * These function pointers decouple the routines that must know what kind 74 | * of tuple we are sorting from the routines that don't need to know it. 75 | * They are set up by the tuplesort_begin_xxx routines. 76 | * 77 | * Function to compare two tuples; result is per qsort() convention, ie: 78 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 79 | * qsort_arg_comparator. 80 | */ 81 | SortTupleComparator comparetup; 82 | 83 | /* 84 | * Function to copy a supplied input tuple into palloc'd space and set up 85 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 86 | * state->availMem must be decreased by the amount of space used for the 87 | * tuple copy (note the SortTuple struct itself is not counted). 88 | */ 89 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 90 | 91 | /* 92 | * Function to write a stored tuple onto tape. The representation of the 93 | * tuple on tape need not be the same as it is in memory; requirements on 94 | * the tape representation are given below. Unless the slab allocator is 95 | * used, after writing the tuple, pfree() the out-of-line data (not the 96 | * SortTuple struct!), and increase state->availMem by the amount of 97 | * memory space thereby released. 98 | */ 99 | void (*writetup) (Tuplesortstate *state, int tapenum, 100 | SortTuple *stup); 101 | 102 | /* 103 | * Function to read a stored tuple from tape back into memory. 'len' is 104 | * the already-read length of the stored tuple. The tuple is allocated 105 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 106 | */ 107 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 108 | int tapenum, unsigned int len); 109 | 110 | /* 111 | * This array holds the tuples now in sort memory.
If we are in state 112 | * INITIAL, the tuples are in no particular order; if we are in state 113 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 114 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 115 | * H. In state SORTEDONTAPE, the array is not used. 116 | */ 117 | SortTuple *memtuples; /* array of SortTuple structs */ 118 | int memtupcount; /* number of tuples currently present */ 119 | int memtupsize; /* allocated length of memtuples array */ 120 | bool growmemtuples; /* memtuples' growth still underway? */ 121 | 122 | /* 123 | * Memory for tuples is sometimes allocated using a simple slab allocator, 124 | * rather than with palloc(). Currently, we switch to slab allocation 125 | * when we start merging. Merging only needs to keep a small, fixed 126 | * number of tuples in memory at any time, so we can avoid the 127 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 128 | * to hold the tuples. 129 | * 130 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 131 | * slots. The allocation is sized to have one slot per tape, plus one 132 | * additional slot. We need that many slots to hold all the tuples kept 133 | * in the heap during merge, plus the one we have last returned from the 134 | * sort, with tuplesort_gettuple. 135 | * 136 | * Initially, all the slots are kept in a linked list of free slots. When 137 | * a tuple is read from a tape, it is put to the next available slot, if 138 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 139 | * instead. 140 | * 141 | * When we're done processing a tuple, we return the slot back to the free 142 | * list, or pfree() if it was palloc'd. We know that a tuple was 143 | * allocated from the slab, if its pointer value is between 144 | * slabMemoryBegin and -End. 145 | * 146 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 147 | * tracking memory usage is not used. 148 | */ 149 | bool slabAllocatorUsed; 150 | 151 | char *slabMemoryBegin; /* beginning of slab memory arena */ 152 | char *slabMemoryEnd; /* end of slab memory arena */ 153 | SlabSlot *slabFreeHead; /* head of free list */ 154 | 155 | /* Buffer size to use for reading input tapes, during merge. */ 156 | size_t read_buffer_size; 157 | 158 | /* 159 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 160 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 161 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 162 | * recycle the memory on next gettuple call. 163 | */ 164 | void *lastReturnedTuple; 165 | 166 | /* 167 | * While building initial runs, this indicates if the replacement 168 | * selection strategy is in use. When it isn't, then a simple hybrid 169 | * sort-merge strategy is in use instead (runs are quicksorted). 170 | */ 171 | bool replaceActive; 172 | 173 | /* 174 | * While building initial runs, this is the current output run number 175 | * (starting at RUN_FIRST). Afterwards, it is the number of initial runs 176 | * we made. 
177 | */ 178 | int currentRun; 179 | } pgsrt_Tuplesortstate; 180 | 181 | #endif /* PG_SORTSTATS_IMPORT_PG10_H */ 182 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg11.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG11_H 2 | #define PG_SORTSTATS_IMPORT_PG11_H 3 | 4 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 5 | 6 | /* 7 | * AllocChunk 8 | * The prefix of each piece of memory in an AllocBlock 9 | * 10 | * Note: to meet the memory context APIs, the payload area of the chunk must 11 | * be maxaligned, and the "aset" link must be immediately adjacent to the 12 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 13 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 14 | * we can ensure things work by adding any required alignment padding before 15 | * the "aset" field. There is a static assertion below that the alignment 16 | * is done correctly. 17 | */ 18 | typedef struct pgsrt_AllocChunkData 19 | { 20 | /* size is always the size of the usable space in the chunk */ 21 | Size size; 22 | #ifdef MEMORY_CONTEXT_CHECKING 23 | /* when debugging memory usage, also store actual requested size */ 24 | /* this is zero in a free chunk */ 25 | Size requested_size; 26 | 27 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 28 | #else 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 30 | #endif /* MEMORY_CONTEXT_CHECKING */ 31 | 32 | /* ensure proper alignment by adding padding if needed */ 33 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 34 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 35 | #endif 36 | 37 | /* aset is the owning aset if allocated, or the freelist link if free */ 38 | void *aset; 39 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 40 | } pgsrt_AllocChunkData; 41 | 42 | 43 | #define SLAB_SLOT_SIZE 1024 44 | typedef union SlabSlot 45 | { 46 | union SlabSlot *nextfree; 47 | char buffer[SLAB_SLOT_SIZE]; 48 | } SlabSlot; 49 | 50 | typedef struct 51 | { 52 | void *tuple; /* the tuple itself */ 53 | Datum datum1; /* value of first key column */ 54 | bool isnull1; /* is first key column NULL? */ 55 | int tupindex; /* see notes above */ 56 | } SortTuple; 57 | 58 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 59 | Tuplesortstate *state); 60 | 61 | typedef enum 62 | { 63 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 64 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 65 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 66 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 67 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 68 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 69 | } TupSortStatus; 70 | 71 | typedef struct pgsrt_Tuplesortstate 72 | { 73 | TupSortStatus status; /* enumerated value as shown above */ 74 | int nKeys; /* number of columns in sort key */ 75 | bool randomAccess; /* did caller request random access? */ 76 | bool bounded; /* did caller specify a maximum number of 77 | * tuples to return? */ 78 | bool boundUsed; /* true if we made use of a bounded heap */ 79 | int bound; /* if bounded, the maximum number of tuples */ 80 | bool tuples; /* Can SortTuple.tuple ever be set? 
*/ 81 | int64 availMem; /* remaining memory available, in bytes */ 82 | int64 allowedMem; /* total memory allowed, in bytes */ 83 | int maxTapes; /* number of tapes (Knuth's T) */ 84 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 85 | MemoryContext sortcontext; /* memory context holding most sort data */ 86 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 87 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 88 | 89 | /* 90 | * These function pointers decouple the routines that must know what kind 91 | * of tuple we are sorting from the routines that don't need to know it. 92 | * They are set up by the tuplesort_begin_xxx routines. 93 | * 94 | * Function to compare two tuples; result is per qsort() convention, ie: 95 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 96 | * qsort_arg_comparator. 97 | */ 98 | SortTupleComparator comparetup; 99 | 100 | /* 101 | * Function to copy a supplied input tuple into palloc'd space and set up 102 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 103 | * state->availMem must be decreased by the amount of space used for the 104 | * tuple copy (note the SortTuple struct itself is not counted). 105 | */ 106 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 107 | 108 | /* 109 | * Function to write a stored tuple onto tape. The representation of the 110 | * tuple on tape need not be the same as it is in memory; requirements on 111 | * the tape representation are given below. Unless the slab allocator is 112 | * used, after writing the tuple, pfree() the out-of-line data (not the 113 | * SortTuple struct!), and increase state->availMem by the amount of 114 | * memory space thereby released. 115 | */ 116 | void (*writetup) (Tuplesortstate *state, int tapenum, 117 | SortTuple *stup); 118 | 119 | /* 120 | * Function to read a stored tuple from tape back into memory. 'len' is 121 | * the already-read length of the stored tuple. The tuple is allocated 122 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 123 | */ 124 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 125 | int tapenum, unsigned int len); 126 | 127 | /* 128 | * This array holds the tuples now in sort memory. If we are in state 129 | * INITIAL, the tuples are in no particular order; if we are in state 130 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 131 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 132 | * H. In state SORTEDONTAPE, the array is not used. 133 | */ 134 | SortTuple *memtuples; /* array of SortTuple structs */ 135 | int memtupcount; /* number of tuples currently present */ 136 | int memtupsize; /* allocated length of memtuples array */ 137 | bool growmemtuples; /* memtuples' growth still underway? */ 138 | 139 | /* 140 | * Memory for tuples is sometimes allocated using a simple slab allocator, 141 | * rather than with palloc(). Currently, we switch to slab allocation 142 | * when we start merging. Merging only needs to keep a small, fixed 143 | * number of tuples in memory at any time, so we can avoid the 144 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 145 | * to hold the tuples. 146 | * 147 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 148 | * slots. The allocation is sized to have one slot per tape, plus one 149 | * additional slot.
We need that many slots to hold all the tuples kept 150 | * in the heap during merge, plus the one we have last returned from the 151 | * sort, with tuplesort_gettuple. 152 | * 153 | * Initially, all the slots are kept in a linked list of free slots. When 154 | * a tuple is read from a tape, it is put to the next available slot, if 155 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 156 | * instead. 157 | * 158 | * When we're done processing a tuple, we return the slot back to the free 159 | * list, or pfree() if it was palloc'd. We know that a tuple was 160 | * allocated from the slab, if its pointer value is between 161 | * slabMemoryBegin and -End. 162 | * 163 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 164 | * tracking memory usage is not used. 165 | */ 166 | bool slabAllocatorUsed; 167 | 168 | char *slabMemoryBegin; /* beginning of slab memory arena */ 169 | char *slabMemoryEnd; /* end of slab memory arena */ 170 | SlabSlot *slabFreeHead; /* head of free list */ 171 | 172 | /* Buffer size to use for reading input tapes, during merge. */ 173 | size_t read_buffer_size; 174 | 175 | /* 176 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 177 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 178 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 179 | * recycle the memory on next gettuple call. 180 | */ 181 | void *lastReturnedTuple; 182 | 183 | /* 184 | * While building initial runs, this is the current output run number. 185 | * Afterwards, it is the number of initial runs we made. 186 | */ 187 | int currentRun; 188 | } pgsrt_Tuplesortstate; 189 | 190 | #endif /* PG_SORTSTATS_IMPORT_PG11_H */ 191 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg12.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG12_H 2 | #define PG_SORTSTATS_IMPORT_PG12_H 3 | 4 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 5 | 6 | /* 7 | * AllocChunk 8 | * The prefix of each piece of memory in an AllocBlock 9 | * 10 | * Note: to meet the memory context APIs, the payload area of the chunk must 11 | * be maxaligned, and the "aset" link must be immediately adjacent to the 12 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 13 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 14 | * we can ensure things work by adding any required alignment padding before 15 | * the "aset" field. There is a static assertion below that the alignment 16 | * is done correctly. 
17 | */ 18 | typedef struct pgsrt_AllocChunkData 19 | { 20 | /* size is always the size of the usable space in the chunk */ 21 | Size size; 22 | #ifdef MEMORY_CONTEXT_CHECKING 23 | /* when debugging memory usage, also store actual requested size */ 24 | /* this is zero in a free chunk */ 25 | Size requested_size; 26 | 27 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 28 | #else 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 30 | #endif /* MEMORY_CONTEXT_CHECKING */ 31 | 32 | /* ensure proper alignment by adding padding if needed */ 33 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 34 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 35 | #endif 36 | 37 | /* aset is the owning aset if allocated, or the freelist link if free */ 38 | void *aset; 39 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 40 | } pgsrt_AllocChunkData; 41 | 42 | 43 | #define SLAB_SLOT_SIZE 1024 44 | typedef union SlabSlot 45 | { 46 | union SlabSlot *nextfree; 47 | char buffer[SLAB_SLOT_SIZE]; 48 | } SlabSlot; 49 | 50 | typedef struct 51 | { 52 | void *tuple; /* the tuple itself */ 53 | Datum datum1; /* value of first key column */ 54 | bool isnull1; /* is first key column NULL? */ 55 | int tupindex; /* see notes above */ 56 | } SortTuple; 57 | 58 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 59 | Tuplesortstate *state); 60 | typedef enum 61 | { 62 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 63 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 64 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 65 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 66 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 67 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 68 | } TupSortStatus; 69 | 70 | typedef struct pgsrt_Tuplesortstate 71 | { 72 | TupSortStatus status; /* enumerated value as shown above */ 73 | int nKeys; /* number of columns in sort key */ 74 | bool randomAccess; /* did caller request random access? */ 75 | bool bounded; /* did caller specify a maximum number of 76 | * tuples to return? */ 77 | bool boundUsed; /* true if we made use of a bounded heap */ 78 | int bound; /* if bounded, the maximum number of tuples */ 79 | bool tuples; /* Can SortTuple.tuple ever be set? */ 80 | int64 availMem; /* remaining memory available, in bytes */ 81 | int64 allowedMem; /* total memory allowed, in bytes */ 82 | int maxTapes; /* number of tapes (Knuth's T) */ 83 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 84 | MemoryContext sortcontext; /* memory context holding most sort data */ 85 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 86 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 87 | 88 | /* 89 | * These function pointers decouple the routines that must know what kind 90 | * of tuple we are sorting from the routines that don't need to know it. 91 | * They are set up by the tuplesort_begin_xxx routines. 92 | * 93 | * Function to compare two tuples; result is per qsort() convention, ie: 94 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 95 | * qsort_arg_comparator. 96 | */ 97 | SortTupleComparator comparetup; 98 | 99 | /* 100 | * Function to copy a supplied input tuple into palloc'd space and set up 101 | * its SortTuple representation (ie, set tuple/datum1/isnull1).
Also, 102 | * state->availMem must be decreased by the amount of space used for the 103 | * tuple copy (note the SortTuple struct itself is not counted). 104 | */ 105 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 106 | 107 | /* 108 | * Function to write a stored tuple onto tape. The representation of the 109 | * tuple on tape need not be the same as it is in memory; requirements on 110 | * the tape representation are given below. Unless the slab allocator is 111 | * used, after writing the tuple, pfree() the out-of-line data (not the 112 | * SortTuple struct!), and increase state->availMem by the amount of 113 | * memory space thereby released. 114 | */ 115 | void (*writetup) (Tuplesortstate *state, int tapenum, 116 | SortTuple *stup); 117 | 118 | /* 119 | * Function to read a stored tuple from tape back into memory. 'len' is 120 | * the already-read length of the stored tuple. The tuple is allocated 121 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 122 | */ 123 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 124 | int tapenum, unsigned int len); 125 | 126 | /* 127 | * This array holds the tuples now in sort memory. If we are in state 128 | * INITIAL, the tuples are in no particular order; if we are in state 129 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 130 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 131 | * H. In state SORTEDONTAPE, the array is not used. 132 | */ 133 | SortTuple *memtuples; /* array of SortTuple structs */ 134 | int memtupcount; /* number of tuples currently present */ 135 | int memtupsize; /* allocated length of memtuples array */ 136 | bool growmemtuples; /* memtuples' growth still underway? */ 137 | 138 | /* 139 | * Memory for tuples is sometimes allocated using a simple slab allocator, 140 | * rather than with palloc(). Currently, we switch to slab allocation 141 | * when we start merging. Merging only needs to keep a small, fixed 142 | * number of tuples in memory at any time, so we can avoid the 143 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 144 | * to hold the tuples. 145 | * 146 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 147 | * slots. The allocation is sized to have one slot per tape, plus one 148 | * additional slot. We need that many slots to hold all the tuples kept 149 | * in the heap during merge, plus the one we have last returned from the 150 | * sort, with tuplesort_gettuple. 151 | * 152 | * Initially, all the slots are kept in a linked list of free slots. When 153 | * a tuple is read from a tape, it is put to the next available slot, if 154 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 155 | * instead. 156 | * 157 | * When we're done processing a tuple, we return the slot back to the free 158 | * list, or pfree() if it was palloc'd. We know that a tuple was 159 | * allocated from the slab, if its pointer value is between 160 | * slabMemoryBegin and -End. 161 | * 162 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 163 | * tracking memory usage is not used. 164 | */ 165 | bool slabAllocatorUsed; 166 | 167 | char *slabMemoryBegin; /* beginning of slab memory arena */ 168 | char *slabMemoryEnd; /* end of slab memory arena */ 169 | SlabSlot *slabFreeHead; /* head of free list */ 170 | 171 | /* Buffer size to use for reading input tapes, during merge. 
*/ 172 | size_t read_buffer_size; 173 | 174 | /* 175 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 176 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 177 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 178 | * recycle the memory on next gettuple call. 179 | */ 180 | void *lastReturnedTuple; 181 | 182 | /* 183 | * While building initial runs, this is the current output run number. 184 | * Afterwards, it is the number of initial runs we made. 185 | */ 186 | int currentRun; 187 | } pgsrt_Tuplesortstate; 188 | 189 | #endif /* PG_SORTSTATS_IMPORT_PG12_H */ 190 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg13.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG13_H 2 | #define PG_SORTSTATS_IMPORT_PG13_H 3 | 4 | #include "utils/pg_rusage.h" 5 | 6 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 7 | 8 | /* 9 | * AllocChunk 10 | * The prefix of each piece of memory in an AllocBlock 11 | * 12 | * Note: to meet the memory context APIs, the payload area of the chunk must 13 | * be maxaligned, and the "aset" link must be immediately adjacent to the 14 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 15 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 16 | * we can ensure things work by adding any required alignment padding before 17 | * the "aset" field. There is a static assertion below that the alignment 18 | * is done correctly. 19 | */ 20 | typedef struct pgsrt_AllocChunkData 21 | { 22 | /* size is always the size of the usable space in the chunk */ 23 | Size size; 24 | #ifdef MEMORY_CONTEXT_CHECKING 25 | /* when debugging memory usage, also store actual requested size */ 26 | /* this is zero in a free chunk */ 27 | Size requested_size; 28 | 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 30 | #else 31 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 32 | #endif /* MEMORY_CONTEXT_CHECKING */ 33 | 34 | /* ensure proper alignment by adding padding if needed */ 35 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 36 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 37 | #endif 38 | 39 | /* aset is the owning aset if allocated, or the freelist link if free */ 40 | void *aset; 41 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 42 | } pgsrt_AllocChunkData; 43 | 44 | 45 | #define SLAB_SLOT_SIZE 1024 46 | typedef union SlabSlot 47 | { 48 | union SlabSlot *nextfree; 49 | char buffer[SLAB_SLOT_SIZE]; 50 | } SlabSlot; 51 | 52 | typedef struct 53 | { 54 | void *tuple; /* the tuple itself */ 55 | Datum datum1; /* value of first key column */ 56 | bool isnull1; /* is first key column NULL? 
*/ 57 | int tupindex; /* see notes above */ 58 | } SortTuple; 59 | 60 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 61 | Tuplesortstate *state); 62 | typedef enum 63 | { 64 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 65 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 66 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 67 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 68 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 69 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 70 | } TupSortStatus; 71 | 72 | typedef struct pgsrt_Tuplesortstate 73 | { 74 | TupSortStatus status; /* enumerated value as shown above */ 75 | int nKeys; /* number of columns in sort key */ 76 | bool randomAccess; /* did caller request random access? */ 77 | bool bounded; /* did caller specify a maximum number of 78 | * tuples to return? */ 79 | bool boundUsed; /* true if we made use of a bounded heap */ 80 | int bound; /* if bounded, the maximum number of tuples */ 81 | bool tuples; /* Can SortTuple.tuple ever be set? */ 82 | int64 availMem; /* remaining memory available, in bytes */ 83 | int64 allowedMem; /* total memory allowed, in bytes */ 84 | int maxTapes; /* number of tapes (Knuth's T) */ 85 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 86 | int64 maxSpace; /* maximum amount of space occupied among sort 87 | * of groups, either in-memory or on-disk */ 88 | bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk 89 | * space, false when it's value for in-memory 90 | * space */ 91 | TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ 92 | MemoryContext maincontext; /* memory context for tuple sort metadata that 93 | * persists across multiple batches */ 94 | MemoryContext sortcontext; /* memory context holding most sort data */ 95 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 96 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 97 | 98 | /* 99 | * These function pointers decouple the routines that must know what kind 100 | * of tuple we are sorting from the routines that don't need to know it. 101 | * They are set up by the tuplesort_begin_xxx routines. 102 | * 103 | * Function to compare two tuples; result is per qsort() convention, ie: 104 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 105 | * qsort_arg_comparator. 106 | */ 107 | SortTupleComparator comparetup; 108 | 109 | /* 110 | * Function to copy a supplied input tuple into palloc'd space and set up 111 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 112 | * state->availMem must be decreased by the amount of space used for the 113 | * tuple copy (note the SortTuple struct itself is not counted). 114 | */ 115 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 116 | 117 | /* 118 | * Function to write a stored tuple onto tape. The representation of the 119 | * tuple on tape need not be the same as it is in memory; requirements on 120 | * the tape representation are given below. Unless the slab allocator is 121 | * used, after writing the tuple, pfree() the out-of-line data (not the 122 | * SortTuple struct!), and increase state->availMem by the amount of 123 | * memory space thereby released. 124 | */ 125 | void (*writetup) (Tuplesortstate *state, int tapenum, 126 | SortTuple *stup); 127 | 128 | /* 129 | * Function to read a stored tuple from tape back into memory. 'len' is 130 | * the already-read length of the stored tuple.
The tuple is allocated 131 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 132 | */ 133 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 134 | int tapenum, unsigned int len); 135 | 136 | /* 137 | * This array holds the tuples now in sort memory. If we are in state 138 | * INITIAL, the tuples are in no particular order; if we are in state 139 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 140 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 141 | * H. In state SORTEDONTAPE, the array is not used. 142 | */ 143 | SortTuple *memtuples; /* array of SortTuple structs */ 144 | int memtupcount; /* number of tuples currently present */ 145 | int memtupsize; /* allocated length of memtuples array */ 146 | bool growmemtuples; /* memtuples' growth still underway? */ 147 | 148 | /* 149 | * Memory for tuples is sometimes allocated using a simple slab allocator, 150 | * rather than with palloc(). Currently, we switch to slab allocation 151 | * when we start merging. Merging only needs to keep a small, fixed 152 | * number of tuples in memory at any time, so we can avoid the 153 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 154 | * to hold the tuples. 155 | * 156 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 157 | * slots. The allocation is sized to have one slot per tape, plus one 158 | * additional slot. We need that many slots to hold all the tuples kept 159 | * in the heap during merge, plus the one we have last returned from the 160 | * sort, with tuplesort_gettuple. 161 | * 162 | * Initially, all the slots are kept in a linked list of free slots. When 163 | * a tuple is read from a tape, it is put to the next available slot, if 164 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 165 | * instead. 166 | * 167 | * When we're done processing a tuple, we return the slot back to the free 168 | * list, or pfree() if it was palloc'd. We know that a tuple was 169 | * allocated from the slab, if its pointer value is between 170 | * slabMemoryBegin and -End. 171 | * 172 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 173 | * tracking memory usage is not used. 174 | */ 175 | bool slabAllocatorUsed; 176 | 177 | char *slabMemoryBegin; /* beginning of slab memory arena */ 178 | char *slabMemoryEnd; /* end of slab memory arena */ 179 | SlabSlot *slabFreeHead; /* head of free list */ 180 | 181 | /* Buffer size to use for reading input tapes, during merge. */ 182 | size_t read_buffer_size; 183 | 184 | /* 185 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 186 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 187 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 188 | * recycle the memory on next gettuple call. 189 | */ 190 | void *lastReturnedTuple; 191 | 192 | /* 193 | * While building initial runs, this is the current output run number. 194 | * Afterwards, it is the number of initial runs we made. 195 | */ 196 | int currentRun; 197 | 198 | /* 199 | * Unless otherwise noted, all pointer variables below are pointers to 200 | * arrays of length maxTapes, holding per-tape data. 201 | */ 202 | 203 | /* 204 | * This variable is only used during merge passes. mergeactive[i] is true 205 | * if we are reading an input run from (actual) tape number i and have not 206 | * yet exhausted that run. 207 | */ 208 | bool *mergeactive; /* active input run source? 
*/ 209 | 210 | /* 211 | * Variables for Algorithm D. Note that destTape is a "logical" tape 212 | * number, ie, an index into the tp_xxx[] arrays. Be careful to keep 213 | * "logical" and "actual" tape numbers straight! 214 | */ 215 | int Level; /* Knuth's l */ 216 | int destTape; /* current output tape (Knuth's j, less 1) */ 217 | int *tp_fib; /* Target Fibonacci run counts (A[]) */ 218 | int *tp_runs; /* # of real runs on each tape */ 219 | int *tp_dummy; /* # of dummy runs for each tape (D[]) */ 220 | int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ 221 | int activeTapes; /* # of active input tapes in merge pass */ 222 | 223 | /* 224 | * These variables are used after completion of sorting to keep track of 225 | * the next tuple to return. (In the tape case, the tape's current read 226 | * position is also critical state.) 227 | */ 228 | int result_tape; /* actual tape number of finished output */ 229 | int current; /* array index (only used if SORTEDINMEM) */ 230 | bool eof_reached; /* reached EOF (needed for cursors) */ 231 | 232 | /* markpos_xxx holds marked position for mark and restore */ 233 | long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ 234 | int markpos_offset; /* saved "current", or offset in tape block */ 235 | bool markpos_eof; /* saved "eof_reached" */ 236 | 237 | /* 238 | * These variables are used during parallel sorting. 239 | * 240 | * worker is our worker identifier. Follows the general convention that 241 | * -1 value relates to a leader tuplesort, and values >= 0 worker 242 | * tuplesorts. (-1 can also be a serial tuplesort.) 243 | * 244 | * shared is mutable shared memory state, which is used to coordinate 245 | * parallel sorts. 246 | * 247 | * nParticipants is the number of worker Tuplesortstates known by the 248 | * leader to have actually been launched, which implies that they must 249 | * finish a run leader can merge. Typically includes a worker state held 250 | * by the leader process itself. Set in the leader Tuplesortstate only. 251 | */ 252 | int worker; 253 | Sharedsort *shared; 254 | int nParticipants; 255 | 256 | /* 257 | * The sortKeys variable is used by every case other than the hash index 258 | * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the 259 | * MinimalTuple and CLUSTER routines, though. 260 | */ 261 | TupleDesc tupDesc; 262 | SortSupport sortKeys; /* array of length nKeys */ 263 | 264 | /* 265 | * This variable is shared by the single-key MinimalTuple case and the 266 | * Datum case (which both use qsort_ssup()). Otherwise it's NULL. 267 | */ 268 | SortSupport onlyKey; 269 | 270 | /* 271 | * Additional state for managing "abbreviated key" sortsupport routines 272 | * (which currently may be used by all cases except the hash index case). 273 | * Tracks the intervals at which the optimization's effectiveness is 274 | * tested. 275 | */ 276 | int64 abbrevNext; /* Tuple # at which to next check 277 | * applicability */ 278 | 279 | /* 280 | * These variables are specific to the CLUSTER case; they are set by 281 | * tuplesort_begin_cluster. 282 | */ 283 | IndexInfo *indexInfo; /* info about index being used for reference */ 284 | EState *estate; /* for evaluating index expressions */ 285 | 286 | /* 287 | * These variables are specific to the IndexTuple case; they are set by 288 | * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
289 | */ 290 | Relation heapRel; /* table the index is being built on */ 291 | Relation indexRel; /* index being built */ 292 | 293 | /* These are specific to the index_btree subcase: */ 294 | bool enforceUnique; /* complain if we find duplicate tuples */ 295 | 296 | /* These are specific to the index_hash subcase: */ 297 | uint32 high_mask; /* masks for sortable part of hash code */ 298 | uint32 low_mask; 299 | uint32 max_buckets; 300 | 301 | /* 302 | * These variables are specific to the Datum case; they are set by 303 | * tuplesort_begin_datum and used only by the DatumTuple routines. 304 | */ 305 | Oid datumType; 306 | /* we need typelen in order to know how to copy the Datums. */ 307 | int datumTypeLen; 308 | 309 | /* 310 | * Resource snapshot for time of sort start. 311 | */ 312 | #ifdef TRACE_SORT 313 | PGRUsage ru_start; 314 | #endif 315 | } pgsrt_Tuplesortstate; 316 | 317 | #endif /* PG_SORTSTATS_IMPORT_PG13_H */ 318 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg14.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG14_H 2 | #define PG_SORTSTATS_IMPORT_PG14_H 3 | 4 | #include "utils/pg_rusage.h" 5 | 6 | #define PGSRT_ALLOC_CHUNKHDRSZ sizeof(struct pgsrt_AllocChunkData) 7 | 8 | /* 9 | * AllocChunk 10 | * The prefix of each piece of memory in an AllocBlock 11 | * 12 | * Note: to meet the memory context APIs, the payload area of the chunk must 13 | * be maxaligned, and the "aset" link must be immediately adjacent to the 14 | * payload area (cf. GetMemoryChunkContext). We simplify matters for this 15 | * module by requiring sizeof(AllocChunkData) to be maxaligned, and then 16 | * we can ensure things work by adding any required alignment padding before 17 | * the "aset" field. There is a static assertion below that the alignment 18 | * is done correctly. 19 | */ 20 | typedef struct pgsrt_AllocChunkData 21 | { 22 | /* size is always the size of the usable space in the chunk */ 23 | Size size; 24 | #ifdef MEMORY_CONTEXT_CHECKING 25 | /* when debugging memory usage, also store actual requested size */ 26 | /* this is zero in a free chunk */ 27 | Size requested_size; 28 | 29 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) 30 | #else 31 | #define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) 32 | #endif /* MEMORY_CONTEXT_CHECKING */ 33 | 34 | /* ensure proper alignment by adding padding if needed */ 35 | #if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 36 | char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; 37 | #endif 38 | 39 | /* aset is the owning aset if allocated, or the freelist link if free */ 40 | void *aset; 41 | /* there must not be any padding to reach a MAXALIGN boundary here! */ 42 | } pgsrt_AllocChunkData; 43 | 44 | 45 | #define SLAB_SLOT_SIZE 1024 46 | typedef union SlabSlot 47 | { 48 | union SlabSlot *nextfree; 49 | char buffer[SLAB_SLOT_SIZE]; 50 | } SlabSlot; 51 | 52 | typedef struct 53 | { 54 | void *tuple; /* the tuple itself */ 55 | Datum datum1; /* value of first key column */ 56 | bool isnull1; /* is first key column NULL? 
*/ 57 | int tupindex; /* see notes above */ 58 | } SortTuple; 59 | 60 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 61 | Tuplesortstate *state); 62 | typedef enum 63 | { 64 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 65 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 66 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 67 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 68 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 69 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 70 | } TupSortStatus; 71 | 72 | typedef struct pgsrt_Tuplesortstate 73 | { 74 | TupSortStatus status; /* enumerated value as shown above */ 75 | int nKeys; /* number of columns in sort key */ 76 | bool randomAccess; /* did caller request random access? */ 77 | bool bounded; /* did caller specify a maximum number of 78 | * tuples to return? */ 79 | bool boundUsed; /* true if we made use of a bounded heap */ 80 | int bound; /* if bounded, the maximum number of tuples */ 81 | bool tuples; /* Can SortTuple.tuple ever be set? */ 82 | int64 availMem; /* remaining memory available, in bytes */ 83 | int64 allowedMem; /* total memory allowed, in bytes */ 84 | int maxTapes; /* number of tapes (Knuth's T) */ 85 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 86 | int64 maxSpace; /* maximum amount of space occupied among sort 87 | * of groups, either in-memory or on-disk */ 88 | bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk 89 | * space, false when it's value for in-memory 90 | * space */ 91 | TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ 92 | MemoryContext maincontext; /* memory context for tuple sort metadata that 93 | * persists across multiple batches */ 94 | MemoryContext sortcontext; /* memory context holding most sort data */ 95 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 96 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 97 | 98 | /* 99 | * These function pointers decouple the routines that must know what kind 100 | * of tuple we are sorting from the routines that don't need to know it. 101 | * They are set up by the tuplesort_begin_xxx routines. 102 | * 103 | * Function to compare two tuples; result is per qsort() convention, ie: 104 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 105 | * qsort_arg_comparator. 106 | */ 107 | SortTupleComparator comparetup; 108 | 109 | /* 110 | * Function to copy a supplied input tuple into palloc'd space and set up 111 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 112 | * state->availMem must be decreased by the amount of space used for the 113 | * tuple copy (note the SortTuple struct itself is not counted). 114 | */ 115 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 116 | 117 | /* 118 | * Function to write a stored tuple onto tape. The representation of the 119 | * tuple on tape need not be the same as it is in memory; requirements on 120 | * the tape representation are given below. Unless the slab allocator is 121 | * used, after writing the tuple, pfree() the out-of-line data (not the 122 | * SortTuple struct!), and increase state->availMem by the amount of 123 | * memory space thereby released. 124 | */ 125 | void (*writetup) (Tuplesortstate *state, int tapenum, 126 | SortTuple *stup); 127 | 128 | /* 129 | * Function to read a stored tuple from tape back into memory. 'len' is 130 | * the already-read length of the stored tuple.
The tuple is allocated 131 | * from the slab memory arena, or is palloc'd, see readtup_alloc(). 132 | */ 133 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 134 | int tapenum, unsigned int len); 135 | 136 | /* 137 | * This array holds the tuples now in sort memory. If we are in state 138 | * INITIAL, the tuples are in no particular order; if we are in state 139 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 140 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 141 | * H. In state SORTEDONTAPE, the array is not used. 142 | */ 143 | SortTuple *memtuples; /* array of SortTuple structs */ 144 | int memtupcount; /* number of tuples currently present */ 145 | int memtupsize; /* allocated length of memtuples array */ 146 | bool growmemtuples; /* memtuples' growth still underway? */ 147 | 148 | /* 149 | * Memory for tuples is sometimes allocated using a simple slab allocator, 150 | * rather than with palloc(). Currently, we switch to slab allocation 151 | * when we start merging. Merging only needs to keep a small, fixed 152 | * number of tuples in memory at any time, so we can avoid the 153 | * palloc/pfree overhead by recycling a fixed number of fixed-size slots 154 | * to hold the tuples. 155 | * 156 | * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE 157 | * slots. The allocation is sized to have one slot per tape, plus one 158 | * additional slot. We need that many slots to hold all the tuples kept 159 | * in the heap during merge, plus the one we have last returned from the 160 | * sort, with tuplesort_gettuple. 161 | * 162 | * Initially, all the slots are kept in a linked list of free slots. When 163 | * a tuple is read from a tape, it is put to the next available slot, if 164 | * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd 165 | * instead. 166 | * 167 | * When we're done processing a tuple, we return the slot back to the free 168 | * list, or pfree() if it was palloc'd. We know that a tuple was 169 | * allocated from the slab, if its pointer value is between 170 | * slabMemoryBegin and -End. 171 | * 172 | * When the slab allocator is used, the USEMEM/LACKMEM mechanism of 173 | * tracking memory usage is not used. 174 | */ 175 | bool slabAllocatorUsed; 176 | 177 | char *slabMemoryBegin; /* beginning of slab memory arena */ 178 | char *slabMemoryEnd; /* end of slab memory arena */ 179 | SlabSlot *slabFreeHead; /* head of free list */ 180 | 181 | /* Buffer size to use for reading input tapes, during merge. */ 182 | size_t read_buffer_size; 183 | 184 | /* 185 | * When we return a tuple to the caller in tuplesort_gettuple_XXX, that 186 | * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE 187 | * modes), we remember the tuple in 'lastReturnedTuple', so that we can 188 | * recycle the memory on next gettuple call. 189 | */ 190 | void *lastReturnedTuple; 191 | 192 | /* 193 | * While building initial runs, this is the current output run number. 194 | * Afterwards, it is the number of initial runs we made. 195 | */ 196 | int currentRun; 197 | 198 | /* 199 | * Unless otherwise noted, all pointer variables below are pointers to 200 | * arrays of length maxTapes, holding per-tape data. 201 | */ 202 | 203 | /* 204 | * This variable is only used during merge passes. mergeactive[i] is true 205 | * if we are reading an input run from (actual) tape number i and have not 206 | * yet exhausted that run. 207 | */ 208 | bool *mergeactive; /* active input run source? 
*/ 209 | 210 | /* 211 | * Variables for Algorithm D. Note that destTape is a "logical" tape 212 | * number, ie, an index into the tp_xxx[] arrays. Be careful to keep 213 | * "logical" and "actual" tape numbers straight! 214 | */ 215 | int Level; /* Knuth's l */ 216 | int destTape; /* current output tape (Knuth's j, less 1) */ 217 | int *tp_fib; /* Target Fibonacci run counts (A[]) */ 218 | int *tp_runs; /* # of real runs on each tape */ 219 | int *tp_dummy; /* # of dummy runs for each tape (D[]) */ 220 | int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ 221 | int activeTapes; /* # of active input tapes in merge pass */ 222 | 223 | /* 224 | * These variables are used after completion of sorting to keep track of 225 | * the next tuple to return. (In the tape case, the tape's current read 226 | * position is also critical state.) 227 | */ 228 | int result_tape; /* actual tape number of finished output */ 229 | int current; /* array index (only used if SORTEDINMEM) */ 230 | bool eof_reached; /* reached EOF (needed for cursors) */ 231 | 232 | /* markpos_xxx holds marked position for mark and restore */ 233 | long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ 234 | int markpos_offset; /* saved "current", or offset in tape block */ 235 | bool markpos_eof; /* saved "eof_reached" */ 236 | 237 | /* 238 | * These variables are used during parallel sorting. 239 | * 240 | * worker is our worker identifier. Follows the general convention that 241 | * -1 value relates to a leader tuplesort, and values >= 0 worker 242 | * tuplesorts. (-1 can also be a serial tuplesort.) 243 | * 244 | * shared is mutable shared memory state, which is used to coordinate 245 | * parallel sorts. 246 | * 247 | * nParticipants is the number of worker Tuplesortstates known by the 248 | * leader to have actually been launched, which implies that they must 249 | * finish a run leader can merge. Typically includes a worker state held 250 | * by the leader process itself. Set in the leader Tuplesortstate only. 251 | */ 252 | int worker; 253 | Sharedsort *shared; 254 | int nParticipants; 255 | 256 | /* 257 | * The sortKeys variable is used by every case other than the hash index 258 | * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the 259 | * MinimalTuple and CLUSTER routines, though. 260 | */ 261 | TupleDesc tupDesc; 262 | SortSupport sortKeys; /* array of length nKeys */ 263 | 264 | /* 265 | * This variable is shared by the single-key MinimalTuple case and the 266 | * Datum case (which both use qsort_ssup()). Otherwise it's NULL. 267 | */ 268 | SortSupport onlyKey; 269 | 270 | /* 271 | * Additional state for managing "abbreviated key" sortsupport routines 272 | * (which currently may be used by all cases except the hash index case). 273 | * Tracks the intervals at which the optimization's effectiveness is 274 | * tested. 275 | */ 276 | int64 abbrevNext; /* Tuple # at which to next check 277 | * applicability */ 278 | 279 | /* 280 | * These variables are specific to the CLUSTER case; they are set by 281 | * tuplesort_begin_cluster. 282 | */ 283 | IndexInfo *indexInfo; /* info about index being used for reference */ 284 | EState *estate; /* for evaluating index expressions */ 285 | 286 | /* 287 | * These variables are specific to the IndexTuple case; they are set by 288 | * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
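 * (e.g. tuplesort_begin_index_btree and tuplesort_begin_index_hash)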
289 | */ 290 | Relation heapRel; /* table the index is being built on */ 291 | Relation indexRel; /* index being built */ 292 | 293 | /* These are specific to the index_btree subcase: */ 294 | bool enforceUnique; /* complain if we find duplicate tuples */ 295 | 296 | /* These are specific to the index_hash subcase: */ 297 | uint32 high_mask; /* masks for sortable part of hash code */ 298 | uint32 low_mask; 299 | uint32 max_buckets; 300 | 301 | /* 302 | * These variables are specific to the Datum case; they are set by 303 | * tuplesort_begin_datum and used only by the DatumTuple routines. 304 | */ 305 | Oid datumType; 306 | /* we need typelen in order to know how to copy the Datums. */ 307 | int datumTypeLen; 308 | 309 | /* 310 | * Resource snapshot for time of sort start. 311 | */ 312 | #ifdef TRACE_SORT 313 | PGRUsage ru_start; 314 | #endif 315 | } pgsrt_Tuplesortstate; 316 | 317 | #endif /* PG_SORTSTATS_IMPORT_PG14_H */ 318 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg9_4.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG9_4_H 2 | #define PG_SORTSTATS_IMPORT_PG9_4_H 3 | 4 | #define SizeofMinimalTupleHeader offsetof(MinimalTupleData, t_bits) 5 | /* 6 | * AllocChunk 7 | * The prefix of each piece of memory in an AllocBlock 8 | * 9 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 10 | */ 11 | typedef struct pgsrt_AllocChunkData 12 | { 13 | /* aset is the owning aset if allocated, or the freelist link if free */ 14 | void *aset; 15 | /* size is always the size of the usable space in the chunk */ 16 | Size size; 17 | #ifdef MEMORY_CONTEXT_CHECKING 18 | /* when debugging memory usage, also store actual requested size */ 19 | /* this is zero in a free chunk */ 20 | Size requested_size; 21 | #endif 22 | } pgsrt_AllocChunkData; 23 | 24 | #define PGSRT_ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(pgsrt_AllocChunkData)) 25 | 26 | 27 | typedef struct 28 | { 29 | void *tuple; /* the tuple proper */ 30 | Datum datum1; /* value of first key column */ 31 | bool isnull1; /* is first key column NULL? */ 32 | int tupindex; /* see notes above */ 33 | } SortTuple; 34 | 35 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 36 | Tuplesortstate *state); 37 | 38 | typedef enum 39 | { 40 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 41 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 42 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 43 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 44 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 45 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 46 | } TupSortStatus; 47 | 48 | typedef struct pgsrt_Tuplesortstate 49 | { 50 | TupSortStatus status; /* enumerated value as shown above */ 51 | int nKeys; /* number of columns in sort key */ 52 | bool randomAccess; /* did caller request random access? */ 53 | bool bounded; /* did caller specify a maximum number of 54 | * tuples to return? 
*/ 55 | bool boundUsed; /* true if we made use of a bounded heap */ 56 | int bound; /* if bounded, the maximum number of tuples */ 57 | int64 availMem; /* remaining memory available, in bytes */ 58 | int64 allowedMem; /* total memory allowed, in bytes */ 59 | int maxTapes; /* number of tapes (Knuth's T) */ 60 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 61 | MemoryContext sortcontext; /* memory context holding all sort data */ 62 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 63 | 64 | /* 65 | * These function pointers decouple the routines that must know what kind 66 | * of tuple we are sorting from the routines that don't need to know it. 67 | * They are set up by the tuplesort_begin_xxx routines. 68 | * 69 | * Function to compare two tuples; result is per qsort() convention, ie: 70 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 71 | * qsort_arg_comparator. 72 | */ 73 | SortTupleComparator comparetup; 74 | 75 | /* 76 | * Function to copy a supplied input tuple into palloc'd space and set up 77 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 78 | * state->availMem must be decreased by the amount of space used for the 79 | * tuple copy (note the SortTuple struct itself is not counted). 80 | */ 81 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 82 | 83 | /* 84 | * Function to write a stored tuple onto tape. The representation of the 85 | * tuple on tape need not be the same as it is in memory; requirements on 86 | * the tape representation are given below. After writing the tuple, 87 | * pfree() the out-of-line data (not the SortTuple struct!), and increase 88 | * state->availMem by the amount of memory space thereby released. 89 | */ 90 | void (*writetup) (Tuplesortstate *state, int tapenum, 91 | SortTuple *stup); 92 | 93 | /* 94 | * Function to read a stored tuple from tape back into memory. 'len' is 95 | * the already-read length of the stored tuple. Create a palloc'd copy, 96 | * initialize tuple/datum1/isnull1 in the target SortTuple struct, and 97 | * decrease state->availMem by the amount of memory space consumed. 98 | */ 99 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 100 | int tapenum, unsigned int len); 101 | 102 | /* 103 | * Function to reverse the sort direction from its current state. (We 104 | * could dispense with this if we wanted to enforce that all variants 105 | * represent the sort key information alike.) 106 | */ 107 | void (*reversedirection) (Tuplesortstate *state); 108 | 109 | /* 110 | * This array holds the tuples now in sort memory. If we are in state 111 | * INITIAL, the tuples are in no particular order; if we are in state 112 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 113 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 114 | * H. (Note that memtupcount only counts the tuples that are part of the 115 | * heap --- during merge passes, memtuples[] entries beyond tapeRange are 116 | * never in the heap and are used to hold pre-read tuples.) In state 117 | * SORTEDONTAPE, the array is not used. 118 | */ 119 | SortTuple *memtuples; /* array of SortTuple structs */ 120 | int memtupcount; /* number of tuples currently present */ 121 | int memtupsize; /* allocated length of memtuples array */ 122 | bool growmemtuples; /* memtuples' growth still underway? */ 123 | 124 | /* 125 | * While building initial runs, this is the current output run number 126 | * (starting at 0). Afterwards, it is the number of initial runs we made.
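 * (For example, a sort that spilled to disk and built seven initial runs
 * ends with currentRun == 7 once run building is finished.)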
127 | */ 128 | int currentRun; 129 | } pgsrt_Tuplesortstate; 130 | 131 | #endif /* PG_SORTSTATS_IMPORT_PG9_4_H */ 132 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg9_5.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG9_5_H 2 | #define PG_SORTSTATS_IMPORT_PG9_5_H 3 | 4 | 5 | /* 6 | * AllocChunk 7 | * The prefix of each piece of memory in an AllocBlock 8 | * 9 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 10 | */ 11 | typedef struct pgsrt_AllocChunkData 12 | { 13 | /* aset is the owning aset if allocated, or the freelist link if free */ 14 | void *aset; 15 | /* size is always the size of the usable space in the chunk */ 16 | Size size; 17 | #ifdef MEMORY_CONTEXT_CHECKING 18 | /* when debugging memory usage, also store actual requested size */ 19 | /* this is zero in a free chunk */ 20 | Size requested_size; 21 | #endif 22 | } pgsrt_AllocChunkData; 23 | 24 | #define PGSRT_ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(pgsrt_AllocChunkData)) 25 | 26 | 27 | typedef struct 28 | { 29 | void *tuple; /* the tuple proper */ 30 | Datum datum1; /* value of first key column */ 31 | bool isnull1; /* is first key column NULL? */ 32 | int tupindex; /* see notes above */ 33 | } SortTuple; 34 | 35 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 36 | Tuplesortstate *state); 37 | 38 | typedef enum 39 | { 40 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 41 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 42 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 43 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 44 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 45 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 46 | } TupSortStatus; 47 | 48 | typedef struct pgsrt_Tuplesortstate 49 | { 50 | TupSortStatus status; /* enumerated value as shown above */ 51 | int nKeys; /* number of columns in sort key */ 52 | bool randomAccess; /* did caller request random access? */ 53 | bool bounded; /* did caller specify a maximum number of 54 | * tuples to return? */ 55 | bool boundUsed; /* true if we made use of a bounded heap */ 56 | int bound; /* if bounded, the maximum number of tuples */ 57 | int64 availMem; /* remaining memory available, in bytes */ 58 | int64 allowedMem; /* total memory allowed, in bytes */ 59 | int maxTapes; /* number of tapes (Knuth's T) */ 60 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 61 | MemoryContext sortcontext; /* memory context holding all sort data */ 62 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 63 | 64 | /* 65 | * These function pointers decouple the routines that must know what kind 66 | * of tuple we are sorting from the routines that don't need to know it. 67 | * They are set up by the tuplesort_begin_xxx routines. 68 | * 69 | * Function to compare two tuples; result is per qsort() convention, ie: 70 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 71 | * qsort_arg_comparator. 72 | */ 73 | SortTupleComparator comparetup; 74 | 75 | /* 76 | * Function to copy a supplied input tuple into palloc'd space and set up 77 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 78 | * state->availMem must be decreased by the amount of space used for the 79 | * tuple copy (note the SortTuple struct itself is not counted).
80 | */ 81 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 82 | 83 | /* 84 | * Function to write a stored tuple onto tape. The representation of the 85 | * tuple on tape need not be the same as it is in memory; requirements on 86 | * the tape representation are given below. After writing the tuple, 87 | * pfree() the out-of-line data (not the SortTuple struct!), and increase 88 | * state->availMem by the amount of memory space thereby released. 89 | */ 90 | void (*writetup) (Tuplesortstate *state, int tapenum, 91 | SortTuple *stup); 92 | 93 | /* 94 | * Function to read a stored tuple from tape back into memory. 'len' is 95 | * the already-read length of the stored tuple. Create a palloc'd copy, 96 | * initialize tuple/datum1/isnull1 in the target SortTuple struct, and 97 | * decrease state->availMem by the amount of memory space consumed. 98 | */ 99 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 100 | int tapenum, unsigned int len); 101 | 102 | /* 103 | * This array holds the tuples now in sort memory. If we are in state 104 | * INITIAL, the tuples are in no particular order; if we are in state 105 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 106 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 107 | * H. (Note that memtupcount only counts the tuples that are part of the 108 | * heap --- during merge passes, memtuples[] entries beyond tapeRange are 109 | * never in the heap and are used to hold pre-read tuples.) In state 110 | * SORTEDONTAPE, the array is not used. 111 | */ 112 | SortTuple *memtuples; /* array of SortTuple structs */ 113 | int memtupcount; /* number of tuples currently present */ 114 | int memtupsize; /* allocated length of memtuples array */ 115 | bool growmemtuples; /* memtuples' growth still underway? */ 116 | 117 | /* 118 | * While building initial runs, this is the current output run number 119 | * (starting at 0). Afterwards, it is the number of initial runs we made. 120 | */ 121 | int currentRun; 122 | } pgsrt_Tuplesortstate; 123 | 124 | #endif /* PG_SORTSTATS_IMPORT_PG9_5_H */ 125 | -------------------------------------------------------------------------------- /include/pg_sortstats_import_pg9_6.h: -------------------------------------------------------------------------------- 1 | #ifndef PG_SORTSTATS_IMPORT_PG9_6_H 2 | #define PG_SORTSTATS_IMPORT_PG9_6_H 3 | 4 | /* 5 | * AllocChunk 6 | * The prefix of each piece of memory in an AllocBlock 7 | * 8 | * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h. 9 | */ 10 | typedef struct pgsrt_AllocChunkData 11 | { 12 | /* aset is the owning aset if allocated, or the freelist link if free */ 13 | void *aset; 14 | /* size is always the size of the usable space in the chunk */ 15 | Size size; 16 | #ifdef MEMORY_CONTEXT_CHECKING 17 | /* when debugging memory usage, also store actual requested size */ 18 | /* this is zero in a free chunk */ 19 | Size requested_size; 20 | #endif 21 | } pgsrt_AllocChunkData; 22 | 23 | #define PGSRT_ALLOC_CHUNKHDRSZ MAXALIGN(sizeof(pgsrt_AllocChunkData)) 24 | 25 | 26 | typedef struct 27 | { 28 | void *tuple; /* the tuple itself */ 29 | Datum datum1; /* value of first key column */ 30 | bool isnull1; /* is first key column NULL? 
*/ 31 | int tupindex; /* see notes above */ 32 | } SortTuple; 33 | 34 | typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, 35 | Tuplesortstate *state); 36 | 37 | typedef enum 38 | { 39 | TSS_INITIAL, /* Loading tuples; still within memory limit */ 40 | TSS_BOUNDED, /* Loading tuples into bounded-size heap */ 41 | TSS_BUILDRUNS, /* Loading tuples; writing to tape */ 42 | TSS_SORTEDINMEM, /* Sort completed entirely in memory */ 43 | TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ 44 | TSS_FINALMERGE /* Performing final merge on-the-fly */ 45 | } TupSortStatus; 46 | 47 | typedef struct pgsrt_Tuplesortstate 48 | { 49 | TupSortStatus status; /* enumerated value as shown above */ 50 | int nKeys; /* number of columns in sort key */ 51 | bool randomAccess; /* did caller request random access? */ 52 | bool bounded; /* did caller specify a maximum number of 53 | * tuples to return? */ 54 | bool boundUsed; /* true if we made use of a bounded heap */ 55 | int bound; /* if bounded, the maximum number of tuples */ 56 | bool tuples; /* Can SortTuple.tuple ever be set? */ 57 | int64 availMem; /* remaining memory available, in bytes */ 58 | int64 allowedMem; /* total memory allowed, in bytes */ 59 | int maxTapes; /* number of tapes (Knuth's T) */ 60 | int tapeRange; /* maxTapes-1 (Knuth's P) */ 61 | MemoryContext sortcontext; /* memory context holding most sort data */ 62 | MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ 63 | LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ 64 | 65 | /* 66 | * These function pointers decouple the routines that must know what kind 67 | * of tuple we are sorting from the routines that don't need to know it. 68 | * They are set up by the tuplesort_begin_xxx routines. 69 | * 70 | * Function to compare two tuples; result is per qsort() convention, ie: 71 | * <0, 0, >0 according as a<b, a=b, a>b. The API must match 72 | * qsort_arg_comparator. 73 | */ 74 | SortTupleComparator comparetup; 75 | 76 | /* 77 | * Function to copy a supplied input tuple into palloc'd space and set up 78 | * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, 79 | * state->availMem must be decreased by the amount of space used for the 80 | * tuple copy (note the SortTuple struct itself is not counted). 81 | */ 82 | void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); 83 | 84 | /* 85 | * Function to write a stored tuple onto tape. The representation of the 86 | * tuple on tape need not be the same as it is in memory; requirements on 87 | * the tape representation are given below. After writing the tuple, 88 | * pfree() the out-of-line data (not the SortTuple struct!), and increase 89 | * state->availMem by the amount of memory space thereby released. 90 | */ 91 | void (*writetup) (Tuplesortstate *state, int tapenum, 92 | SortTuple *stup); 93 | 94 | /* 95 | * Function to read a stored tuple from tape back into memory. 'len' is 96 | * the already-read length of the stored tuple. Create a palloc'd copy, 97 | * initialize tuple/datum1/isnull1 in the target SortTuple struct, and 98 | * decrease state->availMem by the amount of memory space consumed. (See 99 | * batchUsed notes for details on how memory is handled when incremental 100 | * accounting is abandoned.) 101 | */ 102 | void (*readtup) (Tuplesortstate *state, SortTuple *stup, 103 | int tapenum, unsigned int len); 104 | 105 | /* 106 | * Function to move a caller tuple.
This is usually implemented as a 107 | * memmove() shim, but function may also perform additional fix-up of 108 | * caller tuple where needed. Batch memory support requires the movement 109 | * of caller tuples from one location in memory to another. 110 | */ 111 | void (*movetup) (void *dest, void *src, unsigned int len); 112 | 113 | /* 114 | * This array holds the tuples now in sort memory. If we are in state 115 | * INITIAL, the tuples are in no particular order; if we are in state 116 | * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS 117 | * and FINALMERGE, the tuples are organized in "heap" order per Algorithm 118 | * H. (Note that memtupcount only counts the tuples that are part of the 119 | * heap --- during merge passes, memtuples[] entries beyond tapeRange are 120 | * never in the heap and are used to hold pre-read tuples.) In state 121 | * SORTEDONTAPE, the array is not used. 122 | */ 123 | SortTuple *memtuples; /* array of SortTuple structs */ 124 | int memtupcount; /* number of tuples currently present */ 125 | int memtupsize; /* allocated length of memtuples array */ 126 | bool growmemtuples; /* memtuples' growth still underway? */ 127 | 128 | /* 129 | * Memory for tuples is sometimes allocated in batch, rather than 130 | * incrementally. This implies that incremental memory accounting has 131 | * been abandoned. Currently, this only happens for the final on-the-fly 132 | * merge step. Large batch allocations can store tuples (e.g. 133 | * IndexTuples) without palloc() fragmentation and other overhead. 134 | */ 135 | bool batchUsed; 136 | 137 | /* 138 | * While building initial runs, this indicates if the replacement 139 | * selection strategy is in use. When it isn't, then a simple hybrid 140 | * sort-merge strategy is in use instead (runs are quicksorted). 141 | */ 142 | bool replaceActive; 143 | 144 | /* 145 | * While building initial runs, this is the current output run number 146 | * (starting at RUN_FIRST). Afterwards, it is the number of initial runs 147 | * we made. 148 | */ 149 | int currentRun; 150 | } pgsrt_Tuplesortstate; 151 | 152 | #endif /* PG_SORTSTATS_IMPORT_PG9_6_H */ 153 | -------------------------------------------------------------------------------- /pg_sortstats--0.0.1.sql: -------------------------------------------------------------------------------- 1 | -- This program is open source, licensed under the PostgreSQL License. 2 | -- For license terms, see the LICENSE file. 3 | -- 4 | -- Copyright (C) 2018-2023: The PoWA-team 5 | 6 | -- complain if script is sourced in psql, rather than via CREATE EXTENSION 7 | \echo Use "CREATE EXTENSION pg_sortstats" to load this file. 
\quit 8 | 9 | SET client_encoding = 'UTF8'; 10 | 11 | CREATE FUNCTION pg_sortstats(IN showtext boolean, 12 | OUT queryid bigint, 13 | OUT userid oid, 14 | OUT dbid oid, 15 | OUT nb_keys integer, 16 | OUT sort_keys text, 17 | OUT lines bigint, 18 | OUT lines_to_sort bigint, 19 | OUT work_mems bigint, 20 | OUT topn_sorts bigint, 21 | OUT quicksorts bigint, 22 | OUT external_sorts bigint, 23 | OUT external_merges bigint, 24 | OUT nb_tapes bigint, 25 | OUT space_disk bigint, 26 | OUT space_memory bigint, 27 | OUT non_parallels bigint, 28 | OUT nb_workers bigint 29 | ) 30 | RETURNS SETOF record 31 | AS '$libdir/pg_sortstats', 'pg_sortstats' 32 | LANGUAGE C STRICT VOLATILE COST 1000; 33 | 34 | CREATE VIEW pg_sortstats AS SELECT * FROM pg_sortstats(true); 35 | 36 | CREATE FUNCTION pg_sortstats_reset() 37 | RETURNS void 38 | LANGUAGE c COST 1000 39 | AS '$libdir/pg_sortstats', 'pg_sortstats_reset'; 40 | -------------------------------------------------------------------------------- /pg_sortstats.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pg_sortstats.c 4 | * Track statistics about sorts performed, and also estimate how much 5 | * work_mem would have been needed to sort data in memory. 6 | * 7 | * This module is heavily inspired by the great pg_stat_statements official 8 | * contrib. The same locking rules are used, which for reference are: 9 | * 10 | * Note about locking issues: to create or delete an entry in the shared 11 | * hashtable, one must hold pgsrt->lock exclusively. Modifying any field 12 | * in an entry except the counters requires the same. To look up an entry, 13 | * one must hold the lock shared. To read or update the counters within 14 | * an entry, one must hold the lock shared or exclusive (so the entry doesn't 15 | * disappear!) and also take the entry's mutex spinlock. 16 | * The shared state variable pgsrt->extent (the next free spot in the external 17 | * keys-text file) should be accessed only while holding either the 18 | * pgsrt->mutex spinlock, or exclusive lock on pgsrt->lock. We use the mutex 19 | * to allow reserving file space while holding only shared lock on pgsrt->lock. 20 | * Rewriting the entire external keys-text file, eg for garbage collection, 21 | * requires holding pgsrt->lock exclusively; this allows individual entries 22 | * in the file to be read or written while holding only shared lock.
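 *
 * As a minimal sketch of these rules (hypothetical code, using the pgsrt
 * names defined later in this file), reading one entry's counters takes
 * the shared LWLock plus that entry's spinlock:
 *
 *		LWLockAcquire(pgsrt->lock, LW_SHARED);
 *		entry = (pgsrtEntry *) hash_search(pgsrt_hash, &key, HASH_FIND, NULL);
 *		if (entry)
 *		{
 *			SpinLockAcquire(&entry->mutex);
 *			counters = entry->counters;
 *			SpinLockRelease(&entry->mutex);
 *		}
 *		LWLockRelease(pgsrt->lock);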
23 | * 24 | * Copyright (c) 2018-2023, The PoWA-team 25 | * 26 | *------------------------------------------------------------------------- 27 | */ 28 | #include "postgres.h" 29 | 30 | #include 31 | #include 32 | 33 | #include "fmgr.h" 34 | #include "funcapi.h" 35 | #include "miscadmin.h" 36 | #include "pgstat.h" 37 | #include "access/hash.h" 38 | #include "access/htup_details.h" 39 | #if PG_VERSION_NUM >= 90600 40 | #include "access/parallel.h" 41 | #endif 42 | #include "mb/pg_wchar.h" 43 | #include "nodes/nodeFuncs.h" 44 | #include "parser/parsetree.h" 45 | #if PG_VERSION_NUM >= 90600 46 | #include "postmaster/autovacuum.h" 47 | #endif 48 | #if PG_VERSION_NUM >= 120000 49 | #include "replication/walsender.h" 50 | #endif 51 | #if PG_VERSION_NUM < 100000 52 | #include "storage/fd.h" 53 | #endif 54 | #include "storage/ipc.h" 55 | #include "storage/lwlock.h" 56 | #include "storage/shmem.h" 57 | #if PG_VERSION_NUM < 110000 58 | #include "storage/spin.h" 59 | #endif 60 | #include "utils/builtins.h" 61 | #include "utils/guc.h" 62 | #if PG_VERSION_NUM < 110000 63 | #include "utils/memutils.h" 64 | #endif 65 | #if PG_VERSION_NUM >= 90500 66 | #include "utils/ruleutils.h" 67 | #endif 68 | #include "utils/tuplesort.h" 69 | 70 | #include "include/pg_sortstats_import.h" 71 | 72 | PG_MODULE_MAGIC; 73 | 74 | /*--- Macros and structs ---*/ 75 | 76 | /* Location of permanent stats file (valid when database is shut down) */ 77 | #define PGSRT_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_sortstats.stat" 78 | 79 | /* 80 | * Location of external keys text file. We don't keep it in the core 81 | * system's stats_temp_directory. The core system can safely use that GUC 82 | * setting, because the statistics collector temp file paths are set only once 83 | * as part of changing the GUC, but pg_sortstats has no way of avoiding 84 | * race conditions. Besides, we only expect modest, infrequent I/O for keys 85 | * strings, so placing the file on a faster filesystem is not compelling. 86 | */ 87 | #define PGSRT_TEXT_FILE PG_STAT_TMP_DIR "/pgsrt_sortkey_texts.stat" 88 | 89 | /* Magic number identifying the stats file format */ 90 | static const uint32 PGSRT_FILE_HEADER = 0x20180804; 91 | 92 | /* PostgreSQL major version number, changes in which invalidate all entries */ 93 | static const uint32 PGSRT_PG_MAJOR_VERSION = PG_VERSION_NUM / 100; 94 | 95 | #define PGSRT_COLUMNS 17 /* number of columns in pg_sortstats SRF */ 96 | #define USAGE_DECREASE_FACTOR (0.99) /* decreased every pgsrt_entry_dealloc */ 97 | #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */ 98 | #define USAGE_INIT (1.0) 99 | #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */ 100 | #define ASSUMED_LENGTH_INIT 128 /* initial assumed mean keys length */ 101 | 102 | #define record_gc_ktexts() \ 103 | do { \ 104 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; \ 105 | SpinLockAcquire(&s->mutex); \ 106 | s->gc_count++; \ 107 | SpinLockRelease(&s->mutex); \ 108 | } while(0) 109 | 110 | 111 | 112 | /* In PostgreSQL 11, queryid becomes a uint64 internally. 
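 * The typedef below tracks that width, so the hashtable key and the on-disk
 * stats file always match the major version the module is compiled against.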
113 | */ 114 | #if PG_VERSION_NUM >= 110000 115 | typedef uint64 pgsrt_queryid; 116 | #else 117 | typedef uint32 pgsrt_queryid; 118 | #endif 119 | 120 | typedef struct pgsrtSharedState 121 | { 122 | LWLockId lock; /* protects hashtable search/modification */ 123 | double cur_median_usage; /* current median usage in hashtable */ 124 | Size mean_keys_len; /* current mean keys text length */ 125 | slock_t mutex; /* protects following fields only: */ 126 | Size extent; /* current extent of keys file */ 127 | int n_writers; /* number of active writers to keys file */ 128 | int gc_count; /* keys file garbage collection cycle count */ 129 | #if PG_VERSION_NUM >= 90600 130 | LWLockId queryids_lock; /* protects following array */ 131 | pgsrt_queryid queryids[FLEXIBLE_ARRAY_MEMBER]; /* queryid of non-worker processes */ 132 | #endif 133 | } pgsrtSharedState; 134 | 135 | typedef struct pgsrtHashKey 136 | { 137 | Oid userid; /* user OID */ 138 | Oid dbid; /* database OID */ 139 | pgsrt_queryid queryid; /* query identifier */ 140 | uint32 sortid; /* sort identifier within a query */ 141 | } pgsrtHashKey; 142 | 143 | typedef struct pgsrtCounters 144 | { 145 | double usage; /* usage factor */ 146 | int64 lines; /* total number of lines in input */ 147 | int64 lines_to_sort; /* total number of lines sorted */ 148 | int64 work_mems; /* total size of estimated work_mem */ 149 | int64 topn_sorts; /* number of top-N heapsorts */ 150 | int64 quicksorts; /* number of quicksorts */ 151 | int64 external_sorts; /* number of external sorts */ 152 | int64 external_merges; /* number of external merges */ 153 | int64 nbtapes; /* total number of tapes used */ 154 | int64 space_disk; /* total disk space consumed */ 155 | int64 space_memory; /* total memory space consumed */ 156 | int64 non_parallels; /* number of non-parallel sorts */ 157 | int64 nb_workers; /* total number of parallel workers (including gather node) */ 158 | } pgsrtCounters; 159 | 160 | typedef struct pgsrtEntry 161 | { 162 | pgsrtHashKey key; 163 | pgsrtCounters counters; /* statistics for this sort */ 164 | int nbkeys; /* # of columns in the sort */ 165 | Size keys_offset; /* deparsed keys text offset in external file */ 166 | int keys_len; /* # of valid bytes in deparsed keys string, or -1 */ 167 | int encoding; /* deparsed keys text encoding */ 168 | slock_t mutex; /* protects the counters only */ 169 | } pgsrtEntry; 170 | 171 | typedef struct pgsrtWalkerContext 172 | { 173 | QueryDesc *queryDesc; 174 | List *ancestors; 175 | List *rtable; 176 | List *rtable_names; 177 | List *deparse_cxt; 178 | } pgsrtWalkerContext; 179 | 180 | /*--- Function declarations ---*/ 181 | 182 | void _PG_init(void); 183 | 184 | 185 | extern PGDLLEXPORT Datum pg_sortstats(PG_FUNCTION_ARGS); 186 | extern PGDLLEXPORT Datum pg_sortstats_reset(PG_FUNCTION_ARGS); 187 | 188 | PG_FUNCTION_INFO_V1(pg_sortstats); 189 | PG_FUNCTION_INFO_V1(pg_sortstats_reset); 190 | 191 | #if PG_VERSION_NUM >= 150000 192 | static void pgsrt_shmem_request(void); 193 | #endif 194 | static void pgsrt_shmem_startup(void); 195 | static void pgsrt_shmem_shutdown(int code, Datum arg); 196 | static void pgsrt_ExecutorStart(QueryDesc *queryDesc, int eflags); 197 | static void pgsrt_ExecutorRun(QueryDesc *queryDesc, 198 | ScanDirection direction, 199 | #if PG_VERSION_NUM >= 90600 200 | uint64 count 201 | #else 202 | long count 203 | #endif 204 | #if PG_VERSION_NUM >= 100000 205 | ,bool execute_once 206 | #endif 207 | ); 208 | static void pgsrt_ExecutorFinish(QueryDesc *queryDesc); 209 | static void
pgsrt_ExecutorEnd(QueryDesc *queryDesc); 210 | 211 | #if PG_VERSION_NUM >= 150000 212 | static shmem_request_hook_type prev_shmem_request_hook = NULL; 213 | #endif 214 | static shmem_startup_hook_type prev_shmem_startup_hook = NULL; 215 | static ExecutorStart_hook_type prev_ExecutorStart = NULL; 216 | static ExecutorRun_hook_type prev_ExecutorRun = NULL; 217 | static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; 218 | static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; 219 | 220 | static Size pgsrt_memsize(void); 221 | #if PG_VERSION_NUM >= 90600 222 | static Size pgsrt_queryids_size(void); 223 | static pgsrt_queryid pgsrt_get_queryid(void); 224 | static void pgsrt_set_queryid(pgsrt_queryid); 225 | #endif 226 | 227 | static pgsrtEntry *pgsrt_entry_alloc(pgsrtHashKey *key, Size keys_offset, 228 | int keys_len, int encoding, int nbkeys); 229 | static void pgsrt_entry_dealloc(void); 230 | static void pgsrt_entry_reset(void); 231 | static void pgsrt_store(pgsrt_queryid queryId, int nbkeys, char *keys, 232 | pgsrtCounters *counters); 233 | static uint32 pgsrt_hash_fn(const void *key, Size keysize); 234 | static int pgsrt_match_fn(const void *key1, const void *key2, Size keysize); 235 | 236 | static bool ktext_store(const char *keys, int keys_len, Size *keys_offset, 237 | int *gc_count); 238 | static char *ktext_load_file(Size *buffer_size); 239 | static char *ktext_fetch(Size keys_offset, int keys_len, char *buffer, 240 | Size buffer_size); 241 | static bool need_gc_ktexts(void); 242 | static void gc_ktexts(void); 243 | 244 | static void pgsrt_process_sortstate(SortState *srtstate, pgsrtWalkerContext *context); 245 | static bool pgsrt_planstate_walker(PlanState *ps, pgsrtWalkerContext *context); 246 | static char * pgsrt_get_sort_group_keys(SortState *srtstate, 247 | int nkeys, AttrNumber *keycols, 248 | Oid *sortOperators, Oid *collations, bool *nullsFirst, 249 | pgsrtWalkerContext *context); 250 | static void pgsrt_setup_walker_context(pgsrtWalkerContext *context); 251 | 252 | static unsigned long round_up_pow2(int64 val); 253 | static int get_alignment_overhead(TupleDesc tupdesc); 254 | 255 | /*--- Local variables ---*/ 256 | static int nesting_level = 0; 257 | 258 | static bool pgsrt_enabled; 259 | static int pgsrt_max; /* max #of sorts to track */ 260 | static bool pgsrt_save; /* whether to save stats across shutdown */ 261 | 262 | static HTAB *pgsrt_hash = NULL; 263 | static pgsrtSharedState *pgsrt = NULL; 264 | 265 | 266 | void 267 | _PG_init(void) 268 | { 269 | if (!process_shared_preload_libraries_in_progress) 270 | { 271 | elog(ERROR, "This module can only be loaded via shared_preload_libraries"); 272 | return; 273 | } 274 | 275 | DefineCustomBoolVariable("pg_sortstats.enabled", 276 | "Enable / Disable pg_sortstats", 277 | NULL, 278 | &pgsrt_enabled, 279 | true, 280 | PGC_USERSET, 281 | 0, 282 | NULL, 283 | NULL, 284 | NULL); 285 | 286 | DefineCustomIntVariable("pg_sortstats.max", 287 | "Sets the maximum number of statements tracked by pg_sortstats.", 288 | NULL, 289 | &pgsrt_max, 290 | 10000, 291 | 100, 292 | INT_MAX, 293 | PGC_POSTMASTER, 294 | 0, 295 | NULL, 296 | NULL, 297 | NULL); 298 | 299 | DefineCustomBoolVariable("pg_sortstats.save", 300 | "Save pg_sortstats statistics across server shutdowns.", 301 | NULL, 302 | &pgsrt_save, 303 | true, 304 | PGC_SIGHUP, 305 | 0, 306 | NULL, 307 | NULL, 308 | NULL); 309 | 310 | EmitWarningsOnPlaceholders("pg_sortstats"); 311 | 312 | #if PG_VERSION_NUM < 150000 313 | /* 314 | * Request additional shared resources. 
(These are no-ops if we're not in 315 | * the postmaster process.) We'll allocate or attach to the shared 316 | * resources in pgsrt_shmem_startup(). 317 | * If you change code here, don't forget to also apply the modifications 318 | * in pgsrt_shmem_request() for pg15 and later. 319 | */ 320 | RequestAddinShmemSpace(pgsrt_memsize()); 321 | #if PG_VERSION_NUM >= 90600 322 | RequestNamedLWLockTranche("pg_sortstats", 2); 323 | #else 324 | RequestAddinLWLocks(1); 325 | #endif /* pg 9.6+ */ 326 | #endif /* pg 15- */ 327 | 328 | /* install hooks */ 329 | prev_ExecutorStart = ExecutorStart_hook; 330 | ExecutorStart_hook = pgsrt_ExecutorStart; 331 | prev_ExecutorRun = ExecutorRun_hook; 332 | ExecutorRun_hook = pgsrt_ExecutorRun; 333 | prev_ExecutorFinish = ExecutorFinish_hook; 334 | ExecutorFinish_hook = pgsrt_ExecutorFinish; 335 | prev_ExecutorEnd = ExecutorEnd_hook; 336 | ExecutorEnd_hook = pgsrt_ExecutorEnd; 337 | #if PG_VERSION_NUM >= 150000 338 | prev_shmem_request_hook = shmem_request_hook; 339 | shmem_request_hook = pgsrt_shmem_request; 340 | #endif 341 | prev_shmem_startup_hook = shmem_startup_hook; 342 | shmem_startup_hook = pgsrt_shmem_startup; 343 | } 344 | 345 | #if PG_VERSION_NUM >= 150000 346 | /* 347 | * Request additional shared memory resources. 348 | * 349 | * If you change code here, don't forget to also apply the modifications in 350 | * _PG_init() for pg14 and below. 351 | */ 352 | static void 353 | pgsrt_shmem_request(void) 354 | { 355 | if (prev_shmem_request_hook) 356 | prev_shmem_request_hook(); 357 | 358 | RequestAddinShmemSpace(pgsrt_memsize()); 359 | RequestNamedLWLockTranche("pg_sortstats", 2); 360 | } 361 | #endif 362 | 363 | static void 364 | pgsrt_shmem_startup(void) 365 | { 366 | bool found; 367 | HASHCTL info; 368 | FILE *file = NULL; 369 | FILE *kfile = NULL; 370 | uint32 header; 371 | int32 num; 372 | int32 pgver; 373 | int32 i; 374 | int buffer_size; 375 | char *buffer = NULL; 376 | Size tottextlen; 377 | int nvalidtexts; 378 | 379 | if (prev_shmem_startup_hook) 380 | prev_shmem_startup_hook(); 381 | 382 | /* reset in case this is a restart within the postmaster */ 383 | pgsrt = NULL; 384 | 385 | /* Create or attach to the shared memory state */ 386 | LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); 387 | 388 | /* global access lock */ 389 | pgsrt = ShmemInitStruct("pg_sortstats", 390 | (sizeof(pgsrtSharedState) 391 | #if PG_VERSION_NUM >= 90600 392 | + pgsrt_queryids_size() 393 | #endif 394 | ), 395 | &found); 396 | 397 | if (!found) 398 | { 399 | /* First time through ...
*/ 400 | #if PG_VERSION_NUM >= 90600 401 | LWLockPadded *locks = GetNamedLWLockTranche("pg_sortstats"); 402 | pgsrt->lock = &(locks[0]).lock; 403 | pgsrt->queryids_lock = &(locks[1]).lock; 404 | memset(pgsrt->queryids, 0, pgsrt_queryids_size()); 405 | #else 406 | pgsrt->lock = LWLockAssign(); 407 | #endif 408 | pgsrt->cur_median_usage = ASSUMED_MEDIAN_INIT; 409 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 410 | SpinLockInit(&pgsrt->mutex); 411 | pgsrt->extent = 0; 412 | pgsrt->n_writers = 0; 413 | pgsrt->gc_count = 0; 414 | } 415 | 416 | memset(&info, 0, sizeof(info)); 417 | info.keysize = sizeof(pgsrtHashKey); 418 | info.entrysize = sizeof(pgsrtEntry); 419 | info.hash = pgsrt_hash_fn; 420 | info.match = pgsrt_match_fn; 421 | 422 | /* allocate stats shared memory hash */ 423 | pgsrt_hash = ShmemInitHash("pg_sortstats hash", 424 | pgsrt_max, pgsrt_max, 425 | &info, 426 | HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); 427 | 428 | LWLockRelease(AddinShmemInitLock); 429 | 430 | if (!IsUnderPostmaster) 431 | on_shmem_exit(pgsrt_shmem_shutdown, (Datum) 0); 432 | 433 | /* 434 | * Done if some other process already completed our initialization. 435 | */ 436 | if (found) 437 | return; 438 | 439 | /* 440 | * Note: we don't bother with locks here, because there should be no other 441 | * processes running when this code is reached. 442 | */ 443 | 444 | /* Unlink keys text file possibly left over from crash */ 445 | unlink(PGSRT_TEXT_FILE); 446 | 447 | /* Allocate new keys text temp file */ 448 | kfile = AllocateFile(PGSRT_TEXT_FILE, PG_BINARY_W); 449 | if (kfile == NULL) 450 | goto write_error; 451 | 452 | /* 453 | * If we were told not to load old statistics, we're done. (Note we do 454 | * not try to unlink any old dump file in this case. This seems a bit 455 | * questionable but it's the historical behavior.) 456 | */ 457 | if (!pgsrt_save) 458 | { 459 | FreeFile(kfile); 460 | return; 461 | } 462 | 463 | /* 464 | * Attempt to load old statistics from the dump file. 
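 * The layout, as written by pgsrt_shmem_shutdown(), is: the file header
 * magic, the PostgreSQL major version, the number of entries, then each
 * pgsrtEntry immediately followed by its NUL-terminated sort-keys text.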
465 | */ 466 | file = AllocateFile(PGSRT_DUMP_FILE, PG_BINARY_R); 467 | if (file == NULL) 468 | { 469 | if (errno != ENOENT) 470 | goto read_error; 471 | /* No existing persisted stats file, so we're done */ 472 | FreeFile(kfile); 473 | return; 474 | } 475 | 476 | buffer_size = 2048; 477 | buffer = (char *) palloc(buffer_size); 478 | 479 | if (fread(&header, sizeof(uint32), 1, file) != 1 || 480 | fread(&pgver, sizeof(uint32), 1, file) != 1 || 481 | fread(&num, sizeof(int32), 1, file) != 1) 482 | goto read_error; 483 | 484 | if (header != PGSRT_FILE_HEADER || 485 | pgver != PGSRT_PG_MAJOR_VERSION) 486 | goto data_error; 487 | 488 | tottextlen = 0; 489 | nvalidtexts = 0; 490 | 491 | for (i = 0; i < num; i++) 492 | { 493 | pgsrtEntry temp; 494 | pgsrtEntry *entry; 495 | Size keys_offset; 496 | 497 | if (fread(&temp, sizeof(pgsrtEntry), 1, file) != 1) 498 | goto read_error; 499 | 500 | /* Encoding is the only field we can easily sanity-check */ 501 | if (!PG_VALID_BE_ENCODING(temp.encoding)) 502 | goto data_error; 503 | 504 | /* Resize buffer as needed */ 505 | if (temp.keys_len >= buffer_size) 506 | { 507 | buffer_size = Max(buffer_size * 2, temp.keys_len + 1); 508 | buffer = repalloc(buffer, buffer_size); 509 | } 510 | 511 | if (fread(buffer, 1, temp.keys_len + 1, file) != temp.keys_len + 1) 512 | goto read_error; 513 | 514 | /* Should have a trailing null, but let's make sure */ 515 | buffer[temp.keys_len] = '\0'; 516 | 517 | /* Store the keys text */ 518 | keys_offset = pgsrt->extent; 519 | if (fwrite(buffer, 1, temp.keys_len + 1, kfile) != temp.keys_len + 1) 520 | goto write_error; 521 | pgsrt->extent += temp.keys_len + 1; 522 | 523 | /* make the hashtable entry (discards old entries if too many) */ 524 | entry = pgsrt_entry_alloc(&temp.key, keys_offset, temp.keys_len, 525 | temp.encoding, temp.nbkeys); 526 | 527 | /* In the mean length computation, ignore dropped texts. */ 528 | if (entry->keys_len >= 0) 529 | { 530 | tottextlen += entry->keys_len + 1; 531 | nvalidtexts++; 532 | } 533 | 534 | /* copy in the actual stats */ 535 | entry->counters = temp.counters; 536 | } 537 | 538 | if (nvalidtexts > 0) 539 | pgsrt->mean_keys_len = tottextlen / nvalidtexts; 540 | else 541 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 542 | 543 | pfree(buffer); 544 | FreeFile(file); 545 | FreeFile(kfile); 546 | 547 | /* 548 | * Remove the persisted stats file so it's not included in 549 | * backups/replication slaves, etc. A new file will be written on next 550 | * shutdown. 551 | * 552 | * Note: it's okay if the PGSRT_TEXT_FILE is included in a basebackup, 553 | * because we remove that file on startup; it acts inversely to 554 | * PGSRT_DUMP_FILE, in that it is only supposed to be around when the 555 | * server is running, whereas PGSRT_DUMP_FILE is only supposed to be around 556 | * when the server is not running. Leaving the file creates no danger of 557 | * a newly restored database having a spurious record of execution costs, 558 | * which is what we're really concerned about here. 
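 * (The keys-text file only ever contains the deparsed sort-keys strings,
 * never any counters.)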
559 | */ 560 | unlink(PGSRT_DUMP_FILE); 561 | 562 | return; 563 | 564 | read_error: 565 | ereport(LOG, 566 | (errcode_for_file_access(), 567 | errmsg("could not read file \"%s\": %m", 568 | PGSRT_DUMP_FILE))); 569 | goto fail; 570 | data_error: 571 | ereport(LOG, 572 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 573 | errmsg("ignoring invalid data in file \"%s\"", 574 | PGSRT_DUMP_FILE))); 575 | goto fail; 576 | write_error: 577 | ereport(LOG, 578 | (errcode_for_file_access(), 579 | errmsg("could not write file \"%s\": %m", 580 | PGSRT_TEXT_FILE))); 581 | fail: 582 | if (buffer) 583 | pfree(buffer); 584 | if (file) 585 | FreeFile(file); 586 | if (kfile) 587 | FreeFile(kfile); 588 | /* If possible, throw away the bogus file; ignore any error */ 589 | unlink(PGSRT_DUMP_FILE); 590 | 591 | /* 592 | * Don't unlink PGSRT_TEXT_FILE here; it should always be around while the 593 | * server is running with pg_sortstats enabled 594 | */ 595 | } 596 | 597 | /* Save the statistics into a file at shutdown */ 598 | static void 599 | pgsrt_shmem_shutdown(int code, Datum arg) 600 | { 601 | FILE *file; 602 | char *kbuffer = NULL; 603 | Size kbuffer_size = 0; 604 | HASH_SEQ_STATUS hash_seq; 605 | int32 num_entries; 606 | pgsrtEntry *entry; 607 | 608 | /* Don't try to dump during a crash. */ 609 | if (code) 610 | return; 611 | 612 | /* Safety check ... shouldn't get here unless shmem is set up. */ 613 | if (!pgsrt || !pgsrt_hash) 614 | return; 615 | 616 | /* Don't dump if told not to. */ 617 | if (!pgsrt_save) 618 | return; 619 | 620 | file = AllocateFile(PGSRT_DUMP_FILE ".tmp", PG_BINARY_W); 621 | if (file == NULL) 622 | goto error; 623 | 624 | if (fwrite(&PGSRT_FILE_HEADER, sizeof(uint32), 1, file) != 1) 625 | goto error; 626 | if (fwrite(&PGSRT_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1) 627 | goto error; 628 | num_entries = hash_get_num_entries(pgsrt_hash); 629 | if (fwrite(&num_entries, sizeof(int32), 1, file) != 1) 630 | goto error; 631 | 632 | kbuffer = ktext_load_file(&kbuffer_size); 633 | if (kbuffer == NULL) 634 | goto error; 635 | 636 | /* 637 | * When serializing to disk, we store keys texts immediately after their 638 | * entry data. Any orphaned keys texts are thereby excluded. 639 | */ 640 | hash_seq_init(&hash_seq, pgsrt_hash); 641 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 642 | { 643 | int len = entry->keys_len; 644 | char *kstr = ktext_fetch(entry->keys_offset, len, 645 | kbuffer, kbuffer_size); 646 | 647 | if (kstr == NULL) 648 | continue; /* Ignore any entries with bogus texts */ 649 | 650 | if (fwrite(entry, sizeof(pgsrtEntry), 1, file) != 1 || 651 | fwrite(kstr, 1, len + 1, file) != len + 1) 652 | { 653 | /* note: we assume hash_seq_term won't change errno */ 654 | hash_seq_term(&hash_seq); 655 | goto error; 656 | } 657 | } 658 | 659 | free(kbuffer); 660 | kbuffer = NULL; 661 | 662 | if (FreeFile(file)) 663 | { 664 | file = NULL; 665 | goto error; 666 | } 667 | 668 | /* 669 | * Rename file into place, so we atomically replace any old one. 
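 * durable_rename() also fsyncs the file and its containing directory, so a
 * crash during shutdown can't leave a torn stats file behind.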
670 | */ 671 | (void) durable_rename(PGSRT_DUMP_FILE ".tmp", PGSRT_DUMP_FILE, LOG); 672 | 673 | /* Unlink keys-texts file; it's not needed while shutdown */ 674 | unlink(PGSRT_TEXT_FILE); 675 | 676 | return; 677 | 678 | error: 679 | ereport(LOG, 680 | (errcode_for_file_access(), 681 | errmsg("could not write file \"%s\": %m", 682 | PGSRT_DUMP_FILE ".tmp"))); 683 | if (kbuffer) 684 | free(kbuffer); 685 | if (file) 686 | FreeFile(file); 687 | unlink(PGSRT_DUMP_FILE ".tmp"); 688 | unlink(PGSRT_TEXT_FILE); 689 | } 690 | 691 | /* 692 | * Save this query's queryId if it's not a parallel worker 693 | */ 694 | static void 695 | pgsrt_ExecutorStart(QueryDesc *queryDesc, int eflags) 696 | { 697 | #if PG_VERSION_NUM >= 90600 698 | if (pgsrt_enabled && !IsParallelWorker()) 699 | pgsrt_set_queryid(queryDesc->plannedstmt->queryId); 700 | #endif 701 | 702 | if (prev_ExecutorStart) 703 | prev_ExecutorStart(queryDesc, eflags); 704 | else 705 | standard_ExecutorStart(queryDesc, eflags); 706 | 707 | } 708 | 709 | /* 710 | * ExecutorRun hook: all we need do is track nesting depth 711 | */ 712 | static void 713 | pgsrt_ExecutorRun(QueryDesc *queryDesc, 714 | ScanDirection direction, 715 | #if PG_VERSION_NUM >= 90600 716 | uint64 count 717 | #else 718 | long count 719 | #endif 720 | #if PG_VERSION_NUM >= 100000 721 | ,bool execute_once 722 | #endif 723 | ) 724 | { 725 | nesting_level++; 726 | PG_TRY(); 727 | { 728 | if (prev_ExecutorRun) 729 | #if PG_VERSION_NUM >= 100000 730 | prev_ExecutorRun(queryDesc, direction, count, execute_once); 731 | #else 732 | prev_ExecutorRun(queryDesc, direction, count); 733 | #endif 734 | else 735 | #if PG_VERSION_NUM >= 100000 736 | standard_ExecutorRun(queryDesc, direction, count, execute_once); 737 | #else 738 | standard_ExecutorRun(queryDesc, direction, count); 739 | #endif 740 | nesting_level--; 741 | } 742 | PG_CATCH(); 743 | { 744 | nesting_level--; 745 | PG_RE_THROW(); 746 | } 747 | PG_END_TRY(); 748 | } 749 | 750 | /* 751 | * ExecutorFinish hook: all we need do is track nesting depth 752 | */ 753 | static void 754 | pgsrt_ExecutorFinish(QueryDesc *queryDesc) 755 | { 756 | nesting_level++; 757 | PG_TRY(); 758 | { 759 | if (prev_ExecutorFinish) 760 | prev_ExecutorFinish(queryDesc); 761 | else 762 | standard_ExecutorFinish(queryDesc); 763 | nesting_level--; 764 | } 765 | PG_CATCH(); 766 | { 767 | nesting_level--; 768 | PG_RE_THROW(); 769 | } 770 | PG_END_TRY(); 771 | } 772 | 773 | /* 774 | * Walk the planstates, find any sorts and gather their statistics. 
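 * This runs in the ExecutorEnd hook: execution is complete, so the counters
 * are final, but the planstate tree (including each Sort node's tuplesort
 * state) is still alive.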
*/ 776 | static void 777 | pgsrt_ExecutorEnd(QueryDesc *queryDesc) 778 | { 779 | /* retrieve sort information; the main work starts from here */ 780 | if (pgsrt_enabled) 781 | { 782 | pgsrtWalkerContext context; 783 | 784 | context.queryDesc = queryDesc; 785 | context.ancestors = NIL; 786 | 787 | pgsrt_planstate_walker(queryDesc->planstate, &context); 788 | 789 | #if PG_VERSION_NUM >= 90600 790 | /* Remove the saved queryid for safety */ 791 | if (!IsParallelWorker()) 792 | pgsrt_set_queryid(0); 793 | #endif 794 | } 795 | 796 | if (prev_ExecutorEnd) 797 | prev_ExecutorEnd(queryDesc); 798 | else 799 | standard_ExecutorEnd(queryDesc); 800 | } 801 | 802 | static Size 803 | pgsrt_memsize(void) 804 | { 805 | Size size; 806 | 807 | size = MAXALIGN(sizeof(pgsrtSharedState)); 808 | size = add_size(size, hash_estimate_size(pgsrt_max, sizeof(pgsrtEntry))); 809 | #if PG_VERSION_NUM >= 90600 810 | size = add_size(size, pgsrt_queryids_size()); 811 | #endif 812 | 813 | return size; 814 | } 815 | 816 | #if PG_VERSION_NUM >= 90600 817 | /* Parallel workers won't have their queryid set up. We store the leader 818 | * process' queryid in shared memory so that workers can find which queryid 819 | * they're actually executing. 820 | */ 821 | static Size 822 | pgsrt_queryids_size(void) 823 | { 824 | #if PG_VERSION_NUM >= 150000 825 | Assert(MaxBackends > 0); 826 | /* We need an extra slot since BackendId numbering starts at 1. */ 827 | #define PGSRT_NB_BACKEND_SLOT (MaxBackends + 1) 828 | #elif PG_VERSION_NUM >= 120000 829 | /* We need room for all possible backends, plus the autovacuum launcher 830 | * and workers, plus the background workers, and an extra one since 831 | * BackendId numbering starts at 1. 832 | * Starting with pg12, wal senders aren't part of MaxConnections anymore, 833 | * so they need to be accounted for. 834 | */ 835 | #define PGSRT_NB_BACKEND_SLOT (MaxConnections \ 836 | + autovacuum_max_workers + 1 \ 837 | + max_worker_processes \ 838 | + max_wal_senders + 1) 839 | #else 840 | #define PGSRT_NB_BACKEND_SLOT (MaxConnections \ 841 | + autovacuum_max_workers + 1 \ 842 | + max_worker_processes + 1) 843 | #endif 844 | 845 | return MAXALIGN(sizeof(pgsrt_queryid) * PGSRT_NB_BACKEND_SLOT); 846 | } 847 | 848 | static pgsrt_queryid 849 | pgsrt_get_queryid(void) 850 | { 851 | pgsrt_queryid queryId; 852 | 853 | Assert(IsParallelWorker()); 854 | Assert(MyBackendId <= PGSRT_NB_BACKEND_SLOT); 855 | 856 | LWLockAcquire(pgsrt->queryids_lock, LW_SHARED); 857 | queryId = pgsrt->queryids[ParallelLeaderBackendId]; 858 | LWLockRelease(pgsrt->queryids_lock); 859 | 860 | return queryId; 861 | } 862 | 863 | static void 864 | pgsrt_set_queryid(pgsrt_queryid queryId) 865 | { 866 | Assert(!IsParallelWorker()); 867 | Assert(MyBackendId <= PGSRT_NB_BACKEND_SLOT); 868 | 869 | LWLockAcquire(pgsrt->queryids_lock, LW_EXCLUSIVE); 870 | pgsrt->queryids[MyBackendId] = queryId; 871 | LWLockRelease(pgsrt->queryids_lock); 872 | } 873 | #endif 874 | 875 | /* 876 | * Allocate a new hashtable entry.
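 * (making room by discarding the least-used entries when the table is full)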
877 | * The caller must hold an exclusive lock on pgsrt->lock. 878 | */ 879 | static pgsrtEntry * 880 | pgsrt_entry_alloc(pgsrtHashKey *key, Size keys_offset, int keys_len, 881 | int encoding, int nbkeys) 882 | { 883 | pgsrtEntry *entry; 884 | bool found; 885 | 886 | /* Make space if needed */ 887 | while (hash_get_num_entries(pgsrt_hash) >= pgsrt_max) 888 | pgsrt_entry_dealloc(); 889 | 890 | /* Find or create an entry with desired hash code */ 891 | entry = (pgsrtEntry *) hash_search(pgsrt_hash, key, HASH_ENTER, &found); 892 | 893 | if (!found) 894 | { 895 | /* New entry, initialize it */ 896 | 897 | /* reset the statistics */ 898 | memset(&entry->counters, 0, sizeof(pgsrtCounters)); 899 | /* set the appropriate initial usage count */ 900 | entry->counters.usage = USAGE_INIT; 901 | /* re-initialize the mutex each time ... we assume no one is using it */ 902 | SpinLockInit(&entry->mutex); 903 | /* set non counters fields */ 904 | Assert(keys_len >= 0); 905 | entry->nbkeys = nbkeys; 906 | entry->keys_offset = keys_offset; 907 | entry->keys_len = keys_len; 908 | entry->encoding = encoding; 909 | } 910 | 911 | return entry; 912 | } 913 | 914 | /* 915 | * qsort comparator for sorting into increasing usage order 916 | */ 917 | static int 918 | entry_cmp(const void *lhs, const void *rhs) 919 | { 920 | double l_usage = (*(pgsrtEntry *const *) lhs)->counters.usage; 921 | double r_usage = (*(pgsrtEntry *const *) rhs)->counters.usage; 922 | 923 | if (l_usage < r_usage) 924 | return -1; 925 | else if (l_usage > r_usage) 926 | return +1; 927 | else 928 | return 0; 929 | } 930 | 931 | /* 932 | * Deallocate least used entries. 933 | * 934 | * Caller must hold an exclusive lock on pgsrt->lock. 935 | */ 936 | static void 937 | pgsrt_entry_dealloc(void) 938 | { 939 | HASH_SEQ_STATUS hash_seq; 940 | pgsrtEntry **entries; 941 | pgsrtEntry *entry; 942 | int nvictims; 943 | int i; 944 | Size tottextlen; 945 | int nvalidtexts; 946 | 947 | /* 948 | * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. 949 | * While we're scanning the table, apply the decay factor to the usage 950 | * values. 951 | * 952 | * Note that the mean keys length is almost immediately obsolete, since 953 | * we compute it before, not after, discarding the least-used entries. 954 | * Hopefully, that doesn't affect the mean too much; it doesn't seem worth 955 | * making two passes to get a more current result. Likewise, the new 956 | * cur_median_usage includes the entries we're about to zap. 957 | */ 958 | entries = palloc(hash_get_num_entries(pgsrt_hash) * sizeof(pgsrtEntry *)); 959 | 960 | i = 0; 961 | tottextlen = 0; 962 | nvalidtexts = 0; 963 | 964 | hash_seq_init(&hash_seq, pgsrt_hash); 965 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 966 | { 967 | entries[i++] = entry; 968 | entry->counters.usage *= USAGE_DECREASE_FACTOR; 969 | /* In the mean length computation, ignore dropped texts.
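 * For example (illustrative numbers): three valid texts with keys_len of
 * 10, 20 and 30 yield tottextlen = 63, as each text is counted with its
 * trailing NUL, so mean_keys_len becomes 63 / 3 = 21, which
 * need_gc_ktexts() later compares against the file extent.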
*/ 970 | if (entry->keys_len >= 0) 971 | { 972 | tottextlen += entry->keys_len + 1; 973 | nvalidtexts++; 974 | } 975 | } 976 | 977 | /* Sort into increasing order by usage */ 978 | qsort(entries, i, sizeof(pgsrtEntry *), entry_cmp); 979 | 980 | /* Record the (approximate) median usage */ 981 | if (i > 0) 982 | pgsrt->cur_median_usage = entries[i / 2]->counters.usage; 983 | /* Record the mean query length */ 984 | if (nvalidtexts > 0) 985 | pgsrt->mean_keys_len = tottextlen / nvalidtexts; 986 | else 987 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 988 | 989 | /* Now zap an appropriate fraction of lowest-usage entries */ 990 | nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100); 991 | nvictims = Min(nvictims, i); 992 | 993 | for (i = 0; i < nvictims; i++) 994 | { 995 | hash_search(pgsrt_hash, &entries[i]->key, HASH_REMOVE, NULL); 996 | } 997 | 998 | pfree(entries); 999 | } 1000 | 1001 | /* Remove all saved entries in shmem */ 1002 | static void 1003 | pgsrt_entry_reset(void) 1004 | { 1005 | HASH_SEQ_STATUS hash_seq; 1006 | pgsrtEntry *entry; 1007 | 1008 | LWLockAcquire(pgsrt->lock, LW_EXCLUSIVE); 1009 | 1010 | hash_seq_init(&hash_seq, pgsrt_hash); 1011 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 1012 | { 1013 | hash_search(pgsrt_hash, &entry->key, HASH_REMOVE, NULL); 1014 | } 1015 | 1016 | LWLockRelease(pgsrt->lock); 1017 | } 1018 | 1019 | 1020 | /* 1021 | * Store some statistics for a sort. 1022 | */ 1023 | static void 1024 | pgsrt_store(pgsrt_queryid queryId, int nbkeys, char *keys, 1025 | pgsrtCounters *counters) 1026 | { 1027 | volatile pgsrtEntry *e; 1028 | pgsrtHashKey key; 1029 | pgsrtEntry *entry; 1030 | 1031 | Assert(keys != NULL); 1032 | 1033 | /* Safety check... */ 1034 | if (!pgsrt || !pgsrt_hash) 1035 | return; 1036 | 1037 | /* Set up key for hashtable search */ 1038 | key.userid = GetUserId(); 1039 | key.dbid = MyDatabaseId; 1040 | key.queryid = queryId; 1041 | key.sortid = (uint32) hash_any((unsigned char *) keys, 1042 | strlen(keys)); 1043 | 1044 | /* Lookup the hash table entry with shared lock. */ 1045 | LWLockAcquire(pgsrt->lock, LW_SHARED); 1046 | 1047 | entry = (pgsrtEntry *) hash_search(pgsrt_hash, &key, HASH_FIND, NULL); 1048 | 1049 | /* Create new entry, if not present */ 1050 | if (!entry) 1051 | { 1052 | Size keys_offset; 1053 | int keys_len = strlen(keys); 1054 | int gc_count; 1055 | bool stored; 1056 | bool do_gc; 1057 | 1058 | /* Append new keys text to file with only shared lock held */ 1059 | stored = ktext_store(keys, keys_len, &keys_offset, &gc_count); 1060 | 1061 | /* 1062 | * Determine whether we need to garbage collect external keys texts 1063 | * while the shared lock is still held. This micro-optimization 1064 | * avoids taking the time to decide this while holding exclusive lock. 1065 | */ 1066 | do_gc = need_gc_ktexts(); 1067 | 1068 | /* Need exclusive lock to make a new hashtable entry - promote */ 1069 | LWLockRelease(pgsrt->lock); 1070 | LWLockAcquire(pgsrt->lock, LW_EXCLUSIVE); 1071 | 1072 | /* 1073 | * A garbage collection may have occurred while we weren't holding the 1074 | * lock. In the unlikely event that this happens, the keys text we 1075 | * stored above will have been garbage collected, so write it again. 1076 | * This should be infrequent enough that doing it while holding 1077 | * exclusive lock isn't a performance problem. 
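 *
 * To summarize the locking dance (a sketch of the surrounding code, not
 * additional logic):
 *
 *     LWLockAcquire(pgsrt->lock, LW_SHARED);
 *     stored = ktext_store(keys, keys_len, &keys_offset, &gc_count);
 *     do_gc = need_gc_ktexts();
 *     LWLockRelease(pgsrt->lock);                      -- promote the lock
 *     LWLockAcquire(pgsrt->lock, LW_EXCLUSIVE);
 *     if (!stored || pgsrt->gc_count != gc_count)      -- GC ran in between?
 *         stored = ktext_store(keys, keys_len, &keys_offset, NULL);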
1078 | */ 1079 | if (!stored || pgsrt->gc_count != gc_count) 1080 | stored = ktext_store(keys, keys_len, &keys_offset, NULL); 1081 | 1082 | /* If we failed to write to the text file, give up */ 1083 | if (!stored) 1084 | goto done; 1085 | 1086 | /* OK to create a new hashtable entry */ 1087 | entry = pgsrt_entry_alloc(&key, keys_offset, keys_len, 1088 | GetDatabaseEncoding(), nbkeys); 1089 | 1090 | /* If needed, perform garbage collection while exclusive lock held */ 1091 | if (do_gc) 1092 | gc_ktexts(); 1093 | } 1094 | 1095 | /* 1096 | * Grab the spinlock while updating the counters */ 1097 | e = (volatile pgsrtEntry *) entry; 1098 | 1099 | SpinLockAcquire(&e->mutex); 1100 | 1101 | e->counters.usage += 1; 1102 | e->counters.lines += counters->lines; 1103 | e->counters.lines_to_sort += counters->lines_to_sort; 1104 | e->counters.work_mems += counters->work_mems; 1105 | e->counters.topn_sorts += counters->topn_sorts; 1106 | e->counters.quicksorts += counters->quicksorts; 1107 | e->counters.external_sorts += counters->external_sorts; 1108 | e->counters.external_merges += counters->external_merges; 1109 | e->counters.nbtapes += counters->nbtapes; 1110 | e->counters.space_disk += counters->space_disk; 1111 | e->counters.space_memory += counters->space_memory; 1112 | e->counters.non_parallels += counters->non_parallels; 1113 | e->counters.nb_workers += counters->nb_workers; 1114 | 1115 | SpinLockRelease(&e->mutex); 1116 | 1117 | done: 1118 | LWLockRelease(pgsrt->lock); 1119 | } 1120 | 1121 | /* Compute hash value for a pgsrtHashKey. sortid is already hashed */ 1122 | static uint32 1123 | pgsrt_hash_fn(const void *key, Size keysize) 1124 | { 1125 | const pgsrtHashKey *k = (const pgsrtHashKey *) key; 1126 | 1127 | return hash_uint32((uint32) k->userid) ^ 1128 | hash_uint32((uint32) k->dbid) ^ 1129 | hash_uint32((uint32) k->queryid) ^ 1130 | k->sortid; 1131 | } 1132 | 1133 | /* Compare two pgsrtHashKey keys. Zero means match */ 1134 | static int 1135 | pgsrt_match_fn(const void *key1, const void *key2, Size keysize) 1136 | { 1137 | const pgsrtHashKey *k1 = (const pgsrtHashKey *) key1; 1138 | const pgsrtHashKey *k2 = (const pgsrtHashKey *) key2; 1139 | 1140 | if (k1->userid == k2->userid && 1141 | k1->dbid == k2->dbid && 1142 | k1->queryid == k2->queryid && 1143 | k1->sortid == k2->sortid) 1144 | return 0; 1145 | else 1146 | return 1; 1147 | } 1148 | 1149 | /* 1150 | * Given a keys string (not necessarily null-terminated), allocate a new 1151 | * entry in the external keys text file and store the string there. 1152 | * 1153 | * If successful, returns true, and stores the new entry's offset in the file 1154 | * into *keys_offset. Also, if gc_count isn't NULL, *gc_count is set to the 1155 | * number of garbage collections that have occurred so far. 1156 | * 1157 | * On failure, returns false. 1158 | * 1159 | * At least a shared lock on pgsrt->lock must be held by the caller, so as 1160 | * to prevent a concurrent garbage collection. Share-lock-holding callers 1161 | * should pass a gc_count pointer to obtain the number of garbage collections, 1162 | * so that they can recheck the count after obtaining exclusive lock to 1163 | * detect whether a garbage collection occurred (and removed this entry). 
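 *
 * Note that the spinlock-guarded extent reservation gives concurrent
 * writers disjoint file ranges.  For example (illustrative numbers), two
 * backends storing 10-byte and 20-byte keys texts starting from extent 0
 * are handed offsets 0 and 11 respectively, since each reservation is
 * keys_len + 1 bytes to account for the trailing NUL.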
1164 | */ 1165 | static bool 1166 | ktext_store(const char *keys, int keys_len, 1167 | Size *keys_offset, int *gc_count) 1168 | { 1169 | Size off; 1170 | int fd; 1171 | 1172 | /* 1173 | * We use a spinlock to protect extent/n_writers/gc_count, so that 1174 | * multiple processes may execute this function concurrently. 1175 | */ 1176 | { 1177 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1178 | 1179 | SpinLockAcquire(&s->mutex); 1180 | off = s->extent; 1181 | s->extent += keys_len + 1; 1182 | s->n_writers++; 1183 | if (gc_count) 1184 | *gc_count = s->gc_count; 1185 | SpinLockRelease(&s->mutex); 1186 | } 1187 | 1188 | *keys_offset = off; 1189 | 1190 | /* Now write the data into the successfully-reserved part of the file */ 1191 | #if PG_VERSION_NUM < 110000 1192 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY, 1193 | S_IRUSR | S_IWUSR); 1194 | #else 1195 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY); 1196 | #endif 1197 | if (fd < 0) 1198 | goto error; 1199 | 1200 | if (lseek(fd, off, SEEK_SET) != off) 1201 | goto error; 1202 | 1203 | if (write(fd, keys, keys_len) != keys_len) 1204 | goto error; 1205 | if (write(fd, "\0", 1) != 1) 1206 | goto error; 1207 | 1208 | CloseTransientFile(fd); 1209 | 1210 | /* Mark our write complete */ 1211 | { 1212 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1213 | 1214 | SpinLockAcquire(&s->mutex); 1215 | s->n_writers--; 1216 | SpinLockRelease(&s->mutex); 1217 | } 1218 | 1219 | return true; 1220 | 1221 | error: 1222 | ereport(LOG, 1223 | (errcode_for_file_access(), 1224 | errmsg("could not write file \"%s\": %m", 1225 | PGSRT_TEXT_FILE))); 1226 | 1227 | if (fd >= 0) 1228 | CloseTransientFile(fd); 1229 | 1230 | /* Mark our write complete */ 1231 | { 1232 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1233 | 1234 | SpinLockAcquire(&s->mutex); 1235 | s->n_writers--; 1236 | SpinLockRelease(&s->mutex); 1237 | } 1238 | 1239 | return false; 1240 | } 1241 | 1242 | /* 1243 | * Read the external keys text file into a malloc'd buffer. 1244 | * 1245 | * Returns NULL (without throwing an error) if unable to read, eg 1246 | * file not there or insufficient memory. 1247 | * 1248 | * On success, the buffer size is also returned into *buffer_size. 1249 | * 1250 | * This can be called without any lock on pgsrt->lock, but in that case 1251 | * the caller is responsible for verifying that the result is sane. 
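 *
 * Typical usage (a sketch mirroring pg_sortstats() below):
 *
 *     Size  kbuffer_size;
 *     char *kbuffer = ktext_load_file(&kbuffer_size);
 *
 *     if (kbuffer != NULL)
 *     {
 *         char *kstr = ktext_fetch(keys_offset, keys_len,
 *                                  kbuffer, kbuffer_size);
 *
 *         ... kstr is NULL if the offset/length don't validate ...
 *         free(kbuffer);
 *     }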
1252 | */ 1253 | static char * 1254 | ktext_load_file(Size *buffer_size) 1255 | { 1256 | char *buf; 1257 | int fd; 1258 | struct stat stat; 1259 | 1260 | #if PG_VERSION_NUM < 110000 1261 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDONLY | PG_BINARY, 0); 1262 | #else 1263 | fd = OpenTransientFile(PGSRT_TEXT_FILE, O_RDONLY | PG_BINARY); 1264 | #endif 1265 | if (fd < 0) 1266 | { 1267 | if (errno != ENOENT) 1268 | ereport(LOG, 1269 | (errcode_for_file_access(), 1270 | errmsg("could not read file \"%s\": %m", 1271 | PGSRT_TEXT_FILE))); 1272 | return NULL; 1273 | } 1274 | 1275 | /* Get file length */ 1276 | if (fstat(fd, &stat)) 1277 | { 1278 | ereport(LOG, 1279 | (errcode_for_file_access(), 1280 | errmsg("could not stat file \"%s\": %m", 1281 | PGSRT_TEXT_FILE))); 1282 | CloseTransientFile(fd); 1283 | return NULL; 1284 | } 1285 | 1286 | /* Allocate buffer; beware that off_t might be wider than size_t */ 1287 | if (stat.st_size <= MaxAllocHugeSize) 1288 | buf = (char *) malloc(stat.st_size); 1289 | else 1290 | buf = NULL; 1291 | if (buf == NULL) 1292 | { 1293 | ereport(LOG, 1294 | (errcode(ERRCODE_OUT_OF_MEMORY), 1295 | errmsg("out of memory"), 1296 | errdetail("Could not allocate enough memory to read file \"%s\".", 1297 | PGSRT_TEXT_FILE))); 1298 | CloseTransientFile(fd); 1299 | return NULL; 1300 | } 1301 | 1302 | /* 1303 | * OK, slurp in the file. If we get a short read and errno doesn't get 1304 | * set, the reason is probably that garbage collection truncated the file 1305 | * since we did the fstat(), so we don't log a complaint --- but we don't 1306 | * return the data, either, since it's most likely corrupt due to 1307 | * concurrent writes from garbage collection. 1308 | */ 1309 | errno = 0; 1310 | if (read(fd, buf, stat.st_size) != stat.st_size) 1311 | { 1312 | if (errno) 1313 | ereport(LOG, 1314 | (errcode_for_file_access(), 1315 | errmsg("could not read file \"%s\": %m", 1316 | PGSRT_TEXT_FILE))); 1317 | free(buf); 1318 | CloseTransientFile(fd); 1319 | return NULL; 1320 | } 1321 | 1322 | CloseTransientFile(fd); 1323 | 1324 | *buffer_size = stat.st_size; 1325 | return buf; 1326 | } 1327 | 1328 | /* 1329 | * Locate a keys text in the file image previously read by ktext_load_file(). 1330 | * 1331 | * We validate the given offset/length, and return NULL if bogus. Otherwise, 1332 | * the result points to a null-terminated string within the buffer. 1333 | */ 1334 | static char * 1335 | ktext_fetch(Size keys_offset, int keys_len, 1336 | char *buffer, Size buffer_size) 1337 | { 1338 | /* File read failed? */ 1339 | if (buffer == NULL) 1340 | return NULL; 1341 | /* Bogus offset/length? */ 1342 | if (keys_len < 0 || 1343 | keys_offset + keys_len >= buffer_size) 1344 | return NULL; 1345 | /* As a further sanity check, make sure there's a trailing null */ 1346 | if (buffer[keys_offset + keys_len] != '\0') 1347 | return NULL; 1348 | /* Looks OK */ 1349 | return buffer + keys_offset; 1350 | } 1351 | 1352 | /* 1353 | * Do we need to garbage-collect the external keys text file? 1354 | * 1355 | * Caller should hold at least a shared lock on pgsrt->lock. 
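 *
 * As a worked example (illustrative numbers): with pgsrt_max = 1000 and
 * mean_keys_len = 64, both tests below only pass once the file extent
 * reaches 100 * 1000 = 100000 bytes and 2 * 64 * 1000 = 128000 bytes, so
 * garbage collection starts being considered at 128000 bytes here.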
1356 | */ 1357 | static bool 1358 | need_gc_ktexts(void) 1359 | { 1360 | Size extent; 1361 | 1362 | /* Read shared extent pointer */ 1363 | { 1364 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 1365 | 1366 | SpinLockAcquire(&s->mutex); 1367 | extent = s->extent; 1368 | SpinLockRelease(&s->mutex); 1369 | } 1370 | 1371 | /* Don't proceed if file does not exceed 100 bytes per possible entry */ 1372 | if (extent < 100 * pgsrt_max) 1373 | return false; 1374 | 1375 | /* 1376 | * Don't proceed if file is less than about 50% bloat. Nothing can or 1377 | * should be done in the event of unusually large keys texts accounting 1378 | * for file's large size. We go to the trouble of maintaining the mean 1379 | * keys length in order to prevent garbage collection from thrashing 1380 | * uselessly. 1381 | */ 1382 | if (extent < pgsrt->mean_keys_len * pgsrt_max * 2) 1383 | return false; 1384 | 1385 | return true; 1386 | } 1387 | 1388 | /* 1389 | * Garbage-collect orphaned keys texts in external file. 1390 | * 1391 | * This won't be called often in the typical case, since it's likely that 1392 | * there won't be too much churn, and besides, a similar compaction process 1393 | * occurs when serializing to disk at shutdown or as part of resetting. 1394 | * Despite this, it seems prudent to plan for the edge case where the file 1395 | * becomes unreasonably large, with no other method of compaction likely to 1396 | * occur in the foreseeable future. 1397 | * 1398 | * The caller must hold an exclusive lock on pgsrt->lock. 1399 | * 1400 | * At the first sign of trouble we unlink the keys text file to get a clean 1401 | * slate (although existing statistics are retained), rather than risk 1402 | * thrashing by allowing the same problem case to recur indefinitely. 1403 | */ 1404 | static void 1405 | gc_ktexts(void) 1406 | { 1407 | char *kbuffer; 1408 | Size kbuffer_size; 1409 | FILE *kfile = NULL; 1410 | HASH_SEQ_STATUS hash_seq; 1411 | pgsrtEntry *entry; 1412 | Size extent; 1413 | int nentries; 1414 | 1415 | /* 1416 | * When called from pgsrt_store, some other session might have proceeded 1417 | * with garbage collection in the no-lock-held interim of lock strength 1418 | * escalation. Check once more that this is actually necessary. 1419 | */ 1420 | if (!need_gc_ktexts()) 1421 | return; 1422 | 1423 | /* 1424 | * Load the old texts file. If we fail (out of memory, for instance), 1425 | * invalidate keys texts. Hopefully this is rare. It might seem better 1426 | * to leave things alone on an OOM failure, but the problem is that the 1427 | * file is only going to get bigger; hoping for a future non-OOM result is 1428 | * risky and can easily lead to complete denial of service. 1429 | */ 1430 | kbuffer = ktext_load_file(&kbuffer_size); 1431 | if (kbuffer == NULL) 1432 | goto gc_fail; 1433 | 1434 | /* 1435 | * We overwrite the keys texts file in place, so as to reduce the risk of 1436 | * an out-of-disk-space failure. Since the file is guaranteed not to get 1437 | * larger, this should always work on traditional filesystems; though we 1438 | * could still lose on copy-on-write filesystems. 
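 *
 * In short, the compaction loop below does the following (a sketch of the
 * code that follows, not additional logic):
 *
 *     extent = 0;
 *     for each hashtable entry:
 *         copy its keys text from the in-memory image into the file;
 *         entry->keys_offset = extent;
 *         extent += entry->keys_len + 1;
 *     ftruncate(file, extent);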
1439 | */ 1440 | kfile = AllocateFile(PGSRT_TEXT_FILE, PG_BINARY_W); 1441 | if (kfile == NULL) 1442 | { 1443 | ereport(LOG, 1444 | (errcode_for_file_access(), 1445 | errmsg("could not write file \"%s\": %m", 1446 | PGSRT_TEXT_FILE))); 1447 | goto gc_fail; 1448 | } 1449 | 1450 | extent = 0; 1451 | nentries = 0; 1452 | 1453 | hash_seq_init(&hash_seq, pgsrt_hash); 1454 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 1455 | { 1456 | int keys_len = entry->keys_len; 1457 | char *qry = ktext_fetch(entry->keys_offset, 1458 | keys_len, 1459 | kbuffer, 1460 | kbuffer_size); 1461 | 1462 | if (qry == NULL) 1463 | { 1464 | /* Trouble ... drop the text */ 1465 | entry->keys_offset = 0; 1466 | entry->keys_len = -1; 1467 | /* entry will not be counted in mean keys length computation */ 1468 | continue; 1469 | } 1470 | 1471 | if (fwrite(qry, 1, keys_len + 1, kfile) != keys_len + 1) 1472 | { 1473 | ereport(LOG, 1474 | (errcode_for_file_access(), 1475 | errmsg("could not write file \"%s\": %m", 1476 | PGSRT_TEXT_FILE))); 1477 | hash_seq_term(&hash_seq); 1478 | goto gc_fail; 1479 | } 1480 | 1481 | entry->keys_offset = extent; 1482 | extent += keys_len + 1; 1483 | nentries++; 1484 | } 1485 | 1486 | /* 1487 | * Truncate away any now-unused space. If this fails for some odd reason, 1488 | * we log it, but there's no need to fail. 1489 | */ 1490 | if (ftruncate(fileno(kfile), extent) != 0) 1491 | ereport(LOG, 1492 | (errcode_for_file_access(), 1493 | errmsg("could not truncate file \"%s\": %m", 1494 | PGSRT_TEXT_FILE))); 1495 | 1496 | if (FreeFile(kfile)) 1497 | { 1498 | ereport(LOG, 1499 | (errcode_for_file_access(), 1500 | errmsg("could not write file \"%s\": %m", 1501 | PGSRT_TEXT_FILE))); 1502 | kfile = NULL; 1503 | goto gc_fail; 1504 | } 1505 | 1506 | elog(DEBUG1, "pgsrt gc of keys file shrunk size from %zu to %zu", 1507 | pgsrt->extent, extent); 1508 | 1509 | /* Reset the shared extent pointer */ 1510 | pgsrt->extent = extent; 1511 | 1512 | /* 1513 | * Also update the mean keys length, to be sure that need_gc_ktexts() 1514 | * won't still think we have a problem. 1515 | */ 1516 | if (nentries > 0) 1517 | pgsrt->mean_keys_len = extent / nentries; 1518 | else 1519 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 1520 | 1521 | free(kbuffer); 1522 | 1523 | /* 1524 | * OK, count a garbage collection cycle. (Note: even though we have 1525 | * exclusive lock on pgsrt->lock, we must take pgsrt->mutex for this, since 1526 | * other processes may examine gc_count while holding only the mutex. 1527 | * Also, we have to advance the count *after* we've rewritten the file, 1528 | * else other processes might not realize they read a stale file.) 1529 | */ 1530 | record_gc_ktexts(); 1531 | 1532 | return; 1533 | 1534 | gc_fail: 1535 | /* clean up resources */ 1536 | if (kfile) 1537 | FreeFile(kfile); 1538 | if (kbuffer) 1539 | free(kbuffer); 1540 | 1541 | /* 1542 | * Since the contents of the external file are now uncertain, mark all 1543 | * hashtable entries as having invalid texts. 
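 * keys_len = -1 is the "dropped text" marker: ktext_fetch() rejects any
 * negative length, so pg_sortstats() will return a NULL sort_keys value
 * for such entries rather than bogus text.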
1544 | */ 1545 | hash_seq_init(&hash_seq, pgsrt_hash); 1546 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 1547 | { 1548 | entry->keys_offset = 0; 1549 | entry->keys_len = -1; 1550 | } 1551 | 1552 | /* 1553 | * Destroy the keys text file and create a new, empty one 1554 | */ 1555 | (void) unlink(PGSRT_TEXT_FILE); 1556 | kfile = AllocateFile(PGSRT_TEXT_FILE, PG_BINARY_W); 1557 | if (kfile == NULL) 1558 | ereport(LOG, 1559 | (errcode_for_file_access(), 1560 | errmsg("could not recreate file \"%s\": %m", 1561 | PGSRT_TEXT_FILE))); 1562 | else 1563 | FreeFile(kfile); 1564 | 1565 | /* Reset the shared extent pointer */ 1566 | pgsrt->extent = 0; 1567 | 1568 | /* Reset mean_keys_len to match the new state */ 1569 | pgsrt->mean_keys_len = ASSUMED_LENGTH_INIT; 1570 | 1571 | /* 1572 | * Bump the GC count even though we failed. 1573 | * 1574 | * This is needed to make concurrent readers of file without any lock on 1575 | * pgsrt->lock notice existence of new version of file. Once readers 1576 | * subsequently observe a change in GC count with pgsrt->lock held, that 1577 | * forces a safe reopen of file. Writers also require that we bump here, 1578 | * of course. (As required by locking protocol, readers and writers don't 1579 | * trust earlier file contents until gc_count is found unchanged after 1580 | * pgsrt->lock acquired in shared or exclusive mode respectively.) 1581 | */ 1582 | record_gc_ktexts(); 1583 | } 1584 | 1585 | static void 1586 | pgsrt_process_sortstate(SortState *srtstate, pgsrtWalkerContext *context) 1587 | { 1588 | Plan *plan = srtstate->ss.ps.plan; 1589 | Tuplesortstate *state = (Tuplesortstate *) srtstate->tuplesortstate; 1590 | #if PG_VERSION_NUM >= 110000 1591 | TuplesortInstrumentation stats; 1592 | #endif 1593 | Sort *sort = (Sort *) plan; 1594 | pgsrt_queryid queryId; 1595 | pgsrtCounters counters; 1596 | char *deparsed; 1597 | int nbtapes = 0; 1598 | #if PG_VERSION_NUM < 110000 1599 | const char *sortMethod; 1600 | const char *spaceType; 1601 | #endif 1602 | long spaceUsed; 1603 | bool found; 1604 | int memtupsize_palloc; /* tuplesort's main storage total size, 1605 | including palloc overhead */ 1606 | int tuple_palloc; /* average tuple size, including palloc overhead */ 1607 | int64 lines, /* number of lines underlying node returned */ 1608 | lines_to_sort, /* number of lines the sort will actually 1609 | process (may differ when bounded) */ 1610 | memtupsize_length, /* tuplesort's main storage array size */ 1611 | w_m; /* estimated work_mem */ 1612 | 1613 | Assert(state); 1614 | 1615 | /* First estimate the size of the main array that stores the lines */ 1616 | lines = 0; 1617 | /* get effective number of lines fed to the sort if available */ 1618 | if (srtstate->ss.ps.instrument) 1619 | lines = srtstate->ss.ps.instrument->ntuples; 1620 | 1621 | /* fallback to estimated # of lines if no value */ 1622 | if (lines == 0) 1623 | lines = sort->plan.plan_rows; 1624 | 1625 | /* 1626 | * If the sort is bounded, set the number of lines to sort 1627 | * accordingly, otherwise use the Sort input lines count. 1628 | */ 1629 | if (srtstate->bounded) 1630 | lines_to_sort = srtstate->bound; 1631 | else 1632 | lines_to_sort = lines; 1633 | 1634 | /* The minimal memtupsize is 1024 */ 1635 | memtupsize_length = Max(1024, lines_to_sort); 1636 | /* 1637 | * growth is done by doubling the size each time with a minimum of 1024 1638 | * entries, so we'll have a power of 2. 
No need to deal with the last 1639 | * growth special rule, there's no way we can exhaust the work_mem for the 1640 | * main array and still put all the rows to sort in memory 1641 | */ 1642 | memtupsize_length = round_up_pow2(memtupsize_length); 1643 | 1644 | /* compute the memtupsize palloc size */ 1645 | memtupsize_palloc = sizeof(SortTuple) * memtupsize_length; 1646 | memtupsize_palloc += PGSRT_ALLOC_CHUNKHDRSZ; 1647 | 1648 | /* 1649 | * Then estimate the per-line space used. We use the average row width, 1650 | * and add the fixed MinimalTuple header overhead 1651 | * FIXME: take into account NULLs, OIDs and alignment lost bytes 1652 | */ 1653 | tuple_palloc = sort->plan.plan_width + MAXALIGN(SizeofMinimalTupleHeader); 1654 | 1655 | /* Add lost space due to alignment */ 1656 | tuple_palloc += get_alignment_overhead(srtstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor); 1657 | 1658 | /* 1659 | * Each tuple is palloced, and a palloced chunk uses a 2^N size unless size 1660 | * is more than PGSRT_ALLOC_CHUNK_LIMIT 1661 | */ 1662 | if (tuple_palloc < PGSRT_ALLOC_CHUNK_LIMIT) 1663 | tuple_palloc = round_up_pow2(tuple_palloc); 1664 | 1665 | /* Add the palloc overhead */ 1666 | tuple_palloc += PGSRT_ALLOC_CHUNKHDRSZ; 1667 | 1668 | /* 1669 | * Compute the estimated total work_mem that's needed to perform the 1670 | * sort in memory. First add the space needed for the lines 1671 | */ 1672 | w_m = lines_to_sort * tuple_palloc; 1673 | 1674 | /* 1675 | * If a bounded sort was requested, we'll try to sort only the bound limit 1676 | * number of lines, but a Top-N heapsort may need to be able to store twice 1677 | * that number of rows, so use twice the memory, assuming that the worst 1678 | * case always happens. 1679 | */ 1680 | if (srtstate->bounded) 1681 | w_m *= 2; 1682 | 1683 | /* add the tuplesort's main array storage and we're done */ 1684 | w_m += memtupsize_palloc; 1685 | 1686 | /* convert to kB, and add 1 kB as a quick round-up */ 1687 | w_m /= 1024; 1688 | w_m += 1; 1689 | 1690 | /* deparse the sort keys */ 1691 | deparsed = pgsrt_get_sort_group_keys(srtstate, sort->numCols, 1692 | sort->sortColIdx, sort->sortOperators, sort->collations, 1693 | sort->nullsFirst, context); 1694 | 1695 | #if PG_VERSION_NUM >= 110000 1696 | tuplesort_get_stats(state, &stats); 1697 | //sortMethod = tuplesort_method_name(stats.sortMethod); 1698 | //spaceType = tuplesort_space_type_name(stats.spaceType); 1699 | spaceUsed = stats.spaceUsed; 1700 | #else 1701 | tuplesort_get_stats(state, &sortMethod, &spaceType, &spaceUsed); 1702 | #endif 1703 | 1704 | counters.lines = lines; 1705 | counters.lines_to_sort = lines_to_sort; 1706 | counters.work_mems = w_m; 1707 | found = false; 1708 | #if PG_VERSION_NUM >= 110000 1709 | if (stats.sortMethod == SORT_TYPE_TOP_N_HEAPSORT) 1710 | #else 1711 | if (strcmp(sortMethod, "top-N heapsort") == 0) 1712 | #endif 1713 | { 1714 | counters.topn_sorts = 1; 1715 | found = true; 1716 | } 1717 | else 1718 | counters.topn_sorts = 0; 1719 | 1720 | #if PG_VERSION_NUM >= 110000 1721 | if (stats.sortMethod == SORT_TYPE_QUICKSORT) 1722 | #else 1723 | if (!found && strcmp(sortMethod, "quicksort") == 0) 1724 | #endif 1725 | { 1726 | counters.quicksorts = 1; 1727 | found = true; 1728 | } 1729 | else 1730 | counters.quicksorts = 0; 1731 | 1732 | #if PG_VERSION_NUM >= 110000 1733 | if (stats.sortMethod == SORT_TYPE_EXTERNAL_SORT) 1734 | #else 1735 | if (!found && strcmp(sortMethod, "external sort") == 0) 1736 | #endif 1737 | { 1738 | counters.external_sorts = 1; 1739 | found = true; 1740 | } 1741 |
else 1742 | counters.external_sorts = 0; 1743 | 1744 | #if PG_VERSION_NUM >= 110000 1745 | if (stats.sortMethod == SORT_TYPE_EXTERNAL_MERGE) 1746 | #else 1747 | if (!found && strcmp(sortMethod, "external merge") == 0) 1748 | #endif 1749 | { 1750 | counters.external_merges = 1; 1751 | nbtapes = ((pgsrt_Tuplesortstate *) state)->currentRun + 1; 1752 | found = true; 1753 | } 1754 | else 1755 | counters.external_merges = 0; 1756 | 1757 | if (!found) 1758 | #if PG_VERSION_NUM >= 110000 1759 | elog(ERROR, "unexpected sort method: %d", stats.sortMethod); 1760 | #else 1761 | elog(ERROR, "unexpected sort method: %s", sortMethod); 1762 | #endif 1763 | 1764 | counters.nbtapes = nbtapes; 1765 | 1766 | #if PG_VERSION_NUM >= 110000 1767 | if (stats.spaceType == SORT_SPACE_TYPE_DISK) 1768 | #else 1769 | if (strcmp(spaceType, "Disk") == 0) 1770 | #endif 1771 | { 1772 | counters.space_disk = spaceUsed; 1773 | counters.space_memory = 0; 1774 | } 1775 | else 1776 | { 1777 | counters.space_disk = 0; 1778 | counters.space_memory = spaceUsed; 1779 | } 1780 | 1781 | #if PG_VERSION_NUM >= 110000 1782 | if (srtstate->shared_info) { 1783 | counters.non_parallels = 0; 1784 | /* 1785 | * We compute the total number of processes participating in the sort, 1786 | * so we have to increment the number of workers to take the gather 1787 | * node into account. 1788 | */ 1789 | counters.nb_workers = srtstate->shared_info->num_workers + 1; 1790 | } 1791 | else 1792 | { 1793 | counters.non_parallels = 1; 1794 | counters.nb_workers = 0; 1795 | } 1796 | #else 1797 | counters.non_parallels = 1; 1798 | counters.nb_workers = 0; 1799 | #endif 1800 | 1801 | #if PG_VERSION_NUM >= 90600 1802 | if (IsParallelWorker()) 1803 | queryId = pgsrt_get_queryid(); 1804 | else 1805 | queryId = context->queryDesc->plannedstmt->queryId; 1806 | #else 1807 | queryId = context->queryDesc->plannedstmt->queryId; 1808 | #endif 1809 | 1810 | pgsrt_store(queryId, sort->numCols, deparsed, &counters); 1811 | 1812 | //elog(WARNING, "sort info:\n" 1813 | // "keys: %s\n" 1814 | // "type: %s\n" 1815 | // "space type: %s\n" 1816 | // "space: %ld kB\n" 1817 | // "lines to sort: %ld\n" 1818 | // "w_m estimated: %ld kB\n" 1819 | // "nbTapes: %d\n" 1820 | #if PG_VERSION_NUM >= 110000 1821 | // "parallel: %s (%d)\n" 1822 | #endif 1823 | // "bounded? %s - %s , bound %ld - %ld", 1824 | // deparsed, 1825 | // sortMethod, 1826 | // spaceType, 1827 | // spaceUsed, 1828 | // lines_to_sort, 1829 | // w_m, 1830 | // nbtapes, 1831 | #if PG_VERSION_NUM >= 110000 1832 | // (srtstate->shared_info ? "yes" : "no"),(srtstate->shared_info ? srtstate->shared_info->num_workers : -1), 1833 | #endif 1834 | // (srtstate->bounded ? "yes":"no"),(srtstate->bounded_Done ? "yes":"no"), srtstate->bound, srtstate->bound_Done); 1835 | } 1836 | 1837 | /* 1838 | * Walker function that recurses into the planstate tree, looking for Sort nodes.
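 *
 * planstate_tree_walker() follows the usual PostgreSQL walker convention:
 * returning false continues the recursion, returning true aborts the walk
 * early.  The general shape of such a walker (illustrative name) is:
 *
 *     static bool
 *     my_walker(PlanState *ps, void *context)
 *     {
 *         if (IsA(ps, SortState))
 *             ... handle the sort node ...
 *         return planstate_tree_walker(ps, my_walker, context);
 *     }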
1839 | */ 1840 | static bool pgsrt_planstate_walker(PlanState *ps, pgsrtWalkerContext *context) 1841 | { 1842 | if (IsA(ps, SortState)) 1843 | { 1844 | SortState *srtstate = (SortState *) ps; 1845 | 1846 | if (srtstate->tuplesortstate) 1847 | pgsrt_process_sortstate(srtstate, context); 1848 | } 1849 | 1850 | context->ancestors = lcons(ps, context->ancestors); 1851 | 1852 | return planstate_tree_walker(ps, pgsrt_planstate_walker, context); 1853 | } 1854 | 1855 | /* Adapted from ExplainPrintPlan */ 1856 | static void 1857 | pgsrt_setup_walker_context(pgsrtWalkerContext *context) 1858 | { 1859 | Bitmapset *rels_used = NULL; 1860 | 1861 | /* Set up ExplainState fields associated with this plan tree */ 1862 | Assert(context->queryDesc->plannedstmt != NULL); 1863 | 1864 | context->rtable = context->queryDesc->plannedstmt->rtable; 1865 | pgsrt_PreScanNode(context->queryDesc->planstate, &rels_used); 1866 | context->rtable_names = select_rtable_names_for_explain(context->rtable, 1867 | rels_used); 1868 | #if PG_VERSION_NUM < 130000 1869 | context->deparse_cxt = deparse_context_for_plan_rtable(context->rtable, 1870 | context->rtable_names); 1871 | #else 1872 | context->deparse_cxt = deparse_context_for_plan_tree( 1873 | context->queryDesc->plannedstmt, 1874 | context->rtable_names); 1875 | #endif 1876 | } 1877 | 1878 | /* Adapted from show_sort_group_keys */ 1879 | static char * 1880 | pgsrt_get_sort_group_keys(SortState *srtstate, 1881 | int nkeys, AttrNumber *keycols, 1882 | Oid *sortOperators, Oid *collations, bool *nullsFirst, 1883 | pgsrtWalkerContext *context) 1884 | { 1885 | Plan *plan = srtstate->ss.ps.plan; 1886 | List *dp_context = NIL; 1887 | StringInfoData sortkeybuf; 1888 | bool useprefix; 1889 | int keyno; 1890 | 1891 | if (nkeys <= 0) 1892 | return "nothing?"; 1893 | 1894 | pgsrt_setup_walker_context(context); 1895 | 1896 | initStringInfo(&sortkeybuf); 1897 | 1898 | /* Set up deparsing context */ 1899 | #if PG_VERSION_NUM < 130000 1900 | dp_context = set_deparse_context_planstate(context->deparse_cxt, 1901 | (Node *) srtstate, 1902 | context->ancestors); 1903 | #else 1904 | dp_context = set_deparse_context_plan(context->deparse_cxt, 1905 | plan, 1906 | context->ancestors); 1907 | #endif 1908 | useprefix = (list_length(context->rtable) > 1); 1909 | 1910 | for (keyno = 0; keyno < nkeys; keyno++) 1911 | { 1912 | /* find key expression in tlist */ 1913 | AttrNumber keyresno = keycols[keyno]; 1914 | TargetEntry *target = get_tle_by_resno(plan->targetlist, 1915 | keyresno); 1916 | char *exprstr; 1917 | 1918 | if (keyno != 0) 1919 | appendStringInfoString(&sortkeybuf, ", "); 1920 | 1921 | if (!target) 1922 | elog(ERROR, "no tlist entry for key %d", keyresno); 1923 | /* Deparse the expression, showing any top-level cast */ 1924 | exprstr = deparse_expression((Node *) target->expr, dp_context, 1925 | useprefix, true); 1926 | appendStringInfoString(&sortkeybuf, exprstr); 1927 | 1928 | /* Append sort order information, if relevant */ 1929 | if (sortOperators != NULL) 1930 | pgsrt_show_sortorder_options(&sortkeybuf, 1931 | (Node *) target->expr, 1932 | sortOperators[keyno], 1933 | collations[keyno], 1934 | nullsFirst[keyno]); 1935 | } 1936 | 1937 | return sortkeybuf.data; 1938 | } 1939 | 1940 | /* 1941 | * Reset statistics. 
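 *
 * Exposed at the SQL level; the regression tests call it as:
 *
 *     SELECT pg_sortstats_reset();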
1942 | */ 1943 | PGDLLEXPORT Datum 1944 | pg_sortstats_reset(PG_FUNCTION_ARGS) 1945 | { 1946 | if (!pgsrt) 1947 | ereport(ERROR, 1948 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), 1949 | errmsg("pg_sortstats must be loaded via shared_preload_libraries"))); 1950 | 1951 | pgsrt_entry_reset(); 1952 | PG_RETURN_VOID(); 1953 | } 1954 | 1955 | Datum 1956 | pg_sortstats(PG_FUNCTION_ARGS) 1957 | { 1958 | bool showtext = PG_GETARG_BOOL(0); 1959 | ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; 1960 | MemoryContext per_query_ctx; 1961 | MemoryContext oldcontext; 1962 | TupleDesc tupdesc; 1963 | Tuplestorestate *tupstore; 1964 | char *kbuffer = NULL; 1965 | Size kbuffer_size = 0; 1966 | Size extent = 0; 1967 | int gc_count = 0; 1968 | HASH_SEQ_STATUS hash_seq; 1969 | pgsrtEntry *entry; 1970 | 1971 | 1972 | if (!pgsrt) 1973 | ereport(ERROR, 1974 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), 1975 | errmsg("pg_sortstats must be loaded via shared_preload_libraries"))); 1976 | /* check to see if caller supports us returning a tuplestore */ 1977 | if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) 1978 | ereport(ERROR, 1979 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 1980 | errmsg("set-valued function called in context that cannot accept a set"))); 1981 | if (!(rsinfo->allowedModes & SFRM_Materialize)) 1982 | ereport(ERROR, 1983 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 1984 | errmsg("materialize mode required, but it is not " \ 1985 | "allowed in this context"))); 1986 | 1987 | /* Switch into long-lived context to construct returned data structures */ 1988 | per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; 1989 | oldcontext = MemoryContextSwitchTo(per_query_ctx); 1990 | 1991 | /* Build a tuple descriptor for our result type */ 1992 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) 1993 | elog(ERROR, "return type must be a row type"); 1994 | 1995 | tupstore = tuplestore_begin_heap(true, false, work_mem); 1996 | rsinfo->returnMode = SFRM_Materialize; 1997 | rsinfo->setResult = tupstore; 1998 | rsinfo->setDesc = tupdesc; 1999 | 2000 | MemoryContextSwitchTo(oldcontext); 2001 | 2002 | /* 2003 | * We'd like to load the keys text file (if needed) while not holding any 2004 | * lock on pgsrt->lock. In the worst case we'll have to do this again 2005 | * after we have the lock, but it's unlikely enough to make this a win 2006 | * despite occasional duplicated work. We need to reload if anybody 2007 | * writes to the file (either a retail ktext_store(), or a garbage 2008 | * collection) between this point and where we've gotten shared lock. If 2009 | * a ktext_store is actually in progress when we look, we might as well 2010 | * skip the speculative load entirely. 2011 | */ 2012 | if (showtext) 2013 | { 2014 | int n_writers; 2015 | 2016 | /* Take the mutex so we can examine variables */ 2017 | { 2018 | volatile pgsrtSharedState *s = (volatile pgsrtSharedState *) pgsrt; 2019 | 2020 | SpinLockAcquire(&s->mutex); 2021 | extent = s->extent; 2022 | n_writers = s->n_writers; 2023 | gc_count = s->gc_count; 2024 | SpinLockRelease(&s->mutex); 2025 | } 2026 | 2027 | /* No point in loading file now if there are active writers */ 2028 | if (n_writers == 0) 2029 | kbuffer = ktext_load_file(&kbuffer_size); 2030 | } 2031 | 2032 | /* 2033 | * Get shared lock, load or reload the keys text file if we must, and 2034 | * iterate over the hashtable entries. 2035 | * 2036 | * With a large hash table, we might be holding the lock rather longer 2037 | * than one could wish. 
However, this only blocks creation of new hash 2038 | * table entries, and the larger the hash table the less likely that is to 2039 | * be needed. So we can hope this is okay. Perhaps someday we'll decide 2040 | * we need to partition the hash table to limit the time spent holding any 2041 | * one lock. 2042 | */ 2043 | LWLockAcquire(pgsrt->lock, LW_SHARED); 2044 | 2045 | if (showtext) 2046 | { 2047 | /* 2048 | * Here it is safe to examine extent and gc_count without taking the 2049 | * mutex. Note that although other processes might change 2050 | * pgsrt->extent just after we look at it, the strings they then write 2051 | * into the file cannot yet be referenced in the hashtable, so we 2052 | * don't care whether we see them or not. 2053 | * 2054 | * If ktext_load_file fails, we just press on; we'll return NULL for 2055 | * every keys text. 2056 | */ 2057 | if (kbuffer == NULL || 2058 | pgsrt->extent != extent || 2059 | pgsrt->gc_count != gc_count) 2060 | { 2061 | if (kbuffer) 2062 | free(kbuffer); 2063 | kbuffer = ktext_load_file(&kbuffer_size); 2064 | } 2065 | } 2066 | 2067 | hash_seq_init(&hash_seq, pgsrt_hash); 2068 | while ((entry = hash_seq_search(&hash_seq)) != NULL) 2069 | { 2070 | Datum values[PGSRT_COLUMNS]; 2071 | bool nulls[PGSRT_COLUMNS]; 2072 | pgsrtCounters tmp; 2073 | int i = 0; 2074 | 2075 | memset(values, 0, sizeof(values)); 2076 | memset(nulls, 0, sizeof(nulls)); 2077 | 2078 | /* copy counters to a local variable to keep locking time short */ 2079 | { 2080 | volatile pgsrtEntry *e = (volatile pgsrtEntry *) entry; 2081 | 2082 | SpinLockAcquire(&e->mutex); 2083 | tmp = e->counters; 2084 | SpinLockRelease(&e->mutex); 2085 | } 2086 | 2087 | values[i++] = Int64GetDatumFast(entry->key.queryid); 2088 | values[i++] = ObjectIdGetDatum(entry->key.userid); 2089 | values[i++] = ObjectIdGetDatum(entry->key.dbid); 2090 | values[i++] = Int32GetDatum(entry->nbkeys); 2091 | if (showtext) 2092 | { 2093 | char *kstr = ktext_fetch(entry->keys_offset, 2094 | entry->keys_len, 2095 | kbuffer, 2096 | kbuffer_size); 2097 | 2098 | if (kstr) 2099 | { 2100 | char *enc; 2101 | 2102 | enc = pg_any_to_server(kstr, 2103 | entry->keys_len, 2104 | entry->encoding); 2105 | 2106 | values[i++] = CStringGetTextDatum(enc); 2107 | 2108 | if (enc != kstr) 2109 | pfree(enc); 2110 | } 2111 | else 2112 | { 2113 | /* Just return a null if we fail to find the text */ 2114 | nulls[i++] = true; 2115 | } 2116 | } 2117 | else 2118 | { 2119 | /* keys text not requested */ 2120 | nulls[i++] = true; 2121 | } 2122 | values[i++] = Int64GetDatumFast(tmp.lines); 2123 | values[i++] = Int64GetDatumFast(tmp.lines_to_sort); 2124 | values[i++] = Int64GetDatumFast(tmp.work_mems); 2125 | values[i++] = Int64GetDatumFast(tmp.topn_sorts); 2126 | values[i++] = Int64GetDatumFast(tmp.quicksorts); 2127 | values[i++] = Int64GetDatumFast(tmp.external_sorts); 2128 | values[i++] = Int64GetDatumFast(tmp.external_merges); 2129 | values[i++] = Int64GetDatumFast(tmp.nbtapes); 2130 | values[i++] = Int64GetDatumFast(tmp.space_disk); 2131 | values[i++] = Int64GetDatumFast(tmp.space_memory); 2132 | values[i++] = Int64GetDatumFast(tmp.non_parallels); 2133 | #if PG_VERSION_NUM >= 110000 2134 | values[i++] = Int64GetDatumFast(tmp.nb_workers); 2135 | #else 2136 | nulls[i++] = true; 2137 | #endif 2138 | 2139 | Assert(i == PGSRT_COLUMNS); 2140 | 2141 | tuplestore_putvalues(tupstore, tupdesc, values, nulls); 2142 | } 2143 | 2144 | /* clean up and return the tuplestore */ 2145 | LWLockRelease(pgsrt->lock); 2146 | 2147 | if (kbuffer) 2148 | free(kbuffer); 
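	/*
	 * The materialized tuplestore is what the SQL level sees; the
	 * regression tests consume it as, e.g.:
	 *
	 *     SELECT nb_keys, sort_keys, lines, lines_to_sort, ...
	 *     FROM pg_sortstats(true) ORDER BY nb_keys;
	 *
	 * Calling pg_sortstats(false) skips the keys text file entirely and
	 * returns NULL for sort_keys.
	 */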
2149 | 2150 | tuplestore_donestoring(tupstore); 2151 | return (Datum) 0; 2152 | } 2153 | 2154 | /* Round val up to the next power of two (bit-smearing trick) */ static unsigned long 2155 | round_up_pow2(int64 val) 2156 | { 2157 | val--; 2158 | val |= val >> 1; 2159 | val |= val >> 2; 2160 | val |= val >> 4; 2161 | val |= val >> 8; 2162 | val |= val >> 16; val |= val >> 32; /* val is 64 bits wide */ 2163 | val++; 2164 | return val; 2165 | } 2166 | 2167 | /* Estimate the bytes lost to per-attribute alignment padding in a tuple */ static int 2168 | get_alignment_overhead(TupleDesc tupdesc) 2169 | { 2170 | int align_overhead = 0; 2171 | int off = 0; 2172 | int i; 2173 | 2174 | for (i = 0; i < tupdesc->natts; i++) 2175 | { 2176 | Form_pg_attribute attr = TupleDescAttr(tupdesc, i); 2177 | int newoff; 2178 | 2179 | /* FIXME use better heuristic for varlena :) */ 2180 | if (attr->attlen > 0) 2181 | off += attr->attlen; 2182 | else 2183 | off += 1; 2184 | 2185 | newoff = att_align_nominal(off, attr->attalign); 2186 | if (newoff != off) 2187 | { 2188 | align_overhead += (newoff - off); 2189 | off = newoff; 2190 | } 2191 | } 2192 | 2193 | return align_overhead; 2194 | } 2195 | -------------------------------------------------------------------------------- /pg_sortstats.control: -------------------------------------------------------------------------------- 1 | comment = 'An extension collecting statistics about sorts' 2 | default_version = '0.0.1' 3 | module_pathname = '$libdir/pg_sortstats' 4 | relocatable = true 5 | -------------------------------------------------------------------------------- /pg_sortstats_import.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * pg_sortstats_import.c 4 | * Functions imported from PostgreSQL core, needed by pg_sortstats. 5 | * 6 | *------------------------------------------------------------------------- 7 | */ 8 | 9 | #include "postgres.h" 10 | 11 | #if PG_VERSION_NUM >= 110000 12 | #include "catalog/pg_collation_d.h" 13 | #else 14 | #include "catalog/pg_collation.h" 15 | #endif 16 | #include "nodes/execnodes.h" 17 | #include "nodes/nodeFuncs.h" 18 | #include "utils/builtins.h" 19 | #include "utils/lsyscache.h" 20 | #include "utils/tuplesort.h" 21 | #include "utils/typcache.h" 22 | 23 | #include "include/pg_sortstats_import.h" 24 | 25 | /* 26 | * Imported from ExplainPreScanNode 27 | */ 28 | bool 29 | pgsrt_PreScanNode(PlanState *planstate, Bitmapset **rels_used) 30 | { 31 | Plan *plan = planstate->plan; 32 | 33 | switch (nodeTag(plan)) 34 | { 35 | case T_SeqScan: 36 | #if PG_VERSION_NUM >= 90500 37 | case T_SampleScan: 38 | #endif 39 | case T_IndexScan: 40 | case T_IndexOnlyScan: 41 | case T_BitmapHeapScan: 42 | case T_TidScan: 43 | case T_SubqueryScan: 44 | case T_FunctionScan: 45 | #if PG_VERSION_NUM >= 100000 46 | case T_TableFuncScan: 47 | #endif 48 | case T_ValuesScan: 49 | case T_CteScan: 50 | #if PG_VERSION_NUM >= 100000 51 | case T_NamedTuplestoreScan: 52 | #endif 53 | case T_WorkTableScan: 54 | *rels_used = bms_add_member(*rels_used, 55 | ((Scan *) plan)->scanrelid); 56 | break; 57 | case T_ForeignScan: 58 | #if PG_VERSION_NUM >= 90500 59 | *rels_used = bms_add_members(*rels_used, 60 | ((ForeignScan *) plan)->fs_relids); 61 | #else 62 | *rels_used = bms_add_member(*rels_used, 63 | ((Scan *) plan)->scanrelid); 64 | #endif 65 | break; 66 | #if PG_VERSION_NUM >= 90500 67 | case T_CustomScan: 68 | *rels_used = bms_add_members(*rels_used, 69 | ((CustomScan *) plan)->custom_relids); 70 | break; 71 | #endif 72 | case T_ModifyTable: 73 | #if PG_VERSION_NUM >= 90500 74 | *rels_used = bms_add_member(*rels_used, 75 | ((ModifyTable *)
plan)->nominalRelation); 76 | if (((ModifyTable *) plan)->exclRelRTI) 77 | *rels_used = bms_add_member(*rels_used, 78 | ((ModifyTable *) plan)->exclRelRTI); 79 | #else 80 | /* cf ExplainModifyTarget */ 81 | *rels_used = bms_add_member(*rels_used, 82 | linitial_int(((ModifyTable *) plan)->resultRelations)); 83 | #endif 84 | break; 85 | default: 86 | break; 87 | } 88 | 89 | return planstate_tree_walker(planstate, pgsrt_PreScanNode, rels_used); 90 | } 91 | 92 | /* Imported from show_sortorder_options */ 93 | void 94 | pgsrt_show_sortorder_options(StringInfo buf, Node *sortexpr, 95 | Oid sortOperator, Oid collation, bool nullsFirst) 96 | { 97 | Oid sortcoltype = exprType(sortexpr); 98 | bool reverse = false; 99 | TypeCacheEntry *typentry; 100 | 101 | typentry = lookup_type_cache(sortcoltype, 102 | TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); 103 | 104 | /* 105 | * Print COLLATE if it's not default. There are some cases where this is 106 | * redundant, eg if expression is a column whose declared collation is 107 | * that collation, but it's hard to distinguish that here. 108 | */ 109 | if (OidIsValid(collation) && collation != DEFAULT_COLLATION_OID) 110 | { 111 | char *collname = get_collation_name(collation); 112 | 113 | if (collname == NULL) 114 | elog(ERROR, "cache lookup failed for collation %u", collation); 115 | appendStringInfo(buf, " COLLATE %s", quote_identifier(collname)); 116 | } 117 | 118 | /* Print direction if not ASC, or USING if non-default sort operator */ 119 | if (sortOperator == typentry->gt_opr) 120 | { 121 | appendStringInfoString(buf, " DESC"); 122 | reverse = true; 123 | } 124 | else if (sortOperator != typentry->lt_opr) 125 | { 126 | char *opname = get_opname(sortOperator); 127 | 128 | if (opname == NULL) 129 | elog(ERROR, "cache lookup failed for operator %u", sortOperator); 130 | appendStringInfo(buf, " USING %s", opname); 131 | /* Determine whether operator would be considered ASC or DESC */ 132 | (void) get_equality_op_for_ordering_op(sortOperator, &reverse); 133 | } 134 | 135 | /* Add NULLS FIRST/LAST only if it wouldn't be default */ 136 | if (nullsFirst && !reverse) 137 | { 138 | appendStringInfoString(buf, " NULLS FIRST"); 139 | } 140 | else if (!nullsFirst && reverse) 141 | { 142 | appendStringInfoString(buf, " NULLS LAST"); 143 | } 144 | } 145 | 146 | #if PG_VERSION_NUM < 90600 147 | 148 | #include "nodes/nodes.h" 149 | #include "nodes/pg_list.h" 150 | #include "utils/logtape.h" 151 | #include "utils/tuplesort.h" 152 | 153 | #include "include/pg_sortstats_import.h" 154 | 155 | bool 156 | planstate_tree_walker(PlanState *planstate, 157 | bool (*walker) (), 158 | void *context) 159 | { 160 | Plan *plan = planstate->plan; 161 | #if PG_VERSION_NUM >= 90500 162 | ListCell *lc; 163 | #endif 164 | 165 | /* initPlan-s */ 166 | if (planstate_walk_subplans(planstate->initPlan, walker, context)) 167 | return true; 168 | 169 | /* lefttree */ 170 | if (outerPlanState(planstate)) 171 | { 172 | if (walker(outerPlanState(planstate), context)) 173 | return true; 174 | } 175 | 176 | /* righttree */ 177 | if (innerPlanState(planstate)) 178 | { 179 | if (walker(innerPlanState(planstate), context)) 180 | return true; 181 | } 182 | 183 | /* special child plans */ 184 | switch (nodeTag(plan)) 185 | { 186 | case T_ModifyTable: 187 | if (planstate_walk_members(((ModifyTableState *) planstate)->mt_plans, 188 | ((ModifyTableState *) planstate)->mt_nplans, 189 | walker, context)) 190 | return true; 191 | break; 192 | case T_Append: 193 | if (planstate_walk_members(((AppendState *) 
planstate)->appendplans, 194 | ((AppendState *) planstate)->as_nplans, 195 | walker, context)) 196 | return true; 197 | break; 198 | case T_MergeAppend: 199 | if (planstate_walk_members(((MergeAppendState *) planstate)->mergeplans, 200 | ((MergeAppendState *) planstate)->ms_nplans, 201 | walker, context)) 202 | return true; 203 | break; 204 | case T_BitmapAnd: 205 | if (planstate_walk_members(((BitmapAndState *) planstate)->bitmapplans, 206 | ((BitmapAndState *) planstate)->nplans, 207 | walker, context)) 208 | return true; 209 | break; 210 | case T_BitmapOr: 211 | if (planstate_walk_members(((BitmapOrState *) planstate)->bitmapplans, 212 | ((BitmapOrState *) planstate)->nplans, 213 | walker, context)) 214 | return true; 215 | break; 216 | case T_SubqueryScan: 217 | if (walker(((SubqueryScanState *) planstate)->subplan, context)) 218 | return true; 219 | break; 220 | #if PG_VERSION_NUM >= 90500 221 | case T_CustomScan: 222 | foreach(lc, ((CustomScanState *) planstate)->custom_ps) 223 | { 224 | if (walker((PlanState *) lfirst(lc), context)) 225 | return true; 226 | } 227 | break; 228 | #endif 229 | default: 230 | break; 231 | } 232 | 233 | /* subPlan-s */ 234 | if (planstate_walk_subplans(planstate->subPlan, walker, context)) 235 | return true; 236 | 237 | return false; 238 | } 239 | 240 | bool 241 | planstate_walk_subplans(List *plans, 242 | bool (*walker) (), 243 | void *context) 244 | { 245 | ListCell *lc; 246 | 247 | foreach(lc, plans) 248 | { 249 | SubPlanState *sps = lfirst_node(SubPlanState, lc); 250 | 251 | if (walker(sps->planstate, context)) 252 | return true; 253 | } 254 | 255 | return false; 256 | } 257 | 258 | bool 259 | planstate_walk_members(PlanState **planstates, int nplans, 260 | bool (*walker) (), void *context) 261 | { 262 | int j; 263 | 264 | for (j = 0; j < nplans; j++) 265 | { 266 | if (walker(planstates[j], context)) 267 | return true; 268 | } 269 | 270 | return false; 271 | } 272 | #endif 273 | -------------------------------------------------------------------------------- /sql/pg_sortstats.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | 3 | SELECT pg_sortstats_reset(); 4 | 5 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 6 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 7 | VACUUM ANALYZE sorts; 8 | 9 | SET work_mem = '64kB'; 10 | WITH src AS ( 11 | SELECT * FROM sorts ORDER BY val, id DESC 12 | ) 13 | SELECT * FROM src LIMIT 1; 14 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 15 | 16 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 17 | work_mems < (12 * 1024) AS "exp_less_12MB", 18 | topn_sorts, quicksorts, external_sorts, external_merges, 19 | nb_tapes > 2 AS multiple_tapes, 20 | space_disk > 1024 AS "disk_more_1MB", 21 | space_memory > 1024 AS "mem_more_1MB", 22 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 23 | FROM pg_sortstats(true) ORDER BY nb_keys; 24 | 25 | SELECT pg_sortstats_reset(); 26 | 27 | SET work_mem = '12MB'; 28 | WITH src AS ( 29 | SELECT * FROM sorts ORDER BY val, id DESC 30 | ) 31 | SELECT * FROM src LIMIT 1; 32 | 33 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 34 | work_mems < (12 * 1024) AS "exp_less_12MB", 35 | topn_sorts, quicksorts, external_sorts, external_merges, 36 | nb_tapes > 2 AS multiple_tapes, 37 | space_disk > 1024 AS "disk_more_1MB", 38 | space_memory > 1024 AS "mem_more_1MB", 39 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 40 | FROM pg_sortstats(true) ORDER BY nb_keys; 41 | 42 | 
SELECT pg_sortstats_reset(); 43 | -------------------------------------------------------------------------------- /sql/pg_sortstats_12.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pg_sortstats; 2 | 3 | SELECT pg_sortstats_reset(); 4 | 5 | CREATE TABLE sorts (id integer, val text COLLATE "C"); 6 | INSERT INTO sorts SELECT i, 'line ' || i FROM generate_series(1, 100000) i; 7 | VACUUM ANALYZE sorts; 8 | 9 | SET work_mem = '64kB'; 10 | WITH src AS MATERIALIZED ( 11 | SELECT * FROM sorts ORDER BY val, id DESC 12 | ) 13 | SELECT * FROM src LIMIT 1; 14 | SELECT * FROM sorts ORDER BY id DESC LIMIT 1; 15 | 16 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 17 | work_mems < (12 * 1024) AS "exp_less_12MB", 18 | topn_sorts, quicksorts, external_sorts, external_merges, 19 | nb_tapes > 2 AS multiple_tapes, 20 | space_disk > 1024 AS "disk_more_1MB", 21 | space_memory > 1024 AS "mem_more_1MB", 22 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 23 | FROM pg_sortstats(true) ORDER BY nb_keys; 24 | 25 | SELECT pg_sortstats_reset(); 26 | 27 | SET work_mem = '12MB'; 28 | WITH src AS MATERIALIZED ( 29 | SELECT * FROM sorts ORDER BY val, id DESC 30 | ) 31 | SELECT * FROM src LIMIT 1; 32 | 33 | SELECT nb_keys, sort_keys, lines, lines_to_sort, 34 | work_mems < (12 * 1024) AS "exp_less_12MB", 35 | topn_sorts, quicksorts, external_sorts, external_merges, 36 | nb_tapes > 2 AS multiple_tapes, 37 | space_disk > 1024 AS "disk_more_1MB", 38 | space_memory > 1024 AS "mem_more_1MB", 39 | non_parallels, COALESCE(nb_workers, 0) AS nb_workers 40 | FROM pg_sortstats(true) ORDER BY nb_keys; 41 | 42 | SELECT pg_sortstats_reset(); 43 | --------------------------------------------------------------------------------