├── .gitignore ├── LICENSE.md ├── README.md ├── generate-btree-graph.sh ├── generate-demo-graphs.sh ├── graphviz-query.sql ├── leaf-page-split.dot ├── lehman-yao-orthodox-btree-suffix-short.dot ├── lehman-yao-orthodox-btree-suffix.dot ├── lehman-yao-orthodox-btree.dot ├── optimized-leaf-page-split-1.dot ├── optimized-leaf-page-split-2.dot ├── optimized-leaf-page-split-3.dot ├── pg_query_internals.sql ├── postgres-real-btree.dot ├── unoptimized-leaf-page-split-1.dot ├── unoptimized-leaf-page-split-2.dot └── unoptimized-leaf-page-split-3.dot /.gitignore: -------------------------------------------------------------------------------- 1 | *.svg 2 | *.pdf 3 | *.swp 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Peter Geoghegan 2 | 3 | Permission to use, copy, modify, and distribute this software and its 4 | documentation for any purpose, without fee, and without a written agreement 5 | is hereby granted, provided that the above copyright notice and this 6 | paragraph and the following two paragraphs appear in all copies. 7 | 8 | IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR 9 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING 10 | LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS 11 | DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE 12 | POSSIBILITY OF SUCH DAMAGE. 13 | 14 | THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 15 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 16 | AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 17 | ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO 18 | PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_query_internals: Query PostgreSQL internals using SQL 2 | 3 | Current version: 0.2 4 | 5 | Author: Peter Geoghegan [pg@bowt.ie](mailto:pg@bowt.ie) 6 | 7 | License: PostgreSQL license 8 | 9 | Minimum supported version: PostgreSQL 9.5 (most things work on earlier versions, though) 10 | 11 | Requires: contrib/pageinspect, 12 | contrib/pg_buffercache 13 | 14 | ## Overview 15 | 16 | `pg_query_internals` is a collection of SQL queries that are useful for 17 | inspecting the state of a PostgreSQL database. SQL queries for querying the 18 | contents of the buffer cache are provided, as well as for querying the 19 | structure of a given B-Tree index, and how the index is cached. 20 | 21 | These queries are published for educational purposes only; they are not 22 | designed for production use. These queries may have some hard-coded 23 | assumptions about the underlying data being queried, although that is generally 24 | directly noted in comments. While the queries may be useful as a starting 25 | point for certain types of low-level investigations, they are generally not 26 | usable as instrumentation to find issues in production systems. 27 | 28 | In short, the SQL queries are written for those with a specific interest in 29 | PostgreSQL internals, and in particular the internals of the B-Tree access 30 | method and PostgreSQL buffer manager. 31 | 32 | ### Usage 33 | 34 | The SQL queries within `pg_query_internals.sql` are intended to be run on an 35 | ad-hoc basis. The queries are deliberately not packaged as functions within an 36 | extension. 37 | 38 | ### Graphviz 39 | 40 | The SQL query within `graphviz-query.sql` can be used to generate a graph of a 41 | PostgreSQL B-Tree using graphviz. 
The bash script `generate-btree-graph.sh` 42 | gives an example of how this can be coordinated and managed. 43 | 44 | This is based on a much earlier approach by Heikki Linnakangas. 45 | 46 | ### Other resources 47 | 48 | For those that wish to learn more about PostgreSQL B-Tree indexes, the 49 | following resources are suggested: 50 | 51 | * The blogpost "Discovering the Computer Science Behind Postgres Indexes", by 52 | Pat Shaughnessy: 53 | 54 | http://patshaughnessy.net/2014/11/11/discovering-the-computer-science-behind-postgres-indexes 55 | 56 | Good high-level overview. 57 | 58 | * The PostgreSQL nbtree README. 59 | 60 | The authoritative source of information on PostgreSQL B-Tree indexes. 61 | 62 | * The paper "A symmetric concurrent B-tree algorithm", from Lanin & Shasha. 63 | 64 | This is the paper that the PostgreSQL page deletion (and page recycling) 65 | algorithm is based on. Although this isn't the original Lehman & Yao B-Tree 66 | paper that first described the optimistic technique used to avoid "crabbing" of 67 | buffer locks (these locks are sometimes called "latches" in the literature), 68 | it is the more useful resource in my opinion. Note that the algorithm is 69 | implemented in a slightly different manner in PostgreSQL, though the 70 | differences are directly noted in the nbtree README. 71 | 72 | Lanin & Shasha's paper is of far more practical use to implementers, who may 73 | consider skipping the Lehman & Yao paper entirely. For example, it 74 | specifically takes issue with a strange tacit assumption made by the Lehman & 75 | Yao paper: the assumption that page reads and writes are always atomic. This 76 | assumption justifies the Lehman & Yao contention that their algorithm requires 77 | *no* locks during index scans. 
This claim is rather a lot stronger than the 78 | claim that only one lock is required at a time during a descent of the B-Tree, 79 | which is all that PostgreSQL manages, and all that Lanin & Shasha see fit to 80 | claim for their enhanced algorithm. 81 | 82 | The Lanin & Shasha paper actually describes a practical deletion algorithm, 83 | rather than assuming that in general page deletion can happen during a period 84 | in which the system is offline, as Lehman & Yao rather fancifully suggest. 85 | Since all practical requirements are met at once, the Lanin & Shasha design is 86 | a truly comprehensive guide to implementing a real-world, high concurrency 87 | B-Tree structure. 88 | 89 | * "The Internals of PostgreSQL" website: 90 | 91 | http://www.interdb.jp/pg/index.html 92 | 93 | This website is a good general starting point for learning about PostgreSQL 94 | internals more generally. 95 | -------------------------------------------------------------------------------- /generate-btree-graph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Generate an svg image of the B-Tree index specified within 4 | # graphviz-query.sql. 5 | # 6 | # Assumes that psql and graphviz dot are in $PATH 7 | # 8 | # Stops at the first failing step, so that dot never renders a partial or 9 | # empty dot file left behind by a failed query. 10 | set -e 11 | 12 | echo "generating dot file..." 13 | # ON_ERROR_STOP makes psql exit non-zero when the query fails 14 | time psql -v ON_ERROR_STOP=1 -f graphviz-query.sql --no-psqlrc --no-align -t > /tmp/query-btree.dot 15 | echo "generating svg file..." 16 | time dot -T svg /tmp/query-btree.dot -o /tmp/query-btree.svg 17 | -------------------------------------------------------------------------------- /generate-demo-graphs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Render the demo .dot files to PDF. Old PDFs are removed first; -f keeps rm quiet when there are none yet. 3 | rm -f -- *.pdf 4 | NAME="lehman-yao-orthodox-btree" 5 | echo "Generating __$NAME.pdf..." 6 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 7 | NAME="lehman-yao-orthodox-btree-suffix" 8 | echo "Generating __$NAME.pdf..." 
9 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 10 | NAME="lehman-yao-orthodox-btree-suffix-short" 11 | echo "Generating __$NAME.pdf..." 12 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 13 | NAME="postgres-real-btree" 14 | echo "Generating __$NAME.pdf..." 15 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 16 | 17 | # Initial leaf page, about to be split 18 | NAME="leaf-page-split" 19 | echo "Generating __$NAME.pdf..." 20 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 21 | 22 | # Unoptimized: 23 | NAME="unoptimized-leaf-page-split-1" 24 | echo "Generating __$NAME.pdf..." 25 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 26 | NAME="unoptimized-leaf-page-split-2" 27 | echo "Generating __$NAME.pdf..." 28 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 29 | NAME="unoptimized-leaf-page-split-3" 30 | echo "Generating __$NAME.pdf..." 31 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 32 | 33 | # Optimized: 34 | NAME="optimized-leaf-page-split-1" 35 | echo "Generating __$NAME.pdf..." 36 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 37 | NAME="optimized-leaf-page-split-2" 38 | echo "Generating __$NAME.pdf..." 39 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 40 | NAME="optimized-leaf-page-split-3" 41 | echo "Generating __$NAME.pdf..." 
42 | dot -T pdf "$NAME.dot" -o "__$NAME.pdf" 43 | -------------------------------------------------------------------------------- /graphviz-query.sql: -------------------------------------------------------------------------------- 1 | WITH RECURSIVE index_details AS ( 2 | SELECT 3 | 'pgbench_accounts_pkey'::text idx 4 | ), 5 | size_in_pages_index AS ( 6 | SELECT 7 | (pg_relation_size(idx::regclass) / current_setting('block_size')::bigint)::int4 size_pages /* index size in pages, using the server's actual block size rather than assuming 8kB */ 8 | FROM 9 | index_details 10 | ), 11 | page_stats AS ( 12 | SELECT 13 | index_details.*, 14 | stats.* 15 | FROM 16 | index_details, 17 | size_in_pages_index, 18 | lateral (SELECT i FROM generate_series(1, size_pages - 1) i) series, 19 | lateral (SELECT * FROM bt_page_stats(idx, i)) stats 20 | ), 21 | meta_stats AS ( 22 | SELECT 23 | * 24 | FROM 25 | index_details s, 26 | lateral (SELECT * FROM bt_metap(s.idx)) meta 27 | ), 28 | pages_raw AS ( 29 | SELECT 30 | * 31 | FROM 32 | page_stats 33 | ORDER BY 34 | btpo DESC 35 | ), 36 | /* XXX: Note ordering dependency within this CTE */ 37 | pages_walk(item, prior, llive_items, blk, level) AS ( 38 | SELECT 39 | 1, 40 | 0, 41 | case when btpo_next = 0 then live_items else live_items - 1 end, 42 | blkno, 43 | btpo 44 | FROM 45 | pages_raw 46 | WHERE 47 | btpo_prev = 0 48 | AND btpo = (SELECT level FROM meta_stats) 49 | UNION 50 | SELECT 51 | CASE WHEN level = btpo THEN w.item + 1 ELSE 1 END, 52 | CASE WHEN level != btpo then 0 else prior + llive_items END, 53 | case when btpo_next = 0 then live_items else live_items - 1 end, 54 | blkno, 55 | btpo 56 | FROM 57 | pages_raw i, 58 | pages_walk w 59 | WHERE 60 | i.btpo_prev = w.blk OR (btpo_prev = 0 AND btpo = w.level - 1) 61 | ), 62 | level_det as ( 63 | SELECT 64 | format($fff$ 65 | node%1$s_%2$s[ tooltip = "Block %6$s values (high key is positioned at offset 1): %14$s" label=< 66 | 67 | -- First item on page 68 | 69 | -- Second item on page 70 | 71 | -- Highkey 72 | 73 | -- Gray box with details 74 | 75 | 76 |
%3$s%4$s%5$sLevel %1$s logical page %2$s

Block number: %6$s
live/dead items: %7$s/%8$s
avg tuple width: %9$s
distinct keys (no highkey): %10$s
distinct block pointers: %11$s
free size: %12$s
77 | > 78 | ]; 79 | $fff$, 80 | btpo, item, 81 | /* First item */ 82 | CASE WHEN btpo != 0 THEN '-∞' when btpo_next = 0 then int4_from_page_data(all_items[1])::text else int4_from_page_data(all_items[2])::text end, 83 | /* Second item */ 84 | CASE WHEN btpo_next = 0 then int4_from_page_data(all_items[2])::text else int4_from_page_data(all_items[3])::text end, 85 | /* High key */ 86 | coalesce(CASE WHEN btpo_next != 0 THEN int4_from_page_data(all_items[1])::text END, '+∞'), 87 | /* Page details */ 88 | blkno::text, live_items, dead_items, avg_item_size, distinct_real_item_keys, distinct_block_pointers, free_size, 89 | /* Appropriate HTML color for first and second items */ 90 | case when btpo != 0 then '#F1C40F'::text else '#2ECC71'::text end, 91 | /* 92 | * Tooltip values, for each page. Doesn't seem worth using 93 | * int4_from_page_data() here, as that's very slow. 94 | */ 95 | array_to_string(all_items, '; ') 96 | ) || 97 | -- Use logical block numbers to build downlinks to children 98 | -- 99 | -- XXX: This is probably broken by page deletion, where there is no downlink in 100 | -- parent but child still has sibling pointers. It's probably possible to fix 101 | -- this by skipping deleted pages. 
102 | case when btpo != 0 then 103 | (select string_agg(format('"node%s_%s" -> "node%s_%s":f0 ', btpo, item, btpo - 1, gg), E'\n') 104 | from 105 | generate_series(prior +1, prior + distinct_block_pointers) gg) 106 | else 107 | '' 108 | end || 109 | -- sibling pointer: 110 | case when btpo_next != 0 then 111 | (select format(E'\n\n"node%1$s_%2$s" -> "node%1$s_%3$s"[constraint=false,color=gray,style=dashed,arrowsize=0.5]', btpo, item, item + 1)) 112 | else 113 | '' 114 | end 115 | as all_level_details, 116 | 117 | btpo, item 118 | FROM 119 | pages_walk w, 120 | pages_raw i, 121 | lateral ( 122 | SELECT 123 | COUNT(DISTINCT (CASE WHEN btpo_next = 0 OR itemoffset > 1 THEN (DATA COLLATE "C") END)) AS distinct_real_item_keys, 124 | COUNT(DISTINCT (CASE WHEN btpo_next = 0 OR itemoffset > 1 THEN (ctid::text::point)[0]::BIGINT END)) AS distinct_block_pointers, 125 | /* Note: displaying all values as int4 takes rather a long time */ 126 | array_agg(nullif(data, '')) AS all_items 127 | FROM bt_page_items(idx, blkno) 128 | ) items 129 | WHERE w.blk = i.blkno 130 | /* Uncomment to avoid showing leaf level (faster): */ 131 | /* and level > 0*/ 132 | ORDER BY btpo DESC, item 133 | ) 134 | select 135 | $digraph$ 136 | digraph nbtree { 137 | graph [fontname = "monospace"]; 138 | node [shape = none,height=.1,fontname = "monospace",fontsize=6]; 139 | edge [penwidth=0.5] 140 | $digraph$ 141 | union all 142 | select * from (select all_level_details from level_det order by btpo DESC, item) a 143 | union all 144 | select '}'; 145 | -------------------------------------------------------------------------------- /leaf-page-split.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = 
none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_1[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
1,11,21,32,1
24 | > 25 | ]; 26 | //Force alignment from root to internal to leaf levels: 27 | //edge[style=invis]; 28 | //"rootnode":d1 -> "leafnode_2":t2 29 | } 30 | -------------------------------------------------------------------------------- /lehman-yao-orthodox-btree-suffix-short.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=8]; 14 | // Level 1 (root level) 15 | // Downlinks + highkey: 16 | rootnode[ label=< 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
-∞DGNPTY+∞
35 | > 36 | ]; 37 | // Downlink arrows to children: 38 | "rootnode":d0 -> "leafnode_1" 39 | "rootnode":d1 -> "leafnode_2" 40 | "rootnode":d2 -> "leafnode_3" 41 | "rootnode":d3 -> "leafnode_4" 42 | // Still need this, to make arrows not overlap too much: 43 | "rootnode":d4 -> "leafnode_5":t0 44 | "rootnode":d5 -> "leafnode_6":t0 45 | "rootnode":d6 -> "leafnode_7":t0 46 | 47 | // sibling pointer: 48 | // (None) 49 | 50 | 51 | // Level 0 (leaf level) 52 | leafnode_1[ label=< 53 | 54 | 55 | 56 | 57 | 58 |
AlfaBravoD
59 | > 60 | ]; 61 | // sibling pointer: 62 | "leafnode_1" -> "leafnode_2"[constraint=false,color=black,style=dashed,arrowsize=0.5] 63 | leafnode_2[ label=< 64 | 65 | 66 | 67 | 68 | 69 | 70 |
DeltaEchoFoxtrotG
71 | > 72 | ]; 73 | // sibling pointer: 74 | "leafnode_2" -> "leafnode_3"[constraint=false,color=black,style=dashed,arrowsize=0.5] 75 | leafnode_3[ label=< 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 |
JulietKiloLimaMikeN
84 | > 85 | ]; 86 | // sibling pointer: 87 | "leafnode_3" -> "leafnode_4"[constraint=false,color=black,style=dashed,arrowsize=0.5] 88 | leafnode_4[ label=< 89 | 90 | 91 | 92 | 93 | 94 |
NovemberOscarP
95 | > 96 | ]; 97 | // sibling pointer (not a cousin anymore): 98 | "leafnode_4" -> "leafnode_5"[constraint=false,color=black,style=dashed,arrowsize=0.5] 99 | leafnode_5[ label=< 100 | 101 | 102 | 103 | 104 | 105 | 106 |
QuebecRomeoSierraT
107 | > 108 | ]; 109 | // sibling pointer: 110 | "leafnode_5" -> "leafnode_6"[constraint=false,color=black,style=dashed,arrowsize=0.5] 111 | leafnode_6[ label=< 112 | 113 | 114 | 115 | 116 | 117 | 118 |
UniformVictorXRayY
119 | > 120 | ]; 121 | // sibling pointer: 122 | "leafnode_6" -> "leafnode_7"[constraint=false,color=black,style=dashed,arrowsize=0.5] 123 | leafnode_7[ label=< 124 | 125 | 126 | 127 | 128 | 129 |
YankeeZulu+∞
130 | > 131 | ]; 132 | // sibling pointer: 133 | // (None) 134 | 135 | //Force alignment from root to internal to leaf levels: 136 | edge[style=invis]; 137 | "rootnode":s4 -> "leafnode_4":hk 138 | } 139 | -------------------------------------------------------------------------------- /lehman-yao-orthodox-btree-suffix.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=8]; 14 | // Level 2 (Root internal level) 15 | rootnode[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
-∞P+∞
24 | > 25 | ]; 26 | // Downlink arrows to children: 27 | "rootnode":d0 -> "leftinternal" 28 | "rootnode":d1 -> "rightinternal" 29 | // sibling pointer: 30 | // (None) 31 | 32 | 33 | // Level 1 (Internal level) 34 | // Downlinks + highkey: 35 | leftinternal[ label=< 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
-∞DGNP
48 | > 49 | ]; 50 | // Downlink arrows to children: 51 | "leftinternal":d0 -> "leftleafnode_1" 52 | "leftinternal":d1 -> "leftleafnode_2" 53 | "leftinternal":d2 -> "leftleafnode_3" 54 | "leftinternal":d3 -> "leftleafnode_4" 55 | // sibling pointer: 56 | "leftinternal" -> "rightinternal"[constraint=false,color=black,style=dashed,arrowsize=0.5] 57 | // Downlinks + highkey: 58 | rightinternal[ label=< 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |
"-∞"TY+∞
69 | > 70 | ]; 71 | // Downlink arrows to children: 72 | "rightinternal":d0 -> "rightleafnode_1":t0 73 | "rightinternal":d1 -> "rightleafnode_2":t0 74 | "rightinternal":d2 -> "rightleafnode_3":t0 75 | // sibling pointer: 76 | // (None) 77 | 78 | 79 | // Level 0 (leaf level) 80 | leftleafnode_1[ label=< 81 | 82 | 83 | 84 | 85 | 86 |
AlfaBravoD
87 | > 88 | ]; 89 | // sibling pointer: 90 | "leftleafnode_1" -> "leftleafnode_2"[constraint=false,color=black,style=dashed,arrowsize=0.5] 91 | leftleafnode_2[ label=< 92 | 93 | 94 | 95 | 96 | 97 | 98 |
DeltaEchoFoxtrotG
99 | > 100 | ]; 101 | // sibling pointer: 102 | "leftleafnode_2" -> "leftleafnode_3"[constraint=false,color=black,style=dashed,arrowsize=0.5] 103 | leftleafnode_3[ label=< 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 |
JulietKiloLimaMikeN
112 | > 113 | ]; 114 | // sibling pointer: 115 | "leftleafnode_3" -> "leftleafnode_4"[constraint=false,color=black,style=dashed,arrowsize=0.5] 116 | leftleafnode_4[ label=< 117 | 118 | 119 | 120 | 121 | 122 |
NovemberOscarP
123 | > 124 | ]; 125 | // sibling/cousin pointer: 126 | "leftleafnode_4" -> "rightleafnode_1"[constraint=false,color=gray,style=dashed,arrowsize=0.5] 127 | rightleafnode_1[ label=< 128 | 129 | 130 | 131 | 132 | 133 | 134 |
QuebecRomeoSierraT
135 | > 136 | ]; 137 | // sibling pointer: 138 | "rightleafnode_1" -> "rightleafnode_2"[constraint=false,color=black,style=dashed,arrowsize=0.5] 139 | rightleafnode_2[ label=< 140 | 141 | 142 | 143 | 144 | 145 | 146 |
UniformVictorXRayY
147 | > 148 | ]; 149 | // sibling pointer: 150 | "rightleafnode_2" -> "rightleafnode_3"[constraint=false,color=black,style=dashed,arrowsize=0.5] 151 | rightleafnode_3[ label=< 152 | 153 | 154 | 155 | 156 | 157 |
YankeeZulu+∞
158 | > 159 | ]; 160 | // sibling pointer: 161 | // (None) 162 | 163 | // Force alignment from root to internal to leaf levels: 164 | edge[style=invis]; 165 | "rootnode":s1 -> "leftleafnode_4":hk 166 | "leftinternal":hk -> "leftleafnode_4":hk 167 | } 168 | -------------------------------------------------------------------------------- /lehman-yao-orthodox-btree.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=8]; 14 | // Level 2 (Root internal level) 15 | rootnode[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
-∞Papa+∞
24 | > 25 | ]; 26 | // Downlink arrows to children: 27 | "rootnode":d0 -> "leftinternal" 28 | "rootnode":d1 -> "rightinternal" 29 | // sibling pointer: 30 | // (None) 31 | 32 | 33 | // Level 1 (Internal level) 34 | // Downlinks + highkey: 35 | leftinternal[ label=< 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
-∞CharlieGolfMikePapa
48 | > 49 | ]; 50 | // Downlink arrows to children: 51 | "leftinternal":d0 -> "leftleafnode_1" 52 | "leftinternal":d1 -> "leftleafnode_2" 53 | "leftinternal":d2 -> "leftleafnode_3" 54 | "leftinternal":d3 -> "leftleafnode_4" 55 | // sibling pointer: 56 | "leftinternal" -> "rightinternal"[constraint=false,color=black,style=dashed,arrowsize=0.5] 57 | // Downlinks + highkey: 58 | rightinternal[ label=< 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |
"-∞"TangoX-Ray+∞
69 | > 70 | ]; 71 | // Downlink arrows to children: 72 | "rightinternal":d0 -> "rightleafnode_1":t0 73 | "rightinternal":d1 -> "rightleafnode_2":t0 74 | "rightinternal":d2 -> "rightleafnode_3":t0 75 | // sibling pointer: 76 | // (None) 77 | 78 | 79 | // Level 0 (leaf level) 80 | leftleafnode_1[ label=< 81 | 82 | 83 | 84 | 85 | 86 |
AlfaBravoCharlie
87 | > 88 | ]; 89 | // sibling pointer: 90 | "leftleafnode_1" -> "leftleafnode_2"[constraint=false,color=black,style=dashed,arrowsize=0.5] 91 | leftleafnode_2[ label=< 92 | 93 | 94 | 95 | 96 | 97 | 98 |
DeltaEchoFoxtrotGolf
99 | > 100 | ]; 101 | // sibling pointer: 102 | "leftleafnode_2" -> "leftleafnode_3"[constraint=false,color=black,style=dashed,arrowsize=0.5] 103 | leftleafnode_3[ label=< 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 |
JulietKiloLimaMikeMike
112 | > 113 | ]; 114 | // sibling pointer: 115 | "leftleafnode_3" -> "leftleafnode_4"[constraint=false,color=black,style=dashed,arrowsize=0.5] 116 | leftleafnode_4[ label=< 117 | 118 | 119 | 120 | 121 | 122 |
NovemberOscarPapa
123 | > 124 | ]; 125 | // sibling/cousin pointer: 126 | "leftleafnode_4" -> "rightleafnode_1"[constraint=false,color=gray,style=dashed,arrowsize=0.5] 127 | rightleafnode_1[ label=< 128 | 129 | 130 | 131 | 132 | 133 | 134 |
QuebecRomeoSierraTango
135 | > 136 | ]; 137 | // sibling pointer: 138 | "rightleafnode_1" -> "rightleafnode_2"[constraint=false,color=black,style=dashed,arrowsize=0.5] 139 | rightleafnode_2[ label=< 140 | 141 | 142 | 143 | 144 | 145 | 146 |
UniformVictorXRayXRay
147 | > 148 | ]; 149 | // sibling pointer: 150 | "rightleafnode_2" -> "rightleafnode_3"[constraint=false,color=black,style=dashed,arrowsize=0.5] 151 | rightleafnode_3[ label=< 152 | 153 | 154 | 155 | 156 | 157 |
YankeeZulu+∞
158 | > 159 | ]; 160 | // sibling pointer: 161 | // (None) 162 | 163 | // Force alignment from root to internal to leaf levels: 164 | edge[style=invis]; 165 | "rootnode":s1 -> "leftleafnode_4":hk 166 | "leftinternal":hk -> "leftleafnode_4":hk 167 | } 168 | -------------------------------------------------------------------------------- /optimized-leaf-page-split-1.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_2[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 |
1,11,21,31,4
23 | > 24 | ]; 25 | edge[style=invis]; 26 | "leafnode_2" -> "leafnode_3" 27 | leafnode_3[ label=< 28 | 29 | 30 | 31 | 32 | 33 | 34 |
1,51,6
35 | > 36 | ]; 37 | edge[style=invis]; 38 | "leafnode_3" -> "leafnode_4" 39 | leafnode_4[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 |
2,1
47 | > 48 | ]; 49 | //Force alignment from root to internal to leaf levels: 50 | //edge[style=invis]; 51 | //"rootnode":d1 -> "leafnode_2":t2 52 | } 53 | -------------------------------------------------------------------------------- /optimized-leaf-page-split-2.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_2[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 |
1,11,21,31,4
23 | > 24 | ]; 25 | edge[style=invis]; 26 | "leafnode_2" -> "leafnode_3" 27 | leafnode_3[ label=< 28 | 29 | 30 | 31 | 32 | 33 | 34 |
1,51,61,71,8
35 | > 36 | ]; 37 | edge[style=invis]; 38 | "leafnode_3" -> "leafnode_4" 39 | leafnode_4[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 |
1,9
47 | > 48 | ]; 49 | edge[style=invis]; 50 | "leafnode_4" -> "leafnode_5" 51 | leafnode_5[ label=< 52 | 53 | 54 | 55 | 56 | 57 | 58 |
2,1
59 | > 60 | ]; 61 | //Force alignment from root to internal to leaf levels: 62 | //edge[style=invis]; 63 | //"rootnode":d1 -> "leafnode_2":t2 64 | } 65 | -------------------------------------------------------------------------------- /optimized-leaf-page-split-3.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_2[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 |
1,11,21,31,4
23 | > 24 | ]; 25 | edge[style=invis]; 26 | "leafnode_2" -> "leafnode_3" 27 | leafnode_3[ label=< 28 | 29 | 30 | 31 | 32 | 33 | 34 |
1,51,61,71,8
35 | > 36 | ]; 37 | edge[style=invis]; 38 | "leafnode_3" -> "leafnode_4" 39 | leafnode_4[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 |
1,91,101,111,12
47 | > 48 | ]; 49 | edge[style=invis]; 50 | "leafnode_4" -> "leafnode_5" 51 | leafnode_5[ label=< 52 | 53 | 54 | 55 | 56 | 57 | 58 |
2,1
59 | > 60 | ]; 61 | //Force alignment from root to internal to leaf levels: 62 | //edge[style=invis]; 63 | //"rootnode":d1 -> "leafnode_2":t2 64 | } 65 | -------------------------------------------------------------------------------- /pg_query_internals.sql: -------------------------------------------------------------------------------- 1 | ------------------ 2 | -- Dependencies -- 3 | ------------------ 4 | create extension if not exists pageinspect; /* "if not exists" keeps the script re-runnable */ 5 | create extension if not exists pg_buffercache; 6 | 7 | -------------------------- 8 | -- B-Tree balance stats -- 9 | -------------------------- 10 | -- 11 | -- Among B-Tree indexes, what proportion are leaf pages, internal pages and 12 | -- root pages? Note that a single non-meta page B-Tree, which has only a root 13 | -- page yet to undergo a root page split counts as only having a single leaf 14 | -- page. 15 | with tots as ( 16 | SELECT 17 | count(*) c, 18 | avg(live_items) avg_live_items, 19 | avg(dead_items) avg_dead_items, 20 | avg(avg_item_size) avg_item_size, 21 | u.type, 22 | r.oid 23 | from 24 | (select 25 | c.oid, 26 | -- Don't count meta-page, or trust pg_class.relpages: 27 | generate_series(1, (select (pg_relation_size(c.oid) / current_setting('block_size')::bigint)::int4) - 1) i /* page count from the server's actual block size, not an assumed 8kB */ 28 | from 29 | pg_index i 30 | join pg_opclass op on i.indclass[0] = op.oid 31 | join pg_am am on op.opcmethod = am.oid 32 | join pg_class c on i.indexrelid = c.oid 33 | where am.amname = 'btree') r, 34 | lateral (select * from bt_page_stats(r.oid::regclass::text, i)) u 35 | group by r.oid, type) 36 | select 37 | ct.relname table_name, 38 | tots.oid::regclass::text index_name, 39 | upper(type) page_type, 40 | c npages, 41 | to_char(avg_live_items, '990.999') as avg_live_items, 42 | to_char(avg_dead_items, '990.999') as avg_dead_items, 43 | to_char(avg_item_size, '990.999') as avg_item_size, 44 | to_char(c/sum(c) over(partition by tots.oid) * 100, '990.999') || ' %' as prop_of_index 45 | from tots 46 | join pg_index i on i.indexrelid = tots.oid 47 | join pg_class ct on ct.oid = 
i.indrelid 48 | order by ct.relnamespace, table_name, index_name, npages, type; 49 | 50 | ---------------------------- 51 | -- B-Tree root page stats -- 52 | ---------------------------- 53 | with index_details as ( 54 | select 55 | 'some_index'::text idx 56 | ), 57 | meta_stats as ( 58 | select 59 | * 60 | from index_details s, 61 | lateral (select * from bt_metap(s.idx)) meta), 62 | root_stats as ( 63 | select 64 | idx, 65 | root, 66 | level, 67 | fastroot, 68 | fastlevel, 69 | stats.* 70 | from 71 | meta_stats s, 72 | lateral (select * from bt_page_stats(idx, root)) stats) 73 | select 74 | root_stats.* 75 | from 76 | root_stats; 77 | 78 | ------------------------------------------------------------ 79 | -- Summarize internal (non-leaf) B-Tree levels, key space -- 80 | ------------------------------------------------------------ 81 | -- 82 | -- Shows internal pages (including root page) in logical order, along with high 83 | -- key data that determines logically ordering. (This is an upper bound on page 84 | -- data). 85 | -- 86 | -- Should work with most types of data. Things like varlena headers will look 87 | -- a bit funny, but this should be easy enough to wade through or tweak. 
88 | with recursive index_details as ( 89 | select 90 | 'some_text_index'::text idx 91 | ), 92 | size_in_pages_index as ( 93 | select 94 | (pg_relation_size(idx::regclass) / current_setting('block_size')::bigint)::int4 size_pages /* index size in pages, using the server's actual block size rather than assuming 8kB */ 95 | from 96 | index_details 97 | ), 98 | page_stats as ( 99 | select 100 | index_details.*, 101 | stats.* 102 | from 103 | index_details, 104 | size_in_pages_index, 105 | lateral (select i from generate_series(1, size_pages - 1) i) series, 106 | lateral (select * from bt_page_stats(idx, i)) stats), 107 | internal_page_stats as ( 108 | select 109 | * 110 | from 111 | page_stats 112 | where 113 | type != 'l'), 114 | meta_stats as ( 115 | select 116 | * 117 | from 118 | index_details s, 119 | lateral (select * from bt_metap(s.idx)) meta), 120 | internal_items as ( 121 | select 122 | * 123 | from 124 | internal_page_stats 125 | order by 126 | btpo desc), 127 | -- XXX: Note ordering dependency within this CTE, on internal_items 128 | ordered_internal_items(item, blk, level) as ( 129 | select 130 | 1, 131 | blkno, 132 | btpo 133 | from 134 | internal_items 135 | where 136 | btpo_prev = 0 137 | and btpo = (select level from meta_stats) 138 | union 139 | select 140 | case when level = btpo then o.item + 1 else 1 end, 141 | blkno, 142 | btpo 143 | from 144 | internal_items i, 145 | ordered_internal_items o 146 | where 147 | i.btpo_prev = o.blk or (btpo_prev = 0 and btpo = o.level - 1) 148 | ) 149 | select 150 | idx, 151 | btpo as level, 152 | item as l_item, 153 | blkno, 154 | btpo_prev, 155 | btpo_next, 156 | btpo_flags, 157 | type, 158 | live_items, 159 | dead_items, 160 | avg_item_size, 161 | page_size, 162 | free_size, 163 | -- Only non-rightmost pages have high key. 164 | -- 165 | -- XXX: To get this to work with a non-text index, fiddle with the expression 166 | -- that extracts from the "data" bt_page_items column. 167 | -- 168 | -- Data is formed starting after varlena header bytes. NUL bytes often appear 169 | -- due to alignment considerations, but aren't valid utf-8. 
We switch NUL 170 | -- bytes with raw bytes that make what looks like a text output function 171 | -- escaped NUL byte (non-NUL special bytes will look like this without our 172 | -- help, but we need to take special measures for NUL) 173 | case when btpo_next != 0 then (select convert_from(decode(regexp_replace(data, ' 0{2}', '5C783030', 'g'), 'hex'), 'utf-8') 174 | from bt_page_items(idx, blkno) where itemoffset = 1) end as highkey 175 | from 176 | ordered_internal_items o 177 | join internal_items i on o.blk = i.blkno 178 | order by btpo desc, item; 179 | 180 | --------------------------------------------------------- 181 | -- Get a quick view of B-Tree buffercache usage counts -- 182 | --------------------------------------------------------- 183 | select 184 | c.relname, 185 | bt.type, 186 | count(*), 187 | rl.*, 188 | avg(case when bf.isdirty then 1.0 else 0.0 end) as avg_is_dirty, 189 | avg(bf.usagecount) as avg_usagecount 190 | from 191 | pg_buffercache bf 192 | join pg_class c on c.oid = pg_filenode_relation(bf.reltablespace, bf.relfilenode) 193 | join pg_index i on i.indexrelid = c.oid 194 | join pg_opclass op on i.indclass[0] = op.oid 195 | join pg_am am on op.opcmethod = am.oid 196 | join lateral bt_page_stats(c.oid::regclass::text, bf.relblocknumber::int4) as bt on true 197 | join lateral pg_relation_size(c.oid) as rl on true 198 | where am.amname = 'btree' and bf.relblocknumber > 0 199 | group by c.relname, bt.type 200 | order by c.relname, bt.type; 201 | 202 | ------------------------------------------------------------------------------- 203 | -- Details caching analysis, for testing clocksweep efficiency in production -- 204 | ------------------------------------------------------------------------------- 205 | 206 | -- In first pass, create materialized copy of pg_buffercache view: 207 | create materialized view bufcacheview as 208 | select * from pg_buffercache; 209 | 210 | -- Then, materialize bt page stats (this needs to happen afterwards, to 
-- not spoil cache):
create materialized view btree_pages as
select
  r.oid as pg_class_oid,
  r.i as relblocknumber,
  upper(u.type) as type,
  live_items,
  dead_items,
  avg_item_size
from
  (select
    c.oid,
    -- Don't rely on potentially stale pg_class.relpages here.  The page count
    -- is derived from the server's block size rather than assuming 8KiB; block
    -- 0 (the metapage) is skipped:
    generate_series(1, (select (pg_relation_size(c.oid) / current_setting('block_size')::int8)::int4) - 1) i
  from
    pg_index i
    join pg_opclass op on i.indclass[0] = op.oid
    join pg_am am on op.opcmethod = am.oid
    join pg_class c on i.indexrelid = c.oid
  where am.amname = 'btree') r,
  lateral (select * from bt_page_stats(r.oid::regclass::text, i)) u
order by r.oid, u.type, r.i;

-- Finally, put it together.  Show how well each class of B-Tree page is
-- cached, with standard buffercache statistics for each, rolled-up:
select
  pg_class_oid::regclass as index_name,
  pg_size_pretty(pg_relation_size(pg_class_oid)) as index_relation_size,
  type,
  count(*) as blocks,
  sum(case when bc.relblocknumber is null then 0 else 1 end) as buffers,
  sum(case when bc.relblocknumber is null then 0.0 else 1.0 end) / count(*) as prop_cached,
  sum(case when bc.isdirty then 1 else 0 end) as are_dirty,
  avg(coalesce(usagecount, 0)) as avg_usagecount,
  avg(pinning_backends) as avg_pinning_backends_in_cache,
  var_pop(bc.relblocknumber) as var_pop_blocks_in_cache
from
  btree_pages btp
  join pg_class c on btp.pg_class_oid = c.oid
  -- pg_relation_filenode() also resolves mapped catalogs, whose
  -- pg_class.relfilenode is zero and would otherwise never match a buffer.
  left join bufcacheview bc on pg_relation_filenode(c.oid) = bc.relfilenode
    and btp.relblocknumber = bc.relblocknumber
    -- Match main-fork buffers only; other forks reuse the same block numbers
    -- and would produce duplicate rows per index page.
    and bc.relforknumber = 0
    -- The snapshot holds buffers of every database; keep this one (plus
    -- shared relations, which are reported with reldatabase 0).
    and bc.reldatabase in (0, (select oid from pg_database where datname = current_database()))
group by rollup(pg_class_oid, type)
order by pg_relation_size(pg_class_oid) desc nulls last, pg_class_oid,
  -- Force "Root, internal, leaf" ordering ("nulls last" avoids breaking
  -- "rollup"):
  case type when 'R' then 0 when 'I' then 2 when 'L' then 3 end nulls last;

-------------------------------------------------
-- Higher-level summary of entire buffer cache --
-------------------------------------------------
--
-- (Note: this isn't exactly comparable to above, since proportions are stuff
-- in cache only here, not all blocks.  Actually, some things above are for
-- buffer cache only, others are all blocks.)
select
  c.oid::regclass,
  pg_size_pretty(pg_relation_size(c.oid)) as index_relation_size,
  c.relkind,
  case relforknumber
    when 0 then
      'Main Fork'
    when 1 then
      'Freespace Map'
    when 2 then
      'Visibility Map'
    when 3 then
      'Init Fork'
  end page_type,
  count(*) as buffers,
  sum(case when bc.isdirty then 1 else 0 end) as are_dirty,
  avg(usagecount) as avg_usagecount,
  avg(pinning_backends) as avg_pinning_backends,
  var_pop(bc.relblocknumber) as var_pop_blocks_in_cache
from
  bufcacheview bc
  -- pg_relation_filenode() also resolves mapped catalogs, whose
  -- pg_class.relfilenode is zero and would otherwise drop out of the join.
  join pg_class c on bc.relfilenode = pg_relation_filenode(c.oid)
where
  -- The snapshot holds buffers of every database; only this database's
  -- buffers (plus shared relations, reldatabase 0) can be resolved through
  -- the local pg_class.
  bc.reldatabase in (0, (select oid from pg_database where datname = current_database()))
group by rollup(c.oid, c.relkind, relforknumber)
order by pg_relation_size(c.oid) desc nulls last, c.oid, relforknumber;

--
-- Working with int4 indexes + bt_page_items() convenience functions.
--
-- Sources:
--
-- https://stackoverflow.com/questions/17208945/whats-the-easiest-way-to-represent-a-bytea-as-a-single-integer-in-postgresql
-- https://stackoverflow.com/questions/11142235/convert-bigint-to-bytea-but-swap-the-byte-order
--

-- Recursive worker for reverse_bytes(): swaps the byte at "index" with its
-- mirror at "length - index" and recurses until "index" reaches "midpoint".
-- "length" is the offset of the last byte (octet_length - 1), not the length.
create or replace function reverse_bytes_iter(bytes bytea, length int, midpoint int, index int)
returns bytea as
$$
  select case when index >= midpoint then bytes else
    reverse_bytes_iter(
      set_byte(
        set_byte(bytes, index, get_byte(bytes, length-index)),
        length-index, get_byte(bytes, index)
      ),
      length, midpoint, index + 1
    )
  end;
$$ language sql immutable;

-- Returns "bytes" with its byte order reversed.
create or replace function reverse_bytes(bytes bytea) returns bytea as
$$
  select reverse_bytes_iter(bytes, octet_length(bytes)-1, octet_length(bytes)/2, 0)
$$
language sql immutable;

-- Interprets the last four bytes of the argument as a big-endian int4.
--
-- All 32 bits are used, so values >= 2^24 and negative values come out right.
-- encode() is used rather than a cast to text so the result does not depend on
-- the bytea_output setting; inputs shorter than four bytes are zero-extended
-- on the left.
create or replace function int4_from_bytea(bytea) returns int4
as $$
  select ('x' || lpad(right(encode($1, 'hex'), 8), 8, '0'))::bit(32)::int4;
$$
language sql immutable;

-- Converts the hex "data" column of bt_page_items() to an int4: the bytes are
-- reversed and the last four of the reversed string are read as the value.
create or replace function int4_from_page_data(text) returns int4
as $$
  select int4_from_bytea(reverse_bytes(decode($1, 'hex')));
$$
language sql immutable;

--
-- Use:
--
-- postgres=# select *, int4_from_page_data(data) from bt_page_items('f', 1) limit 15;
--  itemoffset │    ctid    │ itemlen │ nulls │ vars │          data           │ int4_from_page_data
-- ────────────┼────────────┼─────────┼───────┼──────┼─────────────────────────┼─────────────────────
--           1 │ (17698,69) │      16 │ f     │ f    │ 5c 00 00 00 00 00 00 00 │                  92
--           2 │ (0,1)      │      16 │ f     │ f    │ 01 00 00 00 00 00 00 00 │                   1
--           3 │ (8849,126) │      16 │ f     │ f    │ 01 00 00 00 00 00 00 00 │                   1
--           4 │ (17699,25) │      16 │ f     │ f    │ 01 00 00 00 00 00 00 00 │                   1
--           5 │ (17699,26) │      16 │ f     │ f    │ 01 00 00 00 00 00 00 00 │                   1
--           6 │ (0,2)      │      16 │ f     │ f    │ 02 00 00
--                                                     00 00 00 00 00 │                   2
--           7 │ (8849,125) │      16 │ f     │ f    │ 02 00 00 00 00 00 00 00 │                   2
--           8 │ (17699,23) │      16 │ f     │ f    │ 02 00 00 00 00 00 00 00 │                   2
--           9 │ (17699,24) │      16 │ f     │ f    │ 02 00 00 00 00 00 00 00 │                   2
--          10 │ (0,3)      │      16 │ f     │ f    │ 03 00 00 00 00 00 00 00 │                   3
--          11 │ (8849,124) │      16 │ f     │ f    │ 03 00 00 00 00 00 00 00 │                   3
--          12 │ (17699,21) │      16 │ f     │ f    │ 03 00 00 00 00 00 00 00 │                   3
--          13 │ (17699,22) │      16 │ f     │ f    │ 03 00 00 00 00 00 00 00 │                   3
--          14 │ (0,4)      │      16 │ f     │ f    │ 04 00 00 00 00 00 00 00 │                   4
--          15 │ (8849,123) │      16 │ f     │ f    │ 04 00 00 00 00 00 00 00 │                   4
-- (15 rows)

-- Spurious unfrozen row catcher query.  From
-- https://www.postgresql.org/message-id/20180319181723.ugaf7hfkluqyos5d@alap3.anarazel.de :
--
-- Returns (blockno, lp, xmin) for every heap item in "rel" whose xmin is a
-- normal, not-marked-frozen xid that is older than the relation's
-- relfrozenxid.
create or replace function check_rel(rel regclass, out blockno int8, out lp int2, out xmin xid)
returns setof record
language sql
as $$
    select blockno, lp, t_xmin
    from
        -- every block in the relation; the block count is derived from the
        -- server's block size rather than assuming 8192-byte pages
        generate_series(0, pg_relation_size($1::text) / current_setting('block_size')::int8 - 1) blockno,
        heap_page_items(get_raw_page($1::text, blockno::int4)) -- every item on the page
    where
        t_xmin is not null -- filter out empty items
        and t_xmin != 1 -- filter out bootstrap
        and t_xmin != 2 -- filter out frozen transaction id
        and (t_infomask & ((x'0100' | x'0200')::int)) != ((x'0100' | x'0200')::int) -- filter out frozen rows with xid present
        and age(t_xmin) > age((select relfrozenxid from pg_class where oid = $1)) -- xid cutoff filter
$$;

-- Usage:
--
-- select * from check_rel('pg_authid') limit 100;

-------------------------------------------------------------------------------- /postgres-real-btree.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | //
Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=8]; 14 | // Level 1 (root level) 15 | // Downlinks + highkey: 16 | rootnode[ label=< 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
-∞367, '-∞'733, '-∞'+∞
27 | > 28 | ]; 29 | // Downlink arrows to children: 30 | "rootnode":d0 -> "leafnode_1":t0 31 | "rootnode":d1 -> "leafnode_2":t0 32 | "rootnode":d2 -> "leafnode_3":t0 33 | 34 | // sibling pointer: 35 | // (None) 36 | 37 | 38 | // Level 0 (leaf level) 39 | leafnode_1[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 |
1, '(0,1)'2, '(0,2)'...366, '(5,61)'367, '-∞'
366 non-pivot items & high key
51 | > 52 | ]; 53 | // sibling pointer: 54 | "leafnode_1" -> "leafnode_2"[constraint=false,color=black,style=dashed,arrowsize=0.5] 55 | leafnode_2[ label=< 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 |
367, '(6,1)'368, '(6,2)'...732, '(11,61)'733, '-∞'
366 non-pivot items & high key
67 | > 68 | ]; 69 | // sibling pointer: 70 | "leafnode_2" -> "leafnode_3"[constraint=false,color=black,style=dashed,arrowsize=0.5] 71 | leafnode_3[ label=< 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 |
733, '(12,1)'734, '(12,2)'...+∞
2+ non-pivot items, implicit +∞ high key
82 | > 83 | ]; 84 | 85 | //Force alignment from root to internal to leaf levels: 86 | edge[style=invis]; 87 | "rootnode":d1 -> "leafnode_2":t2 88 | } 89 | -------------------------------------------------------------------------------- /unoptimized-leaf-page-split-1.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_2[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 |
1,11,2
23 | > 24 | ]; 25 | edge[style=invis]; 26 | "leafnode_2" -> "leafnode_3" 27 | leafnode_3[ label=< 28 | 29 | 30 | 31 | 32 | 33 | 34 |
1,31,4
35 | > 36 | ]; 37 | edge[style=invis]; 38 | "leafnode_3" -> "leafnode_4" 39 | leafnode_4[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 |
1,51,62,1
47 | > 48 | ]; 49 | //Force alignment from root to internal to leaf levels: 50 | //edge[style=invis]; 51 | //"rootnode":d1 -> "leafnode_2":t2 52 | } 53 | -------------------------------------------------------------------------------- /unoptimized-leaf-page-split-2.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_2[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 |
1,11,2
23 | > 24 | ]; 25 | edge[style=invis]; 26 | "leafnode_2" -> "leafnode_3" 27 | leafnode_3[ label=< 28 | 29 | 30 | 31 | 32 | 33 | 34 |
1,31,4
35 | > 36 | ]; 37 | edge[style=invis]; 38 | "leafnode_3" -> "leafnode_4" 39 | leafnode_4[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 |
1,51,6
47 | > 48 | ]; 49 | edge[style=invis]; 50 | "leafnode_4" -> "leafnode_5" 51 | leafnode_5[ label=< 52 | 53 | 54 | 55 | 56 | 57 | 58 |
1,71,81,92,1
59 | > 60 | ]; 61 | //Force alignment from root to internal to leaf levels: 62 | //edge[style=invis]; 63 | //"rootnode":d1 -> "leafnode_2":t2 64 | } 65 | -------------------------------------------------------------------------------- /unoptimized-leaf-page-split-3.dot: -------------------------------------------------------------------------------- 1 | // Example B-Tree graph 2 | // 3 | // Breadth-first order 4 | // 5 | // Start from root, go left to right 6 | // 7 | // Workflow: 8 | // 9 | // $ dot -T svg btree.dot -o btree.svg; 10 | 11 | digraph nbtree { 12 | graph [fontname = "monospace"]; 13 | node [shape = none,height=.1,fontname = "monospace",fontsize=18]; 14 | // Level 0 (leaf level) 15 | leafnode_2[ label=< 16 | 17 | 18 | 19 | 20 | 21 | 22 |
1,11,2
23 | > 24 | ]; 25 | edge[style=invis]; 26 | "leafnode_2" -> "leafnode_3" 27 | leafnode_3[ label=< 28 | 29 | 30 | 31 | 32 | 33 | 34 |
1,31,4
35 | > 36 | ]; 37 | edge[style=invis]; 38 | "leafnode_3" -> "leafnode_4" 39 | leafnode_4[ label=< 40 | 41 | 42 | 43 | 44 | 45 | 46 |
1,51,6
47 | > 48 | ]; 49 | edge[style=invis]; 50 | "leafnode_4" -> "leafnode_5" 51 | leafnode_5[ label=< 52 | 53 | 54 | 55 | 56 | 57 | 58 |
1,71,8
59 | > 60 | ]; 61 | edge[style=invis]; 62 | "leafnode_5" -> "leafnode_6" 63 | leafnode_6[ label=< 64 | 65 | 66 | 67 | 68 | 69 | 70 |
1,91,10
71 | > 72 | ]; 73 | edge[style=invis]; 74 | "leafnode_6" -> "leafnode_7" 75 | leafnode_7[ label=< 76 | 77 | 78 | 79 | 80 | 81 | 82 |
1,111,122,1
83 | > 84 | ]; 85 | //Force alignment from root to internal to leaf levels: 86 | //edge[style=invis]; 87 | //"rootnode":d1 -> "leafnode_2":t2 88 | } 89 | --------------------------------------------------------------------------------