├── .github └── workflows │ └── main.yml ├── .gitignore ├── AUTHORS ├── COPYRIGHT ├── ChangeLog ├── Makefile ├── README.md ├── TODO ├── debian ├── changelog ├── control ├── control.in ├── copyright ├── pgversions ├── rules ├── source │ └── format ├── tests │ ├── control │ └── installcheck └── watch ├── examples └── buffercache_pgfincore.sql ├── expected └── pgfincore.out ├── pgfincore--1.2--1.3.1.sql ├── pgfincore--1.3.1.sql ├── pgfincore.c ├── pgfincore.control └── sql └── pgfincore.sql /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: ['*'] 5 | pull_request: 6 | branches: ['*'] 7 | jobs: 8 | build: 9 | strategy: 10 | matrix: 11 | pg: 12 | - 16 13 | - 15 14 | - 14 15 | - 13 16 | - 12 17 | - 11 18 | - 10 19 | - 9.6 20 | - 9.5 21 | - 9.4 22 | name: 🐘 PostgreSQL ${{ matrix.pg }} 23 | runs-on: ubuntu-latest 24 | container: pgxn/pgxn-tools 25 | steps: 26 | - name: Start PostgreSQL ${{ matrix.pg }} 27 | run: pg-start ${{ matrix.pg }} 28 | - name: Check out the repo 29 | uses: actions/checkout@v4 30 | - name: Test on PostgreSQL ${{ matrix.pg }} 31 | run: pg-build-test -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .pc 2 | debian/files 3 | build-pgfincore-* 4 | debian/postgresql-* 5 | results/ 6 | *.so 7 | pgfincore*.tar.gz 8 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | pgfincore is written by: 2 | 3 | * Cédric Villemain 4 | 5 | I take pg_relation_size code as a model, I look at the C interesting part from 6 | fincore (http://net.doit.wisc.edu/~plonka/fincore/), and I follow the great idea 7 | from http://www.kennygorman.com/wordpress/?p=246. 8 | 9 | In short, thank you Kenny Gorman, thank you Dave Plonka ! 10 | 11 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009-2016 Cédric Villemain 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions 7 | * are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. Neither the name of the author nor the names of any co-contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS 18 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY 21 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 25 | * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 26 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 27 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 21/09/2023 Cédric Villemain 2 | * 1.3.1 - drop support for upgrading from "unpackaged" 3 | 21/09/2023 Cédric Villemain 4 | * 1.3 - added support for PostgreSQL 16 5 | - drop support for PostgreSQL < 9.4 6 | 2019-10-29 Cédric Villemain 7 | * 1.2.2 - Fix bad errno usage 8 | 22/09/2017 Cédric Villemain 9 | * 1.2.1 - Fix check on NULL input for drawer function 10 | 11 | 16/09/2016 Cédric Villemain 12 | * 1.2 - Prepared fincore syscall usage 13 | - Added 2 columns in pgfincore() with dirty status when available 14 | - Support PostgreSQL 9.6 15 | - Added a drawer function 16 | 17 | 12/10/2013 Cédric Villemain 18 | * 1.1.2 - Fix README filename for PGXS 19 | - Update to PostgreSQL 9.3 20 | - Fix faillure on NULL input (pgfadvise_loader) 21 | - Several fixes and layout changes 22 | 23 | 06/21/2012 Cédric Villemain 24 | * 1.1.2 - Change the open() call to use AllocateFile, and FreeFile 25 | 26 | 12/06/2011 Cédric Villemain 27 | * 1.1.1 - Fix Makefile again, as well as debian scripts (VPATH) 28 | - Add checks (make installcheck) 29 | - Improve .gitignore 30 | - Add a debian/watch file tracking pgfoundry release 31 | - Add regression files to VPATH build 32 | 33 | 09/07/2011 Cédric Villemain 34 | * 1.1.0 - Fix Makefile and remove the dir sql/ (useless and error prone) 35 | - Fix the printf of int64 by casting to long long int (i386 and 36 | adm64 behave differently with int64) 37 | - Updated to work with PostgreSQL 8.3 (TAKATSUKA Haruka) 38 | - Improve debian packaging (Dimitri Fontaine) 39 | - Add support for *BSD kernels 40 | - Remove mention of PGXS in the README 41 | 42 | 07/28/2011 Cédric Villemain 43 | * 1.0.0 - Output varbit containing vector information with pgfincore*() 44 | - Add Debian packaging (Dimitri Fontaine) 45 | - Update to work with PostgreSQL >= 9.1 (Jeff Janes) 46 | - Add total number of pages of memory with pgsysconf() 47 | - Add function pgsysconf_pretty() 48 | - Major rewrite of the functions 49 | - pgfadvise*() to handle simple posix_fadvise call 50 | - pgfadvise_loader() to restore file status (pages in/out cache) 51 | - pgfincore*() to handle mincore usage 52 | - pgsysconf*() to handle sysconf information 53 | - Use get_call_result_type() to build the tuple descriptor (suggested 54 | by RhodiumToad on IRC) 55 | - Remove limitation of usage on temp tables 56 | - Improve 9.1 installation (Extension) 57 | 58 | 04/30/2010 Cédric Villemain 59 | 60 | * 0.4.1 - use AllocateFile instead of fopen 61 | - call PG_GETARG* earlier 62 | - remove useless global counter 63 | - add error handler in pgfadv_snapshot() 64 | - errno to catch the last segment 65 | - improve Readme 66 | - some minor fix and beautify 67 | 68 | 01/05/2010 Cédric Villemain 69 | 70 | * 0.4.0 - fix test is not temp table 71 | - add posix_fadvise_willneed flag 72 | - add posix_fadvise_dontneed flag 73 | - add posix_fadvise_normal flag 74 | - add posix_fadvise_sequential flag 75 | - add posix_fadvise_random flag 76 | - rewrite main SRF 77 | - improve output (more informations) 78 | - fix copyright 79 | - add pgsysconf() 80 | - add pgmincore_snapshot to write mincore state in a file 81 | - add pgfadv_willneed_snapshot to read mincore state from file 82 | 83 | 10/26/2009 Cédric Villemain 84 | 85 | * 0.3.2 - fix fctx init 86 | 87 | 10/26/2009 Cédric Villemain 88 | 89 | * 0.3.1 - fix Makefile without PGXS 90 | - fix install doc in README 91 | 92 | 08/12/2009 Cédric Villemain 93 | 94 | * 0.3 - pgfincore now return a set of record 95 | - relname, relpath, block_disk, block_mem, group_mem 96 | - this version can only be build againt a postgresql > 8.3 /!\ 97 | 98 | 08/10/2009 Cédric Villemain 99 | 100 | * 0.2.1 - fix munmap call error 101 | 102 | 08/08/2009 Cédric Villemain 103 | 104 | * 0.2 - add support for 8.4 105 | - fix mmap error when file is empty 106 | 107 | 06/29/2009 Cédric Villemain 108 | 109 | * 0.1.1 - cleaning and fixing 110 | 111 | 06/27/2009 Cédric Villemain 112 | 113 | * 0.1 - functions are working, basicaly. 114 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = pgfincore 2 | EXTVERSION = 1.3.1 3 | 4 | MODULES = $(EXTENSION) 5 | MODULEDIR = $(EXTENSION) 6 | DOCS = README.md 7 | DATA = $(EXTENSION)--1.2--1.3.1.sql \ 8 | $(EXTENSION)--$(EXTVERSION).sql 9 | 10 | REGRESS = $(EXTENSION) 11 | 12 | PG_CONFIG = pg_config 13 | 14 | PGXS := $(shell $(PG_CONFIG) --pgxs) 15 | 16 | include $(PGXS) 17 | 18 | dist: 19 | git archive --prefix=$(EXTENSION)-$(EXTVERSION)/ -o ../$(EXTENSION)_$(EXTVERSION).orig.tar.gz HEAD 20 | 21 | deb: 22 | make clean 23 | pg_buildext updatecontrol 24 | make -f debian/rules debian/control 25 | dh clean 26 | make dist 27 | dpkg-buildpackage -us -uc 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CI](https://github.com/klando/pgfincore/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/klando/pgfincore/actions/workflows/main.yml) 2 | 3 | # PgFincore 4 | 5 | -------------------------------------------------------------- 6 | A set of functions to manage pages in memory from PostgreSQL 7 | -------------------------------------------------------------- 8 | 9 | A set of functions to handle low-level management of relations using mincore to 10 | explore cache memory. 11 | 12 | ## DESCRIPTION 13 | 14 | With PostgreSQL, each Table or Index is splitted in segments of (usually) 1GB, 15 | and each segment is splitted in pages in memory then in blocks for the 16 | filesystem. 17 | 18 | Those functions let you know which and how many disk block from a relation are 19 | in the page cache of the operating system. It can provide the result as a VarBit 20 | and can be stored in a table. Then using this table, it is possible to restore 21 | the page cache state for each block of the relation, even in another server, 22 | thanks to Streaming Replication. 23 | 24 | Other functions are used to set a *POSIX_FADVISE* flag on the entire relation 25 | (each segment). The more usefull are probably *WILLNEED* and *DONTNEED* which 26 | push and pop blocks of each segments of a relation from page cache, 27 | respectively. 28 | 29 | Each functions are call with at least a table name or an index name (or oid) 30 | as a parameter and walk each segment of the relation. 31 | 32 | ## DOWNLOAD 33 | 34 | You can grab the latest code with git: 35 | 36 | git clone git://git.postgresql.org/git/pgfincore.git 37 | or 38 | git://github.com/klando/pgfincore.git 39 | 40 | And the project is on pgfoundry : http://pgfoundry.org/projects/pgfincore 41 | 42 | ## INSTALL 43 | 44 | From source code: 45 | 46 | make clean 47 | make 48 | su 49 | make install 50 | 51 | For PostgreSQL >= 9.1, log in your database and: 52 | 53 | mydb=# CREATE EXTENSION pgfincore; 54 | 55 | For other release, create the functions from the sql script (it should be in 56 | your contrib directory): 57 | 58 | psql mydb -f pgfincore.sql 59 | 60 | PgFincore is also shipped with Debian scripts to build your own package: 61 | 62 | aptitude install debhelper postgresql-server-dev-all postgresql-server-dev-9.1 63 | # or postgresql-server-dev-8.4|postgresql-server-dev-9.0 64 | make deb 65 | dpkg -i ../postgresql-9.1-pgfincore_1.1.1-1_amd64.deb 66 | 67 | PgFincore is packaged for *RPM* at http://yum.postgresql.org/ 68 | PgFincore is packaged for *debian* at http://pgapt.debian.net/ 69 | 70 | 71 | 72 | 73 | ## EXAMPLES 74 | 75 | Here are some examples of usage. If you want more details go to Documentation_ 76 | 77 | ### Get current state of a relation 78 | 79 | May be useful: 80 | 81 | cedric=# select * from pgfincore('pgbench_accounts'); 82 | relpath | segment | os_page_size | rel_os_pages | pages_mem | group_mem | os_pages_free | databit | pages_dirty | group_dirty 83 | --------------------+---------+--------------+--------------+-----------+-----------+---------------+---------+-------------+------------- 84 | base/11874/16447 | 0 | 4096 | 262144 | 262144 | 1 | 81016 | | 0 | 0 85 | base/11874/16447.1 | 1 | 4096 | 65726 | 65726 | 1 | 81016 | | 0 | 0 86 | (2 rows) 87 | 88 | Time: 31.563 ms 89 | 90 | ### Load a table or an index in OS Page Buffer 91 | 92 | You may want to try to keep a table or an index into the OS Page Cache, or 93 | preload a table before your well know big query is executed (reducing the query 94 | time). 95 | 96 | To do so, just execute the following query: 97 | 98 | cedric=# select * from pgfadvise_willneed('pgbench_accounts'); 99 | relpath | os_page_size | rel_os_pages | os_pages_free 100 | --------------------+--------------+--------------+--------------- 101 | base/11874/16447 | 4096 | 262144 | 169138 102 | base/11874/16447.1 | 4096 | 65726 | 103352 103 | (2 rows) 104 | 105 | Time: 4462,936 ms 106 | 107 | * The column *os_page_size* report that page size is 4KB. 108 | * The column *rel_os_pages* is the number of pages of the specified file. 109 | * The column *os_pages_free* is the number of free pages in memory (for caching). 110 | 111 | ### Snapshot and Restore the OS Page Buffer state of a table or an index (or more) 112 | 113 | You may want to restore a table or an index into the OS Page Cache as it was 114 | while you did the snapshot. For example if you have to reboot your server, then 115 | when PostgreSQL start up the first queries might be slower because neither 116 | PostgreSQL or the OS have pages in their respective cache about the relations 117 | involved in those first queries. 118 | 119 | Executing a snapshot and a restore is very simple: 120 | 121 | -- Snapshot 122 | cedric=# create table pgfincore_snapshot as 123 | cedric-# select 'pgbench_accounts'::text as relname,*,now() as date_snapshot 124 | cedric-# from pgfincore('pgbench_accounts',true); 125 | 126 | -- Restore 127 | cedric=# select * from pgfadvise_loader('pgbench_accounts', 0, true, true, 128 | (select databit from pgfincore_snapshot 129 | where relname='pgbench_accounts' and segment = 0)); 130 | relpath | os_page_size | os_pages_free | pages_loaded | pages_unloaded 131 | ------------------+--------------+---------------+--------------+---------------- 132 | base/11874/16447 | 4096 | 80867 | 262144 | 0 133 | (1 row) 134 | 135 | Time: 35.349 ms 136 | 137 | * The column *pages_loaded* report how many pages have been read to memory 138 | (they may have already been in memoy) 139 | * The column *pages_unloaded* report how many pages have been removed from 140 | memory (they may not have already been in memoy); 141 | 142 | ## SYNOPSIS 143 | 144 | pgsysconf(OUT os_page_size bigint, OUT os_pages_free bigint, 145 | OUT os_total_pages bigint) 146 | RETURNS record 147 | 148 | pgsysconf_pretty(OUT os_page_size text, OUT os_pages_free text, 149 | OUT os_total_pages text) 150 | RETURNS record 151 | 152 | pgfadvise(IN relname regclass, IN fork text, IN action int, 153 | OUT relpath text, OUT os_page_size bigint, 154 | OUT rel_os_pages bigint, OUT os_pages_free bigint) 155 | RETURNS setof record 156 | 157 | pgfadvise_willneed(IN relname regclass, 158 | OUT relpath text, OUT os_page_size bigint, 159 | OUT rel_os_pages bigint, OUT os_pages_free bigint) 160 | RETURNS setof record 161 | 162 | pgfadvise_dontneed(IN relname regclass, 163 | OUT relpath text, OUT os_page_size bigint, 164 | OUT rel_os_pages bigint, OUT os_pages_free bigint) 165 | RETURNS setof record 166 | 167 | pgfadvise_normal(IN relname regclass, 168 | OUT relpath text, OUT os_page_size bigint, 169 | OUT rel_os_pages bigint, OUT os_pages_free bigint) 170 | RETURNS setof record 171 | 172 | pgfadvise_sequential(IN relname regclass, 173 | OUT relpath text, OUT os_page_size bigint, 174 | OUT rel_os_pages bigint, OUT os_pages_free bigint) 175 | RETURNS setof record 176 | 177 | pgfadvise_random(IN relname regclass, 178 | OUT relpath text, OUT os_page_size bigint, 179 | OUT rel_os_pages bigint, OUT os_pages_free bigint) 180 | RETURNS setof record 181 | 182 | pgfadvise_loader(IN relname regclass, IN fork text, IN segment int, 183 | IN load bool, IN unload bool, IN databit varbit, 184 | OUT relpath text, OUT os_page_size bigint, 185 | OUT os_pages_free bigint, OUT pages_loaded bigint, 186 | OUT pages_unloaded bigint) 187 | RETURNS setof record 188 | 189 | pgfadvise_loader(IN relname regclass, IN segment int, 190 | IN load bool, IN unload bool, IN databit varbit, 191 | OUT relpath text, OUT os_page_size bigint, 192 | OUT os_pages_free bigint, OUT pages_loaded bigint, 193 | OUT pages_unloaded bigint) 194 | RETURNS setof record 195 | 196 | pgfincore(IN relname regclass, IN fork text, IN getdatabit bool, 197 | OUT relpath text, OUT segment int, OUT os_page_size bigint, 198 | OUT rel_os_pages bigint, OUT pages_mem bigint, 199 | OUT group_mem bigint, OUT os_pages_free bigint, 200 | OUT databit varbit, OUT pages_dirty bigint, 201 | OUT group_dirty bigint) 202 | RETURNS setof record 203 | 204 | pgfincore(IN relname regclass, IN getdatabit bool, 205 | OUT relpath text, OUT segment int, OUT os_page_size bigint, 206 | OUT rel_os_pages bigint, OUT pages_mem bigint, 207 | OUT group_mem bigint, OUT os_pages_free bigint, 208 | OUT databit varbit, OUT pages_dirty bigint, 209 | OUT group_dirty bigint) 210 | RETURNS setof record 211 | 212 | pgfincore(IN relname regclass, 213 | OUT relpath text, OUT segment int, OUT os_page_size bigint, 214 | OUT rel_os_pages bigint, OUT pages_mem bigint, 215 | OUT group_mem bigint, OUT os_pages_free bigint, 216 | OUT databit varbit, OUT pages_dirty bigint, 217 | OUT group_dirty bigint) 218 | RETURNS setof record 219 | 220 | ## DOCUMENTATION 221 | 222 | ### pgsysconf 223 | 224 | This function output size of OS blocks, number of free page in the OS Page Buffer. 225 | 226 | cedric=# select * from pgsysconf(); 227 | os_page_size | os_pages_free | os_total_pages 228 | --------------+---------------+---------------- 229 | 4096 | 80431 | 4094174 230 | 231 | ### pgsysconf_pretty 232 | 233 | The same as above, but with pretty output. 234 | 235 | cedric=# select * from pgsysconf_pretty(); 236 | os_page_size | os_pages_free | os_total_pages 237 | --------------+---------------+---------------- 238 | 4096 bytes | 314 MB | 16 GB 239 | 240 | ### pgfadvise_WILLNEED 241 | 242 | This function set *WILLNEED* flag on the current relation. It means that the 243 | Operating Sytem will try to load as much pages as possible of the relation. 244 | Main idea is to preload files on server startup, perhaps using cache hit/miss 245 | ratio or most required relations/indexes. 246 | 247 | cedric=# select * from pgfadvise_willneed('pgbench_accounts'); 248 | relpath | os_page_size | rel_os_pages | os_pages_free 249 | --------------------+--------------+--------------+--------------- 250 | base/11874/16447 | 4096 | 262144 | 80650 251 | base/11874/16447.1 | 4096 | 65726 | 80650 252 | 253 | ### pgfadvise_DONTNEED 254 | 255 | This function set *DONTNEED* flag on the current relation. It means that the 256 | Operating System will first unload pages of the file if it need to free some 257 | memory. Main idea is to unload files when they are not usefull anymore (instead 258 | of perhaps more interesting pages) 259 | 260 | cedric=# select * from pgfadvise_dontneed('pgbench_accounts'); 261 | relpath | os_page_size | rel_os_pages | os_pages_free 262 | --------------------+--------------+--------------+--------------- 263 | base/11874/16447 | 4096 | 262144 | 342071 264 | base/11874/16447.1 | 4096 | 65726 | 408103 265 | 266 | 267 | ### pgfadvise_NORMAL 268 | 269 | This function set *NORMAL* flag on the current relation. 270 | 271 | ### pgfadvise_SEQUENTIAL 272 | 273 | This function set *SEQUENTIAL* flag on the current relation. 274 | 275 | ### pgfadvise_RANDOM 276 | 277 | This function set *RANDOM* flag on the current relation. 278 | 279 | ### pgfadvise_loader 280 | 281 | This function allow to interact directly with the Page Cache. 282 | It can be used to load and/or unload page from memory based on a varbit 283 | representing the map of the pages to load/unload accordingly. 284 | 285 | Work with relation pgbench_accounts, segment 0, arbitrary varbit map: 286 | 287 | -- Loading and Unloading 288 | cedric=# select * from pgfadvise_loader('pgbench_accounts', 0, true, true, B'111000'); 289 | relpath | os_page_size | os_pages_free | pages_loaded | pages_unloaded 290 | ------------------+--------------+---------------+--------------+---------------- 291 | base/11874/16447 | 4096 | 408376 | 3 | 3 292 | 293 | -- Loading 294 | cedric=# select * from pgfadvise_loader('pgbench_accounts', 0, true, false, B'111000'); 295 | relpath | os_page_size | os_pages_free | pages_loaded | pages_unloaded 296 | ------------------+--------------+---------------+--------------+---------------- 297 | base/11874/16447 | 4096 | 408370 | 3 | 0 298 | 299 | -- Unloading 300 | cedric=# select * from pgfadvise_loader('pgbench_accounts', 0, false, true, B'111000'); 301 | relpath | os_page_size | os_pages_free | pages_loaded | pages_unloaded 302 | ------------------+--------------+---------------+--------------+---------------- 303 | base/11874/16447 | 4096 | 408370 | 0 | 3 304 | 305 | ### pgfincore 306 | 307 | This function provide information about the file system cache (page cache). 308 | 309 | cedric=# select * from pgfincore('pgbench_accounts'); 310 | relpath | segment | os_page_size | rel_os_pages | pages_mem | group_mem | os_pages_free | databit | pages_dirty | group_dirty 311 | --------------------+---------+--------------+--------------+-----------+-----------+---------------+---------+-------------+------------- 312 | base/11874/16447 | 0 | 4096 | 262144 | 3 | 1 | 408444 | | 0 | 0 313 | base/11874/16447.1 | 1 | 4096 | 65726 | 0 | 0 | 408444 | | 0 | 0 314 | 315 | For the specified relation it returns: 316 | 317 | * relpath : the relation path 318 | * segment : the segment number analyzed 319 | * os_page_size : the size of one page 320 | * rel_os_pages : the total number of pages of the relation 321 | * pages_mem : the total number of relation's pages in page cache. 322 | (not the shared buffers from PostgreSQL but the OS cache) 323 | * group_mem : the number of groups of adjacent pages_mem 324 | * os_page_free : the number of free page in the OS page cache 325 | * databit : the varbit map of the file, because of its size it is useless to output 326 | Use pgfincore('pgbench_accounts',true) to activate it. 327 | * pages_dirty : if HAVE_FINCORE constant is define and the platorm provides the relevant information, like pages_mem but for dirtied pages 328 | * group_dirty : if HAVE_FINCORE constant is define and the platorm provides the relevant information, like group_mem but for dirtied pages 329 | 330 | ## DEBUG 331 | 332 | You can debug the PgFincore with the following error level: *DEBUG1* and 333 | *DEBUG5*. 334 | 335 | For example: 336 | 337 | set client_min_messages TO debug1; -- debug5 is only usefull to trace each block 338 | 339 | ## REQUIREMENTS 340 | 341 | * PgFincore needs mincore() or fincore() and POSIX_FADVISE 342 | 343 | ## LIMITATIONS 344 | 345 | * PgFincore has a limited mode when POSIX_FADVISE is not provided by the platform. 346 | 347 | * PgFincore needs PostgreSQL >= 8.3 348 | 349 | * PgFincore does not work on windows. 350 | 351 | ## SEE ALSO 352 | 353 | Data Bene, PostgreSQL Expertise, Technical Support and Assistance, Trainings: 354 | 355 | https://www.data-bene.io 356 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | * [sql] average contigous block or stats like that (what part of the file is in cache) 2 | * [code] split mmaping in shorter segment (say 64Mb) per sugestion from Andres Freund 3 | * graph 4 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | pgfincore (1.3.1-1) unstable; urgency=medium 2 | 3 | * New upstream version. 4 | * Upload for PostgreSQL 16. 5 | * Use ${postgresql:Depends}. 6 | 7 | -- Christoph Berg Thu, 21 Sep 2023 17:51:54 +0200 8 | 9 | pgfincore (1.2.4-2) unstable; urgency=medium 10 | 11 | * Upload for PostgreSQL 15. 12 | 13 | -- Christoph Berg Fri, 21 Oct 2022 11:59:48 +0200 14 | 15 | pgfincore (1.2.4-1) unstable; urgency=medium 16 | 17 | * New version with PG 15 support. 18 | 19 | -- Christoph Berg Wed, 28 Sep 2022 14:13:32 +0200 20 | 21 | pgfincore (1.2.3-1) unstable; urgency=medium 22 | 23 | * Fix GitHub watch file. 24 | 25 | -- Christoph Berg Tue, 11 Jan 2022 12:37:15 +0100 26 | 27 | pgfincore (1.2.2-3) unstable; urgency=medium 28 | 29 | * Upload for PostgreSQL 14. 30 | 31 | -- Christoph Berg Wed, 03 Nov 2021 14:14:12 +0100 32 | 33 | pgfincore (1.2.2-2) unstable; urgency=medium 34 | 35 | * Upload for PostgreSQL 13. 36 | * Use dh --with pgxs. 37 | * R³: no. 38 | * DH 13. 39 | * debian/tests: Use 'make' instead of postgresql-server-dev-all. 40 | 41 | -- Christoph Berg Mon, 19 Oct 2020 12:39:15 +0200 42 | 43 | pgfincore (1.2.2-1) unstable; urgency=medium 44 | 45 | * Upload for PostgreSQL 12. 46 | 47 | -- Christoph Berg Tue, 29 Oct 2019 14:57:19 +0100 48 | 49 | pgfincore (1.2.1-2) unstable; urgency=medium 50 | 51 | * Upload for PostgreSQL 11. 52 | * Update PostgreSQL team address. 53 | 54 | -- Christoph Berg Fri, 12 Oct 2018 13:33:10 +0200 55 | 56 | pgfincore (1.2.1-1) unstable; urgency=medium 57 | 58 | * Team upload for PostgreSQL 10 support. 59 | * New upstream version. 60 | * debian/tests/control: Drop needs-root. 61 | 62 | -- Christoph Berg Fri, 22 Sep 2017 09:48:18 +0200 63 | 64 | pgfincore (1.2-2) unstable; urgency=medium 65 | 66 | * Upload with 9.6 support. 67 | * Update watch file to ignore debian/ tags on github. 68 | * Bump S-V and clean up results/. 69 | 70 | -- Christoph Berg Sat, 24 Sep 2016 13:16:54 +0200 71 | 72 | pgfincore (1.2-1) unstable; urgency=medium 73 | 74 | * New upstream release 1.2. 75 | 76 | -- Cédric Villemain Thu, 15 Sep 2016 13:13:14 +0200 77 | 78 | pgfincore (1.1.2-4) unstable; urgency=medium 79 | 80 | * Build for PostgreSQL 9.5. (Closes: #811134) 81 | * Bump Standards-Version to 3.9.6 (no changes needed). 82 | * debian/control.in: Drop obsolete XS-Testsuite: field. 83 | 84 | -- Martin Pitt Sat, 16 Jan 2016 11:50:19 +0100 85 | 86 | pgfincore (1.1.2-3) unstable; urgency=medium 87 | 88 | * Upload to unstable for 9.4. 89 | 90 | -- Christoph Berg Sun, 27 Jul 2014 11:16:09 +0200 91 | 92 | pgfincore (1.1.2-2) experimental; urgency=medium 93 | 94 | * Use "all" in debian/pgversions. 95 | * B-D on pg-common 158 to build against 9.3 and 9.4. 96 | * Use pg_buildext installcheck. 97 | * Set team as maintainer. 98 | 99 | -- Christoph Berg Sun, 06 Jul 2014 18:32:22 +0200 100 | 101 | pgfincore (1.1.2-1) unstable; urgency=low 102 | 103 | * New upstream release with PostgreSQL 9.3 support. (Closes: #725570) 104 | * Add autopkgtest support. 105 | * Add watch file looking for releases on github. 106 | 107 | -- Christoph Berg Tue, 10 Dec 2013 15:35:40 +0100 108 | 109 | pgfincore (1.1.1-1) unstable; urgency=low 110 | 111 | * New upstream release 112 | 113 | -- Cédric Villemain Fri, 02 Dec 2011 22:48:27 +0100 114 | 115 | pgfincore (1.1-1) unstable; urgency=low 116 | 117 | * New upstream release 118 | * Clean packaging for 9.1 (Closes: #639460) 119 | 120 | -- Dimitri Fontaine Mon, 05 Sep 2011 12:56:05 +0200 121 | 122 | pgfincore (1.0-1) unstable; urgency=low 123 | 124 | * New upstream release 125 | 126 | -- Dimitri Fontaine Wed, 27 Jul 2011 16:21:48 +0200 127 | 128 | pgfincore (0.4-1) unstable; urgency=low 129 | 130 | * Initial packaging 131 | 132 | -- Dimitri Fontaine Tue, 30 Nov 2010 15:27:25 +0100 133 | 134 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: pgfincore 2 | Section: database 3 | Priority: optional 4 | Maintainer: Debian PostgreSQL Maintainers 5 | Uploaders: Cédric Villemain , Dimitri Fontaine , Christoph Berg 6 | Build-Depends: debhelper-compat (= 13), postgresql-all (>= 217~) 7 | Standards-Version: 4.6.2 8 | Rules-Requires-Root: no 9 | Vcs-Git: git://git.postgresql.org/git/pgfincore.git 10 | Vcs-Browser: http://git.postgresql.org/gitweb/?p=pgfincore.git 11 | Homepage: http://villemain.org/projects/pgfincore 12 | 13 | Package: postgresql-16-pgfincore 14 | Architecture: any 15 | Depends: ${shlibs:Depends}, ${misc:Depends}, ${postgresql:Depends} 16 | Description: set of PostgreSQL functions to manage blocks in memory 17 | Those functions let you know which and how many disk block from a relation 18 | are in the page cache of the operating system, and eventually write the 19 | result to a file. Then using this file, it is possible to restore the page 20 | cache state for each block of the relation. 21 | -------------------------------------------------------------------------------- /debian/control.in: -------------------------------------------------------------------------------- 1 | Source: pgfincore 2 | Section: database 3 | Priority: optional 4 | Maintainer: Debian PostgreSQL Maintainers 5 | Uploaders: Cédric Villemain , Dimitri Fontaine , Christoph Berg 6 | Build-Depends: debhelper-compat (= 13), postgresql-all (>= 217~) 7 | Standards-Version: 4.6.2 8 | Rules-Requires-Root: no 9 | Vcs-Git: git://git.postgresql.org/git/pgfincore.git 10 | Vcs-Browser: http://git.postgresql.org/gitweb/?p=pgfincore.git 11 | Homepage: http://villemain.org/projects/pgfincore 12 | 13 | Package: postgresql-PGVERSION-pgfincore 14 | Architecture: any 15 | Depends: ${shlibs:Depends}, ${misc:Depends}, ${postgresql:Depends} 16 | Description: set of PostgreSQL functions to manage blocks in memory 17 | Those functions let you know which and how many disk block from a relation 18 | are in the page cache of the operating system, and eventually write the 19 | result to a file. Then using this file, it is possible to restore the page 20 | cache state for each block of the relation. 21 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: pgfincore 3 | Source: http://git.postgresql.org/gitweb/?p=pgfincore.git 4 | 5 | Files: * 6 | Copyright: 2009-2016 Cédric Villemain 7 | License: BSD-3-Clause 8 | 9 | Files: debian/* 10 | Copyright: 2013 Cédric Villemain 11 | License: BSD-3-Clause 12 | 13 | License: BSD-3-Clause 14 | Redistribution and use in source and binary forms, with or without 15 | modification, are permitted provided that the following conditions 16 | are met: 17 | 1. Redistributions of source code must retain the above copyright 18 | notice, this list of conditions and the following disclaimer. 19 | 2. Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 3. Neither the name of the University nor the names of its contributors 23 | may be used to endorse or promote products derived from this software 24 | without specific prior written permission. 25 | . 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE HOLDERS OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | -------------------------------------------------------------------------------- /debian/pgversions: -------------------------------------------------------------------------------- 1 | all 2 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | override_dh_installdocs: 4 | dh_installdocs --all README.* 5 | 6 | %: 7 | dh $@ --with pgxs 8 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (quilt) 2 | -------------------------------------------------------------------------------- /debian/tests/control: -------------------------------------------------------------------------------- 1 | Depends: @, make 2 | Tests: installcheck 3 | Restrictions: allow-stderr 4 | -------------------------------------------------------------------------------- /debian/tests/installcheck: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | pg_buildext installcheck 6 | -------------------------------------------------------------------------------- /debian/watch: -------------------------------------------------------------------------------- 1 | version=4 2 | https://github.com/klando/pgfincore/tags .*/([^/-]*).tar.gz 3 | -------------------------------------------------------------------------------- /examples/buffercache_pgfincore.sql: -------------------------------------------------------------------------------- 1 | with my_table as ( 2 | select oid 3 | , relfilenode 4 | , relname 5 | from pg_class 6 | where relname = 'pgbench_accounts' 7 | ) 8 | , t as ( 9 | select generate_series(1, relpages) as g 10 | from my_table 11 | join pg_class using (relname) 12 | ) 13 | , buf as ( 14 | select relblocknumber * 2 as bn -- Pgfincore use filesystem block size 15 | , usagecount as c 16 | , isdirty as d 17 | from my_table 18 | join pg_buffercache using (relfilenode) 19 | where relforknumber = 0 20 | ) 21 | , pgf as ( 22 | select (row_number() over (partition by c)) - 1 as bn -- pascal vs C 23 | , c 24 | , NULL as d 25 | from (select unnest( 26 | string_to_array( 27 | (pgfincore(my_table.oid, true)).databit::text, NULL 28 | ) 29 | ) as c 30 | from my_table ) g 31 | ) 32 | , fb as ( 33 | select pgf.bn as file_block_number 34 | , buf.c as pgcache 35 | , buf.d as pgdirty 36 | , pgf.c as oscache 37 | , pgf.d as osdirty 38 | from buf 39 | right join pgf using (bn) 40 | order by 1, 2, 3 41 | ), 42 | res as ( 43 | select * 44 | from fb 45 | ) 46 | select row_to_json(res) -- use "res" CTE if no JSON datatype (pg < 9.2) 47 | from res; 48 | -------------------------------------------------------------------------------- /expected/pgfincore.out: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pgfincore; 2 | -- 3 | -- test SYSCONF 4 | -- 5 | select from pgsysconf(); 6 | -- 7 | (1 row) 8 | 9 | select from pgsysconf_pretty(); 10 | -- 11 | (1 row) 12 | 13 | -- 14 | -- make a temp table to use below 15 | -- 16 | CREATE TEMP TABLE test AS SELECT generate_series(1,256) as a; 17 | -- 18 | -- this is not perfect testing but it is hard to predict what the OS will do 19 | -- for *sure* 20 | -- 21 | -- 22 | -- test fadvise_loader 23 | -- 24 | select from pgfadvise_loader('test', 0, true, true, B'1010'); 25 | -- 26 | (1 row) 27 | 28 | select from pgfadvise_loader('test', 0, true, false, B'1010'); 29 | -- 30 | (1 row) 31 | 32 | select from pgfadvise_loader('test', 0, false, true, B'1010'); 33 | -- 34 | (1 row) 35 | 36 | select from pgfadvise_loader('test', 0, false, false, B'1010'); 37 | -- 38 | (1 row) 39 | 40 | -- must not fail on empty databit input 41 | select from pgfadvise_loader('test', 0, false, false, B''); 42 | -- 43 | (1 row) 44 | 45 | -- ERROR on NULL databit input 46 | select from pgfadvise_loader('test', 0, false, false, NULL); 47 | ERROR: pgfadvise_loader: databit argument shouldn't be NULL 48 | CONTEXT: SQL function "pgfadvise_loader" statement 1 49 | -- 50 | -- test pgfincore 51 | -- 52 | select from pgfincore('test', true); 53 | -- 54 | (1 row) 55 | 56 | select from pgfincore('test'); 57 | -- 58 | (1 row) 59 | 60 | -- 61 | -- test DONTNEED, WILLNEED 62 | -- 63 | select from pgfadvise_willneed('test'); 64 | -- 65 | (1 row) 66 | 67 | select from pgfadvise_dontneed('test'); 68 | -- 69 | (1 row) 70 | 71 | -- 72 | -- test PGFADVISE flags 73 | -- 74 | select from pgfadvise_sequential('test'); 75 | -- 76 | (1 row) 77 | 78 | select from pgfadvise_random('test'); 79 | -- 80 | (1 row) 81 | 82 | select from pgfadvise_normal('test'); 83 | -- 84 | (1 row) 85 | 86 | -- 87 | -- tests drawers 88 | -- 89 | select NULL || pgfincore_drawer(databit) from pgfincore('test','main',true); 90 | ?column? 91 | ---------- 92 | 93 | (1 row) 94 | 95 | -------------------------------------------------------------------------------- /pgfincore--1.2--1.3.1.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klando/pgfincore/dd964ee2b25ce86bfc2ac17f5cf683033d41d9b5/pgfincore--1.2--1.3.1.sql -------------------------------------------------------------------------------- /pgfincore--1.3.1.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SYSCONF 3 | -- 4 | CREATE OR REPLACE FUNCTION 5 | pgsysconf(OUT os_page_size bigint, 6 | OUT os_pages_free bigint, 7 | OUT os_total_pages bigint) 8 | RETURNS record 9 | AS '$libdir/pgfincore' 10 | LANGUAGE C; 11 | 12 | COMMENT ON FUNCTION pgsysconf() 13 | IS 'Get system configuration information at run time: 14 | - os_page_size is _SC_PAGESIZE 15 | - os_pages_free is _SC_AVPHYS_PAGES 16 | - os_total_pages is _SC_PHYS_PAGES 17 | 18 | man 3 sysconf for details'; 19 | 20 | 21 | CREATE OR REPLACE FUNCTION 22 | pgsysconf_pretty(OUT os_page_size text, 23 | OUT os_pages_free text, 24 | OUT os_total_pages text) 25 | RETURNS record 26 | AS ' 27 | select pg_size_pretty(os_page_size) as os_page_size, 28 | pg_size_pretty(os_pages_free * os_page_size) as os_pages_free, 29 | pg_size_pretty(os_total_pages * os_page_size) as os_total_pages 30 | from pgsysconf()' 31 | LANGUAGE SQL; 32 | 33 | COMMENT ON FUNCTION pgsysconf_pretty() 34 | IS 'Pgsysconf() with human readable output'; 35 | 36 | -- 37 | -- PGFADVISE 38 | -- 39 | CREATE OR REPLACE FUNCTION 40 | pgfadvise(IN regclass, IN text, IN int, 41 | OUT relpath text, 42 | OUT os_page_size bigint, 43 | OUT rel_os_pages bigint, 44 | OUT os_pages_free bigint) 45 | RETURNS setof record 46 | AS '$libdir/pgfincore' 47 | LANGUAGE C; 48 | 49 | COMMENT ON FUNCTION pgfadvise(regclass, text, int) 50 | IS 'Predeclare an access pattern for file data'; 51 | 52 | CREATE OR REPLACE FUNCTION 53 | pgfadvise_willneed(IN regclass, 54 | OUT relpath text, 55 | OUT os_page_size bigint, 56 | OUT rel_os_pages bigint, 57 | OUT os_pages_free bigint) 58 | RETURNS setof record 59 | AS 'SELECT pgfadvise($1, ''main'', 10)' 60 | LANGUAGE SQL; 61 | 62 | CREATE OR REPLACE FUNCTION 63 | pgfadvise_dontneed(IN regclass, 64 | OUT relpath text, 65 | OUT os_page_size bigint, 66 | OUT rel_os_pages bigint, 67 | OUT os_pages_free bigint) 68 | RETURNS setof record 69 | AS 'SELECT pgfadvise($1, ''main'', 20)' 70 | LANGUAGE SQL; 71 | 72 | CREATE OR REPLACE FUNCTION 73 | pgfadvise_normal(IN regclass, 74 | OUT relpath text, 75 | OUT os_page_size bigint, 76 | OUT rel_os_pages bigint, 77 | OUT os_pages_free bigint) 78 | RETURNS setof record 79 | AS 'SELECT pgfadvise($1, ''main'', 30)' 80 | LANGUAGE SQL; 81 | 82 | CREATE OR REPLACE FUNCTION 83 | pgfadvise_sequential(IN regclass, 84 | OUT relpath text, 85 | OUT os_page_size bigint, 86 | OUT rel_os_pages bigint, 87 | OUT os_pages_free bigint) 88 | RETURNS setof record 89 | AS 'SELECT pgfadvise($1, ''main'', 40)' 90 | LANGUAGE SQL; 91 | 92 | CREATE OR REPLACE FUNCTION 93 | pgfadvise_random(IN regclass, 94 | OUT relpath text, 95 | OUT os_page_size bigint, 96 | OUT rel_os_pages bigint, 97 | OUT os_pages_free bigint) 98 | RETURNS setof record 99 | AS 'SELECT pgfadvise($1, ''main'', 50)' 100 | LANGUAGE SQL; 101 | 102 | -- 103 | -- PGFADVISE_LOADER 104 | -- 105 | CREATE OR REPLACE FUNCTION 106 | pgfadvise_loader(IN regclass, IN text, IN int, IN bool, IN bool, IN varbit, 107 | OUT relpath text, 108 | OUT os_page_size bigint, 109 | OUT os_pages_free bigint, 110 | OUT pages_loaded bigint, 111 | OUT pages_unloaded bigint) 112 | RETURNS setof record 113 | AS '$libdir/pgfincore' 114 | LANGUAGE C; 115 | 116 | COMMENT ON FUNCTION pgfadvise_loader(regclass, text, int, bool, bool, varbit) 117 | IS 'Restore cache from the snapshot, options to load/unload each block to/from cache'; 118 | 119 | 120 | CREATE OR REPLACE FUNCTION 121 | pgfadvise_loader(IN regclass, IN int, IN bool, IN bool, IN varbit, 122 | OUT relpath text, 123 | OUT os_page_size bigint, 124 | OUT os_pages_free bigint, 125 | OUT pages_loaded bigint, 126 | OUT pages_unloaded bigint) 127 | RETURNS setof record 128 | AS 'SELECT pgfadvise_loader($1, ''main'', $2, $3, $4, $5)' 129 | LANGUAGE SQL; 130 | 131 | -- 132 | -- PGFINCORE 133 | -- 134 | CREATE OR REPLACE FUNCTION 135 | pgfincore(IN regclass, IN text, IN bool, 136 | OUT relpath text, 137 | OUT segment int, 138 | OUT os_page_size bigint, 139 | OUT rel_os_pages bigint, 140 | OUT pages_mem bigint, 141 | OUT group_mem bigint, 142 | OUT os_pages_free bigint, 143 | OUT databit varbit, 144 | OUT pages_dirty bigint, 145 | OUT group_dirty bigint) 146 | RETURNS setof record 147 | AS '$libdir/pgfincore' 148 | LANGUAGE C; 149 | 150 | COMMENT ON FUNCTION pgfincore(regclass, text, bool) 151 | IS 'Utility to inspect and get a snapshot of the system cache'; 152 | 153 | CREATE OR REPLACE FUNCTION 154 | pgfincore(IN regclass, IN bool, 155 | OUT relpath text, 156 | OUT segment int, 157 | OUT os_page_size bigint, 158 | OUT rel_os_pages bigint, 159 | OUT pages_mem bigint, 160 | OUT group_mem bigint, 161 | OUT os_pages_free bigint, 162 | OUT databit varbit, 163 | OUT pages_dirty bigint, 164 | OUT group_dirty bigint) 165 | RETURNS setof record 166 | AS 'SELECT * from pgfincore($1, ''main'', $2)' 167 | LANGUAGE SQL; 168 | 169 | CREATE OR REPLACE FUNCTION 170 | pgfincore(IN regclass, 171 | OUT relpath text, 172 | OUT segment int, 173 | OUT os_page_size bigint, 174 | OUT rel_os_pages bigint, 175 | OUT pages_mem bigint, 176 | OUT group_mem bigint, 177 | OUT os_pages_free bigint, 178 | OUT databit varbit, 179 | OUT pages_dirty bigint, 180 | OUT group_dirty bigint) 181 | RETURNS setof record 182 | AS 'SELECT * from pgfincore($1, ''main'', false)' 183 | LANGUAGE SQL; 184 | 185 | CREATE OR REPLACE FUNCTION 186 | pgfincore_drawer(IN varbit, 187 | OUT drawer cstring) 188 | RETURNS cstring 189 | AS '$libdir/pgfincore' 190 | LANGUAGE C; 191 | 192 | COMMENT ON FUNCTION pgfincore_drawer(varbit) 193 | IS 'A naive drawing function to visualize page cache per object'; 194 | -------------------------------------------------------------------------------- /pgfincore.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PgFincore 3 | * This project let you see and mainpulate objects in the FS page cache 4 | * Copyright (C) 2009-2011 Cédric Villemain 5 | */ 6 | 7 | /* POSIX stuff */ 8 | #define _XOPEN_SOURCE 600 /* fadvise */ 9 | 10 | #include /* fadvise */ 11 | #include /* exit, calloc, free */ 12 | #include /* stat, fstat */ 13 | #include /* size_t, mincore */ 14 | #include /* mmap, mincore */ 15 | #include /* sysconf, close */ 16 | /* } */ 17 | 18 | /* PostgreSQL stuff */ 19 | #include "postgres.h" /* general Postgres declarations */ 20 | 21 | #include "access/heapam.h" /* relation_open */ 22 | #include "catalog/catalog.h" /* relpath */ 23 | #include "catalog/namespace.h" /* makeRangeVarFromNameList */ 24 | #include "catalog/pg_type.h" /* TEXTOID for tuple_desc */ 25 | #include "funcapi.h" /* SRF */ 26 | #include "utils/builtins.h" /* textToQualifiedNameList */ 27 | #include "utils/rel.h" /* Relation */ 28 | #include "utils/varbit.h" /* bitstring datatype */ 29 | #include "storage/fd.h" 30 | #include "access/htup_details.h" /* heap_form_tuple */ 31 | #include "common/relpath.h" /* relpathbackend */ 32 | 33 | #ifdef PG_VERSION_NUM 34 | #define PG_MAJOR_VERSION (PG_VERSION_NUM / 100) 35 | #else 36 | #error "Unknown postgresql version" 37 | #endif 38 | 39 | #if PG_VERSION_NUM < 90300 40 | #error "Unsupported postgresql version" 41 | #endif 42 | 43 | #ifdef PG_MODULE_MAGIC 44 | PG_MODULE_MAGIC; 45 | #endif 46 | 47 | #define PGSYSCONF_COLS 3 48 | #define PGFADVISE_COLS 4 49 | #define PGFADVISE_LOADER_COLS 5 50 | #define PGFINCORE_COLS 10 51 | 52 | #define PGF_WILLNEED 10 53 | #define PGF_DONTNEED 20 54 | #define PGF_NORMAL 30 55 | #define PGF_SEQUENTIAL 40 56 | #define PGF_RANDOM 50 57 | 58 | #define FINCORE_PRESENT 0x1 59 | #define FINCORE_DIRTY 0x2 60 | #ifndef HAVE_FINCORE 61 | #define FINCORE_BITS 1 62 | #else 63 | #define FINCORE_BITS 2 64 | #endif 65 | /* 66 | * pgfadvise_fctx structure is needed 67 | * to keep track of relation path, segment number, ... 68 | */ 69 | typedef struct 70 | { 71 | int advice; /* the posix_fadvise advice */ 72 | TupleDesc tupd; /* the tuple descriptor */ 73 | Relation rel; /* the relation */ 74 | unsigned int segcount; /* the segment current number */ 75 | char *relationpath; /* the relation path */ 76 | } pgfadvise_fctx; 77 | 78 | /* 79 | * pgfadvise structure is needed 80 | * to return values 81 | */ 82 | typedef struct 83 | { 84 | size_t pageSize; /* os page size */ 85 | size_t pagesFree; /* free page cache */ 86 | size_t filesize; /* the filesize */ 87 | } pgfadviseStruct; 88 | 89 | /* 90 | * pgfloader structure is needed 91 | * to return values 92 | */ 93 | typedef struct 94 | { 95 | size_t pageSize; /* os page size */ 96 | size_t pagesFree; /* free page cache */ 97 | size_t pagesLoaded; /* pages loaded */ 98 | size_t pagesUnloaded; /* pages unloaded */ 99 | } pgfloaderStruct; 100 | 101 | /* 102 | * pgfincore_fctx structure is needed 103 | * to keep track of relation path, segment number, ... 104 | */ 105 | typedef struct 106 | { 107 | bool getvector; /* output varbit data ? */ 108 | TupleDesc tupd; /* the tuple descriptor */ 109 | Relation rel; /* the relation */ 110 | unsigned int segcount; /* the segment current number */ 111 | char *relationpath; /* the relation path */ 112 | } pgfincore_fctx; 113 | 114 | /* 115 | * pgfadvise_loader_struct structure is needed 116 | * to keep track of relation path, segment number, ... 117 | */ 118 | typedef struct 119 | { 120 | size_t pageSize; /* os page size */ 121 | size_t pagesFree; /* free page cache */ 122 | size_t rel_os_pages; 123 | size_t pages_mem; 124 | size_t group_mem; 125 | size_t pages_dirty; 126 | size_t group_dirty; 127 | VarBit *databit; 128 | } pgfincoreStruct; 129 | 130 | Datum pgsysconf(PG_FUNCTION_ARGS); 131 | 132 | Datum pgfadvise(PG_FUNCTION_ARGS); 133 | static int pgfadvise_file(char *filename, int advice, pgfadviseStruct *pgfdv); 134 | 135 | Datum pgfadvise_loader(PG_FUNCTION_ARGS); 136 | static int pgfadvise_loader_file(char *filename, 137 | bool willneed, bool dontneed, 138 | VarBit *databit, 139 | pgfloaderStruct *pgfloader); 140 | 141 | Datum pgfincore(PG_FUNCTION_ARGS); 142 | static int pgfincore_file(char *filename, pgfincoreStruct *pgfncr); 143 | 144 | Datum pgfincore_drawer(PG_FUNCTION_ARGS); 145 | 146 | #if PG_MAJOR_VERSION < 1600 147 | #define relpathpg(rel, forkName) \ 148 | relpathbackend((rel)->rd_node, (rel)->rd_backend, (forkname_to_number(text_to_cstring(forkName)))) 149 | #else 150 | #define relpathpg(rel, forkName) \ 151 | relpathbackend((rel)->rd_locator, (rel)->rd_backend, (forkname_to_number(text_to_cstring(forkName)))) 152 | #endif 153 | 154 | /* 155 | * pgsysconf 156 | * just output the actual system value for 157 | * _SC_PAGESIZE --> Page Size 158 | * _SC_AVPHYS_PAGES --> Free page in memory 159 | * _SC_PHYS_PAGES --> Total memory 160 | * 161 | */ 162 | PG_FUNCTION_INFO_V1(pgsysconf); 163 | Datum 164 | pgsysconf(PG_FUNCTION_ARGS) 165 | { 166 | HeapTuple tuple; 167 | TupleDesc tupdesc; 168 | Datum values[PGSYSCONF_COLS]; 169 | bool nulls[PGSYSCONF_COLS]; 170 | 171 | /* initialize nulls array to build the tuple */ 172 | memset(nulls, 0, sizeof(nulls)); 173 | 174 | /* Build a tuple descriptor for our result type */ 175 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) 176 | elog(ERROR, "pgsysconf: return type must be a row type"); 177 | 178 | /* Page size */ 179 | values[0] = Int64GetDatum(sysconf(_SC_PAGESIZE)); 180 | 181 | /* free page in memory */ 182 | values[1] = Int64GetDatum(sysconf(_SC_AVPHYS_PAGES)); 183 | 184 | /* total memory */ 185 | values[2] = Int64GetDatum(sysconf(_SC_PHYS_PAGES)); 186 | 187 | /* Build and return the result tuple. */ 188 | tuple = heap_form_tuple(tupdesc, values, nulls); 189 | PG_RETURN_DATUM( HeapTupleGetDatum(tuple) ); 190 | } 191 | 192 | #if defined(USE_POSIX_FADVISE) 193 | /* 194 | * pgfadvise_file 195 | */ 196 | static int 197 | pgfadvise_file(char *filename, int advice, pgfadviseStruct *pgfdv) 198 | { 199 | /* 200 | * We use the AllocateFile(2) provided by PostgreSQL. We're going to 201 | * close it ourselves even if PostgreSQL close it anyway at transaction 202 | * end. 203 | */ 204 | FILE *fp; 205 | int fd; 206 | struct stat st; 207 | int adviceFlag; 208 | 209 | /* 210 | * OS Page size and Free pages 211 | */ 212 | pgfdv->pageSize = sysconf(_SC_PAGESIZE); 213 | 214 | /* 215 | * Fopen and fstat file 216 | * fd will be provided to posix_fadvise 217 | * if there is no file, just return 1, it is expected to leave the SRF 218 | */ 219 | fp = AllocateFile(filename, "rb"); 220 | if (fp == NULL) 221 | return 1; 222 | 223 | fd = fileno(fp); 224 | if (fstat(fd, &st) == -1) 225 | { 226 | FreeFile(fp); 227 | elog(ERROR, "pgfadvise: Can not stat object file : %s", filename); 228 | return 2; 229 | } 230 | 231 | /* 232 | * the file size is used in the SRF to output the number of pages used by 233 | * the segment 234 | */ 235 | pgfdv->filesize = st.st_size; 236 | elog(DEBUG1, "pgfadvise: working on %s of %lld bytes", 237 | filename, (long long int) pgfdv->filesize); 238 | 239 | /* FADVISE_WILLNEED */ 240 | if (advice == PGF_WILLNEED) 241 | { 242 | adviceFlag = POSIX_FADV_WILLNEED; 243 | elog(DEBUG1, "pgfadvise: setting advice POSIX_FADV_WILLNEED"); 244 | } 245 | /* FADVISE_DONTNEED */ 246 | else if (advice == PGF_DONTNEED) 247 | { 248 | adviceFlag = POSIX_FADV_DONTNEED; 249 | elog(DEBUG1, "pgfadvise: setting advice POSIX_FADV_DONTNEED"); 250 | 251 | } 252 | /* POSIX_FADV_NORMAL */ 253 | else if (advice == PGF_NORMAL) 254 | { 255 | adviceFlag = POSIX_FADV_NORMAL; 256 | elog(DEBUG1, "pgfadvise: setting advice POSIX_FADV_NORMAL"); 257 | 258 | } 259 | /* POSIX_FADV_SEQUENTIAL */ 260 | else if (advice == PGF_SEQUENTIAL) 261 | { 262 | adviceFlag = POSIX_FADV_SEQUENTIAL; 263 | elog(DEBUG1, "pgfadvise: setting advice POSIX_FADV_SEQUENTIAL"); 264 | 265 | } 266 | /* POSIX_FADV_RANDOM */ 267 | else if (advice == PGF_RANDOM) 268 | { 269 | adviceFlag = POSIX_FADV_RANDOM; 270 | elog(DEBUG1, "pgfadvise: setting advice POSIX_FADV_RANDOM"); 271 | 272 | } 273 | else 274 | { 275 | elog(ERROR, "pgfadvise: invalid advice: %d", advice); 276 | return 2; 277 | } 278 | 279 | /* 280 | * Call posix_fadvise with the relevant advice on the file descriptor 281 | */ 282 | posix_fadvise(fd, 0, 0, adviceFlag); 283 | 284 | /* close the file */ 285 | FreeFile(fp); 286 | 287 | /* 288 | * OS things : Pages free 289 | */ 290 | pgfdv->pagesFree = sysconf(_SC_AVPHYS_PAGES); 291 | 292 | return 0; 293 | } 294 | #else 295 | static int 296 | pgfadvise_file(char *filename, int advice, pgfadviseStruct *pgfdv) 297 | { 298 | elog(ERROR, "POSIX_FADVISE UNSUPPORTED on your platform"); 299 | return 9; 300 | } 301 | #endif 302 | 303 | /* 304 | * pgfadvise is a function that handle the process to have a sharelock 305 | * on the relation and to walk the segments. 306 | * for each segment it call the posix_fadvise with the required flag 307 | * parameter 308 | */ 309 | PG_FUNCTION_INFO_V1(pgfadvise); 310 | Datum 311 | pgfadvise(PG_FUNCTION_ARGS) 312 | { 313 | /* SRF Stuff */ 314 | FuncCallContext *funcctx; 315 | pgfadvise_fctx *fctx; 316 | 317 | /* our structure use to return values */ 318 | pgfadviseStruct *pgfdv; 319 | 320 | /* our return value, 0 for success */ 321 | int result; 322 | 323 | /* The file we are working on */ 324 | char filename[MAXPGPATH]; 325 | 326 | /* stuff done only on the first call of the function */ 327 | if (SRF_IS_FIRSTCALL()) 328 | { 329 | MemoryContext oldcontext; 330 | 331 | Oid relOid = PG_GETARG_OID(0); 332 | text *forkName = PG_GETARG_TEXT_P(1); 333 | int advice = PG_GETARG_INT32(2); 334 | 335 | /* 336 | * Postgresql stuff to return a tuple 337 | */ 338 | TupleDesc tupdesc; 339 | 340 | /* create a function context for cross-call persistence */ 341 | funcctx = SRF_FIRSTCALL_INIT(); 342 | 343 | /* 344 | * switch to memory context appropriate for multiple function calls 345 | */ 346 | oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); 347 | 348 | /* allocate memory for user context */ 349 | fctx = (pgfadvise_fctx *) palloc(sizeof(pgfadvise_fctx)); 350 | 351 | /* Build a tuple descriptor for our result type */ 352 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) 353 | elog(ERROR, "pgfadvise: return type must be a row type"); 354 | 355 | /* provide the tuple descriptor to the fonction structure */ 356 | fctx->tupd = tupdesc; 357 | 358 | /* open the current relation, accessShareLock */ 359 | // TODO use try_relation_open instead ? 360 | fctx->rel = relation_open(relOid, AccessShareLock); 361 | 362 | /* we get the common part of the filename of each segment of a relation */ 363 | fctx->relationpath = relpathpg(fctx->rel, forkName); 364 | 365 | /* Here we keep track of current action in all calls */ 366 | fctx->advice = advice; 367 | 368 | /* segcount is used to get the next segment of the current relation */ 369 | fctx->segcount = 0; 370 | 371 | /* And finally we keep track of our initialization */ 372 | elog(DEBUG1, "pgfadvise: init done for %s, in fork %s", 373 | fctx->relationpath, text_to_cstring(forkName)); 374 | funcctx->user_fctx = fctx; 375 | MemoryContextSwitchTo(oldcontext); 376 | } 377 | 378 | /* After the first call, we recover our context */ 379 | funcctx = SRF_PERCALL_SETUP(); 380 | fctx = funcctx->user_fctx; 381 | 382 | /* 383 | * If we are still looking the first segment 384 | * relationpath should not be suffixed 385 | */ 386 | if (fctx->segcount == 0) 387 | snprintf(filename, 388 | MAXPGPATH, 389 | "%s", 390 | fctx->relationpath); 391 | else 392 | snprintf(filename, 393 | MAXPGPATH, 394 | "%s.%u", 395 | fctx->relationpath, 396 | fctx->segcount); 397 | 398 | elog(DEBUG1, "pgfadvise: about to work with %s, current advice : %d", 399 | filename, fctx->advice); 400 | 401 | /* 402 | * Call posix_fadvise with the advice, returning the structure 403 | */ 404 | pgfdv = (pgfadviseStruct *) palloc(sizeof(pgfadviseStruct)); 405 | result = pgfadvise_file(filename, fctx->advice, pgfdv); 406 | 407 | /* 408 | * When we have work with all segments of the current relation 409 | * We exit from the SRF 410 | * Else we build and return the tuple for this segment 411 | */ 412 | if (result) 413 | { 414 | elog(DEBUG1, "pgfadvise: closing %s", fctx->relationpath); 415 | relation_close(fctx->rel, AccessShareLock); 416 | pfree(fctx); 417 | SRF_RETURN_DONE(funcctx); 418 | } 419 | else { 420 | /* 421 | * Postgresql stuff to return a tuple 422 | */ 423 | HeapTuple tuple; 424 | Datum values[PGFADVISE_COLS]; 425 | bool nulls[PGFADVISE_COLS]; 426 | 427 | /* initialize nulls array to build the tuple */ 428 | memset(nulls, 0, sizeof(nulls)); 429 | 430 | /* prepare the number of the next segment */ 431 | fctx->segcount++; 432 | 433 | /* Filename */ 434 | values[0] = CStringGetTextDatum( filename ); 435 | /* os page size */ 436 | values[1] = Int64GetDatum( (int64) pgfdv->pageSize ); 437 | /* number of pages used by segment */ 438 | values[2] = Int64GetDatum( (int64) ((pgfdv->filesize+pgfdv->pageSize-1)/pgfdv->pageSize) ); 439 | /* free page cache */ 440 | values[3] = Int64GetDatum( (int64) pgfdv->pagesFree ); 441 | /* Build the result tuple. */ 442 | tuple = heap_form_tuple(fctx->tupd, values, nulls); 443 | 444 | /* Ok, return results, and go for next call */ 445 | SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); 446 | } 447 | } 448 | 449 | #if defined(USE_POSIX_FADVISE) 450 | /* 451 | * pgfadvise_file 452 | */ 453 | static int 454 | pgfadvise_loader_file(char *filename, 455 | bool willneed, bool dontneed, VarBit *databit, 456 | pgfloaderStruct *pgfloader) 457 | { 458 | bits8 *sp; 459 | int bitlen; 460 | bits8 x; 461 | int i, k; 462 | 463 | /* 464 | * We use the AllocateFile(2) provided by PostgreSQL. We're going to 465 | * close it ourselves even if PostgreSQL close it anyway at transaction 466 | * end. 467 | */ 468 | FILE *fp; 469 | int fd; 470 | struct stat st; 471 | 472 | /* 473 | * OS things : Page size 474 | */ 475 | pgfloader->pageSize = sysconf(_SC_PAGESIZE); 476 | 477 | /* 478 | * we count the action we perform 479 | * both are theorical : we don't know if the page was or not in memory 480 | * when we call posix_fadvise 481 | */ 482 | pgfloader->pagesLoaded = 0; 483 | pgfloader->pagesUnloaded = 0; 484 | 485 | /* 486 | * Fopen and fstat file 487 | * fd will be provided to posix_fadvise 488 | * if there is no file, just return 1, it is expected to leave the SRF 489 | */ 490 | fp = AllocateFile(filename, "rb"); 491 | if (fp == NULL) 492 | return 1; 493 | 494 | fd = fileno(fp); 495 | if (fstat(fd, &st) == -1) 496 | { 497 | FreeFile(fp); 498 | elog(ERROR, "pgfadvise_loader: Can not stat object file: %s", filename); 499 | return 2; 500 | } 501 | 502 | elog(DEBUG1, "pgfadvise_loader: working on %s", filename); 503 | 504 | bitlen = VARBITLEN(databit); 505 | sp = VARBITS(databit); 506 | for (i = 0; i < bitlen - BITS_PER_BYTE; i += BITS_PER_BYTE, sp++) 507 | { 508 | x = *sp; 509 | /* Is this bit set ? */ 510 | for (k = 0; k < BITS_PER_BYTE; k++) 511 | { 512 | if (IS_HIGHBIT_SET(x)) 513 | { 514 | if (willneed) 515 | { 516 | (void) posix_fadvise(fd, 517 | ((i+k) * pgfloader->pageSize), 518 | pgfloader->pageSize, 519 | POSIX_FADV_WILLNEED); 520 | pgfloader->pagesLoaded++; 521 | } 522 | } 523 | else if (dontneed) 524 | { 525 | (void) posix_fadvise(fd, 526 | ((i+k) * pgfloader->pageSize), 527 | pgfloader->pageSize, 528 | POSIX_FADV_DONTNEED); 529 | pgfloader->pagesUnloaded++; 530 | } 531 | 532 | x <<= 1; 533 | } 534 | } 535 | /* 536 | * XXX this copy/paste of code to finnish to walk the bits is not pretty 537 | */ 538 | if (i < bitlen) 539 | { 540 | /* print the last partial byte */ 541 | x = *sp; 542 | for (k = i; k < bitlen; k++) 543 | { 544 | if (IS_HIGHBIT_SET(x)) 545 | { 546 | if (willneed) 547 | { 548 | (void) posix_fadvise(fd, 549 | (k * pgfloader->pageSize), 550 | pgfloader->pageSize, 551 | POSIX_FADV_WILLNEED); 552 | pgfloader->pagesLoaded++; 553 | } 554 | } 555 | else if (dontneed) 556 | { 557 | (void) posix_fadvise(fd, 558 | (k * pgfloader->pageSize), 559 | pgfloader->pageSize, 560 | POSIX_FADV_DONTNEED); 561 | pgfloader->pagesUnloaded++; 562 | } 563 | x <<= 1; 564 | } 565 | } 566 | FreeFile(fp); 567 | 568 | /* 569 | * OS things : Pages free 570 | */ 571 | pgfloader->pagesFree = sysconf(_SC_AVPHYS_PAGES); 572 | 573 | return 0; 574 | } 575 | #else 576 | static int 577 | pgfadvise_loader_file(char *filename, 578 | bool willneed, bool dontneed, VarBit *databit, 579 | pgfloaderStruct *pgfloader) 580 | { 581 | elog(ERROR, "POSIX_FADVISE UNSUPPORTED on your platform"); 582 | return 9; 583 | } 584 | #endif 585 | 586 | /* 587 | * 588 | * pgfadv_loader to handle work with varbit map of buffer cache. 589 | * it is actually used for loading/unloading block to/from buffer cache 590 | * 591 | */ 592 | PG_FUNCTION_INFO_V1(pgfadvise_loader); 593 | Datum 594 | pgfadvise_loader(PG_FUNCTION_ARGS) 595 | { 596 | Oid relOid = PG_GETARG_OID(0); 597 | text *forkName = PG_GETARG_TEXT_P(1); 598 | int segmentNumber = PG_GETARG_INT32(2); 599 | bool willneed = PG_GETARG_BOOL(3); 600 | bool dontneed = PG_GETARG_BOOL(4); 601 | VarBit *databit; 602 | 603 | /* our structure use to return values */ 604 | pgfloaderStruct *pgfloader; 605 | 606 | Relation rel; 607 | char *relationpath; 608 | char filename[MAXPGPATH]; 609 | 610 | /* our return value, 0 for success */ 611 | int result; 612 | 613 | /* 614 | * Postgresql stuff to return a tuple 615 | */ 616 | HeapTuple tuple; 617 | TupleDesc tupdesc; 618 | Datum values[PGFADVISE_LOADER_COLS]; 619 | bool nulls[PGFADVISE_LOADER_COLS]; 620 | 621 | if (PG_ARGISNULL(5)) 622 | elog(ERROR, "pgfadvise_loader: databit argument shouldn't be NULL"); 623 | 624 | databit = PG_GETARG_VARBIT_P(5); 625 | 626 | /* initialize nulls array to build the tuple */ 627 | memset(nulls, 0, sizeof(nulls)); 628 | 629 | /* Build a tuple descriptor for our result type */ 630 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) 631 | elog(ERROR, "return type must be a row type"); 632 | 633 | /* open the current relation in accessShareLock */ 634 | rel = relation_open(relOid, AccessShareLock); 635 | 636 | /* we get the common part of the filename of each segment of a relation */ 637 | relationpath = relpathpg(rel, forkName); 638 | 639 | /* 640 | * If we are looking the first segment, 641 | * relationpath should not be suffixed 642 | */ 643 | if (segmentNumber == 0) 644 | snprintf(filename, 645 | MAXPGPATH, 646 | "%s", 647 | relationpath); 648 | else 649 | snprintf(filename, 650 | MAXPGPATH, 651 | "%s.%u", 652 | relationpath, 653 | (int) segmentNumber); 654 | 655 | /* 656 | * We don't need the relation anymore 657 | * the only purpose was to get a consistent filename 658 | * (if file disappear, an error is logged) 659 | */ 660 | relation_close(rel, AccessShareLock); 661 | 662 | /* 663 | * Call pgfadvise_loader with the varbit 664 | */ 665 | pgfloader = (pgfloaderStruct *) palloc(sizeof(pgfloaderStruct)); 666 | result = pgfadvise_loader_file(filename, 667 | willneed, dontneed, databit, 668 | pgfloader); 669 | if (result != 0) 670 | elog(ERROR, "Can't read file %s, fork(%s)", 671 | filename, text_to_cstring(forkName)); 672 | /* Filename */ 673 | values[0] = CStringGetTextDatum( filename ); 674 | /* os page size */ 675 | values[1] = Int64GetDatum( pgfloader->pageSize ); 676 | /* free page cache */ 677 | values[2] = Int64GetDatum( pgfloader->pagesFree ); 678 | /* pages loaded */ 679 | values[3] = Int64GetDatum( pgfloader->pagesLoaded ); 680 | /* pages unloaded */ 681 | values[4] = Int64GetDatum( pgfloader->pagesUnloaded ); 682 | 683 | /* Build and return the result tuple. */ 684 | tuple = heap_form_tuple(tupdesc, values, nulls); 685 | PG_RETURN_DATUM( HeapTupleGetDatum(tuple) ); 686 | } 687 | 688 | /* 689 | * pgfincore_file handle the mmaping, mincore process (and access file, etc.) 690 | */ 691 | static int 692 | pgfincore_file(char *filename, pgfincoreStruct *pgfncr) 693 | { 694 | int flag=1; 695 | int flag_dirty=1; 696 | 697 | int len, bitlen; 698 | bits8 *r; 699 | bits8 x = 0; 700 | register int64 pageIndex; 701 | 702 | 703 | /* 704 | * We use the AllocateFile(2) provided by PostgreSQL. We're going to 705 | * close it ourselves even if PostgreSQL close it anyway at transaction 706 | * end. 707 | */ 708 | FILE *fp; 709 | int fd; 710 | struct stat st; 711 | 712 | #ifndef HAVE_FINCORE 713 | void *pa = (char *) 0; 714 | #endif 715 | unsigned char *vec = (unsigned char *) 0; 716 | 717 | /* 718 | * OS Page size 719 | */ 720 | pgfncr->pageSize = sysconf(_SC_PAGESIZE); 721 | 722 | /* 723 | * Initialize counters 724 | */ 725 | pgfncr->pages_mem = 0; 726 | pgfncr->group_mem = 0; 727 | pgfncr->pages_dirty = 0; 728 | pgfncr->group_dirty = 0; 729 | pgfncr->rel_os_pages = 0; 730 | 731 | /* 732 | * Fopen and fstat file 733 | * fd will be provided to posix_fadvise 734 | * if there is no file, just return 1, it is expected to leave the SRF 735 | */ 736 | fp = AllocateFile(filename, "rb"); 737 | if (fp == NULL) 738 | return 1; 739 | 740 | fd = fileno(fp); 741 | 742 | if (fstat(fd, &st) == -1) 743 | { 744 | FreeFile(fp); 745 | elog(ERROR, "Can not stat object file : %s", 746 | filename); 747 | return 2; 748 | } 749 | 750 | /* 751 | * if file ok 752 | * then process 753 | */ 754 | if (st.st_size != 0) 755 | { 756 | /* number of pages in the current file */ 757 | pgfncr->rel_os_pages = (st.st_size+pgfncr->pageSize-1)/pgfncr->pageSize; 758 | 759 | #ifndef HAVE_FINCORE 760 | pa = mmap(NULL, st.st_size, PROT_NONE, MAP_SHARED, fd, 0); 761 | if (pa == MAP_FAILED) 762 | { 763 | int save_errno = errno; 764 | FreeFile(fp); 765 | elog(ERROR, "Can not mmap object file : %s, errno = %i,%s\nThis error can happen if there is not enought space in memory to do the projection. Please mail cedric@villemain.org with '[pgfincore] ENOMEM' as subject.", 766 | filename, save_errno, strerror(save_errno)); 767 | return 3; 768 | } 769 | #endif 770 | 771 | /* Prepare our vector containing all blocks information */ 772 | vec = calloc(1, (st.st_size+pgfncr->pageSize-1)/pgfncr->pageSize); 773 | if ((void *)0 == vec) 774 | { 775 | #ifndef HAVE_FINCORE 776 | munmap(pa, st.st_size); 777 | #endif 778 | FreeFile(fp); 779 | elog(ERROR, "Can not calloc object file : %s", 780 | filename); 781 | return 4; 782 | } 783 | 784 | #ifndef HAVE_FINCORE 785 | /* Affect vec with mincore */ 786 | if (mincore(pa, st.st_size, vec) != 0) 787 | { 788 | int save_errno = errno; 789 | munmap(pa, st.st_size); 790 | elog(ERROR, "mincore(%p, %lld, %p): %s\n", 791 | pa, (long long int)st.st_size, vec, strerror(save_errno)); 792 | #else 793 | /* Affect vec with fincore */ 794 | if (fincore(fd, 0, st.st_size, vec) != 0) 795 | { 796 | int save_errno = errno; 797 | elog(ERROR, "fincore(%u, 0, %lld, %p): %s\n", 798 | fd, (long long int)st.st_size, vec, strerror(save_errno)); 799 | #endif 800 | free(vec); 801 | FreeFile(fp); 802 | return 5; 803 | } 804 | 805 | /* 806 | * prepare the bit string 807 | */ 808 | bitlen = FINCORE_BITS * ((st.st_size+pgfncr->pageSize-1)/pgfncr->pageSize); 809 | len = VARBITTOTALLEN(bitlen); 810 | /* 811 | * set to 0 so that *r is always initialised and string is zero-padded 812 | * XXX: do we need to free that ? 813 | */ 814 | pgfncr->databit = (VarBit *) palloc0(len); 815 | SET_VARSIZE(pgfncr->databit, len); 816 | VARBITLEN(pgfncr->databit) = bitlen; 817 | 818 | r = VARBITS(pgfncr->databit); 819 | x = HIGHBIT; 820 | 821 | /* handle the results */ 822 | for (pageIndex = 0; pageIndex <= pgfncr->rel_os_pages; pageIndex++) 823 | { 824 | // block in memory 825 | if (vec[pageIndex] & FINCORE_PRESENT) 826 | { 827 | pgfncr->pages_mem++; 828 | *r |= x; 829 | if (FINCORE_BITS > 1) 830 | { 831 | if (vec[pageIndex] & FINCORE_DIRTY) 832 | { 833 | pgfncr->pages_dirty++; 834 | *r |= (x >> 1); 835 | /* we flag to detect contigous blocks in the same state */ 836 | if (flag_dirty) 837 | pgfncr->group_dirty++; 838 | flag_dirty = 0; 839 | } 840 | else 841 | flag_dirty = 1; 842 | } 843 | elog (DEBUG5, "in memory blocks : %lld / %lld", 844 | (long long int) pageIndex, (long long int) pgfncr->rel_os_pages); 845 | 846 | /* we flag to detect contigous blocks in the same state */ 847 | if (flag) 848 | pgfncr->group_mem++; 849 | flag = 0; 850 | } 851 | else 852 | flag=1; 853 | 854 | 855 | x >>= FINCORE_BITS; 856 | if (x == 0) 857 | { 858 | x = HIGHBIT; 859 | r++; 860 | } 861 | } 862 | } 863 | elog(DEBUG1, "pgfincore %s: %lld of %lld block in linux cache, %lld groups", 864 | filename, (long long int) pgfncr->pages_mem, (long long int) pgfncr->rel_os_pages, (long long int) pgfncr->group_mem); 865 | 866 | /* 867 | * free and close 868 | */ 869 | free(vec); 870 | #ifndef HAVE_FINCORE 871 | munmap(pa, st.st_size); 872 | #endif 873 | FreeFile(fp); 874 | 875 | /* 876 | * OS things : Pages free 877 | */ 878 | pgfncr->pagesFree = sysconf(_SC_AVPHYS_PAGES); 879 | 880 | return 0; 881 | } 882 | 883 | /* 884 | * pgfincore is a function that handle the process to have a sharelock 885 | * on the relation and to walk the segments. 886 | * for each segment it call the appropriate function depending on 'action' 887 | * parameter 888 | */ 889 | PG_FUNCTION_INFO_V1(pgfincore); 890 | Datum 891 | pgfincore(PG_FUNCTION_ARGS) 892 | { 893 | /* SRF Stuff */ 894 | FuncCallContext *funcctx; 895 | pgfincore_fctx *fctx; 896 | 897 | /* our structure use to return values */ 898 | pgfincoreStruct *pgfncr; 899 | 900 | /* our return value, 0 for success */ 901 | int result; 902 | 903 | /* The file we are working on */ 904 | char filename[MAXPGPATH]; 905 | 906 | /* stuff done only on the first call of the function */ 907 | if (SRF_IS_FIRSTCALL()) 908 | { 909 | MemoryContext oldcontext; 910 | 911 | Oid relOid = PG_GETARG_OID(0); 912 | text *forkName = PG_GETARG_TEXT_P(1); 913 | bool getvector = PG_GETARG_BOOL(2); 914 | 915 | /* 916 | * Postgresql stuff to return a tuple 917 | */ 918 | TupleDesc tupdesc; 919 | 920 | /* create a function context for cross-call persistence */ 921 | funcctx = SRF_FIRSTCALL_INIT(); 922 | 923 | /* 924 | * switch to memory context appropriate for multiple function calls 925 | */ 926 | oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); 927 | 928 | /* allocate memory for user context */ 929 | fctx = (pgfincore_fctx *) palloc(sizeof(pgfincore_fctx)); 930 | 931 | /* Build a tuple descriptor for our result type */ 932 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) 933 | elog(ERROR, "pgfadvise: return type must be a row type"); 934 | 935 | /* provide the tuple descriptor to the fonction structure */ 936 | fctx->tupd = tupdesc; 937 | 938 | /* are we going to grab and output the varbit data (can be large) */ 939 | fctx->getvector = getvector; 940 | 941 | /* open the current relation, accessShareLock */ 942 | // TODO use try_relation_open instead ? 943 | fctx->rel = relation_open(relOid, AccessShareLock); 944 | 945 | /* we get the common part of the filename of each segment of a relation */ 946 | fctx->relationpath = relpathpg(fctx->rel, forkName); 947 | 948 | /* segcount is used to get the next segment of the current relation */ 949 | fctx->segcount = 0; 950 | 951 | /* And finally we keep track of our initialization */ 952 | elog(DEBUG1, "pgfincore: init done for %s, in fork %s", 953 | fctx->relationpath, text_to_cstring(forkName)); 954 | funcctx->user_fctx = fctx; 955 | MemoryContextSwitchTo(oldcontext); 956 | } 957 | 958 | /* After the first call, we recover our context */ 959 | funcctx = SRF_PERCALL_SETUP(); 960 | fctx = funcctx->user_fctx; 961 | 962 | /* 963 | * If we are still looking the first segment 964 | * relationpath should not be suffixed 965 | */ 966 | if (fctx->segcount == 0) 967 | snprintf(filename, 968 | MAXPGPATH, 969 | "%s", 970 | fctx->relationpath); 971 | else 972 | snprintf(filename, 973 | MAXPGPATH, 974 | "%s.%u", 975 | fctx->relationpath, 976 | fctx->segcount); 977 | 978 | elog(DEBUG1, "pgfincore: about to work with %s", filename); 979 | 980 | /* 981 | * Call pgfincore with the advice, returning the structure 982 | */ 983 | pgfncr = (pgfincoreStruct *) palloc(sizeof(pgfincoreStruct)); 984 | result = pgfincore_file(filename, pgfncr); 985 | 986 | /* 987 | * When we have work with all segment of the current relation, test success 988 | * We exit from the SRF 989 | */ 990 | if (result) 991 | { 992 | elog(DEBUG1, "pgfincore: closing %s", fctx->relationpath); 993 | relation_close(fctx->rel, AccessShareLock); 994 | pfree(fctx); 995 | SRF_RETURN_DONE(funcctx); 996 | } 997 | else 998 | { 999 | /* 1000 | * Postgresql stuff to return a tuple 1001 | */ 1002 | HeapTuple tuple; 1003 | Datum values[PGFINCORE_COLS]; 1004 | bool nulls[PGFINCORE_COLS]; 1005 | 1006 | /* initialize nulls array to build the tuple */ 1007 | memset(nulls, 0, sizeof(nulls)); 1008 | 1009 | /* Filename */ 1010 | values[0] = CStringGetTextDatum(filename); 1011 | /* Segment Number */ 1012 | values[1] = Int32GetDatum(fctx->segcount); 1013 | /* os page size */ 1014 | values[2] = Int64GetDatum(pgfncr->pageSize); 1015 | /* number of pages used by segment */ 1016 | values[3] = Int64GetDatum(pgfncr->rel_os_pages); 1017 | /* number of pages in OS cache */ 1018 | values[4] = Int64GetDatum(pgfncr->pages_mem); 1019 | /* number of group of contigous page in os cache */ 1020 | values[5] = Int64GetDatum(pgfncr->group_mem); 1021 | /* free page cache */ 1022 | values[6] = Int64GetDatum(pgfncr->pagesFree); 1023 | /* the map of the file with bit set for in os cache page */ 1024 | if (fctx->getvector && pgfncr->rel_os_pages) 1025 | { 1026 | values[7] = VarBitPGetDatum(pgfncr->databit); 1027 | } 1028 | else 1029 | { 1030 | nulls[7] = true; 1031 | values[7] = (Datum) NULL; 1032 | } 1033 | /* number of pages dirty in OS cache */ 1034 | values[8] = Int64GetDatum(pgfncr->pages_dirty); 1035 | /* number of group of contigous dirty pages in os cache */ 1036 | values[9] = Int64GetDatum(pgfncr->group_dirty); 1037 | /* Build the result tuple. */ 1038 | tuple = heap_form_tuple(fctx->tupd, values, nulls); 1039 | 1040 | /* prepare the number of the next segment */ 1041 | fctx->segcount++; 1042 | 1043 | /* Ok, return results, and go for next call */ 1044 | SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); 1045 | } 1046 | } 1047 | 1048 | /* 1049 | * pgfincore_drawer A very naive renderer. (for testing) 1050 | */ 1051 | PG_FUNCTION_INFO_V1(pgfincore_drawer); 1052 | Datum 1053 | pgfincore_drawer(PG_FUNCTION_ARGS) 1054 | { 1055 | char *result, 1056 | *r; 1057 | int len,i,k; 1058 | VarBit *databit; 1059 | bits8 *sp; 1060 | bits8 x; 1061 | 1062 | if (PG_ARGISNULL(0)) 1063 | elog(ERROR, "pgfincore_drawer: databit argument shouldn't be NULL"); 1064 | 1065 | databit = PG_GETARG_VARBIT_P(0); 1066 | 1067 | len = VARBITLEN(databit); 1068 | result = (char *) palloc((len/FINCORE_BITS) + 1); 1069 | sp = VARBITS(databit); 1070 | r = result; 1071 | 1072 | for (i = 0; i <= len - BITS_PER_BYTE; i += BITS_PER_BYTE, sp++) 1073 | { 1074 | x = *sp; 1075 | /* Is this bit set ? */ 1076 | for (k = 0; k < (BITS_PER_BYTE/FINCORE_BITS); k++) 1077 | { 1078 | char out = ' '; 1079 | if (IS_HIGHBIT_SET(x)) 1080 | out = '.' ; 1081 | x <<= 1; 1082 | if (FINCORE_BITS > 1) 1083 | { 1084 | if (IS_HIGHBIT_SET(x)) 1085 | out = '*'; 1086 | x <<= 1; 1087 | } 1088 | *r++ = out; 1089 | } 1090 | } 1091 | if (i < len) 1092 | { 1093 | /* print the last partial byte */ 1094 | x = *sp; 1095 | for (k = i; k < (len/FINCORE_BITS); k++) 1096 | { 1097 | char out = ' '; 1098 | if (IS_HIGHBIT_SET(x)) 1099 | out = '.' ; 1100 | x <<= 1; 1101 | if (FINCORE_BITS > 1) 1102 | { 1103 | if (IS_HIGHBIT_SET(x)) 1104 | out = '*'; 1105 | x <<= 1; 1106 | } 1107 | *r++ = out; 1108 | } 1109 | } 1110 | 1111 | *r = '\0'; 1112 | PG_RETURN_CSTRING(result); 1113 | } 1114 | -------------------------------------------------------------------------------- /pgfincore.control: -------------------------------------------------------------------------------- 1 | # pgfincore extension 2 | comment = 'examine and manage the os buffer cache' 3 | default_version = '1.3.1' 4 | module_pathname = '$libdir/pgfincore' 5 | directory = pgfincore 6 | relocatable = true 7 | -------------------------------------------------------------------------------- /sql/pgfincore.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION pgfincore; 2 | 3 | -- 4 | -- test SYSCONF 5 | -- 6 | select from pgsysconf(); 7 | select from pgsysconf_pretty(); 8 | 9 | -- 10 | -- make a temp table to use below 11 | -- 12 | CREATE TEMP TABLE test AS SELECT generate_series(1,256) as a; 13 | 14 | -- 15 | -- this is not perfect testing but it is hard to predict what the OS will do 16 | -- for *sure* 17 | -- 18 | 19 | -- 20 | -- test fadvise_loader 21 | -- 22 | select from pgfadvise_loader('test', 0, true, true, B'1010'); 23 | select from pgfadvise_loader('test', 0, true, false, B'1010'); 24 | select from pgfadvise_loader('test', 0, false, true, B'1010'); 25 | select from pgfadvise_loader('test', 0, false, false, B'1010'); 26 | -- must not fail on empty databit input 27 | select from pgfadvise_loader('test', 0, false, false, B''); 28 | -- ERROR on NULL databit input 29 | select from pgfadvise_loader('test', 0, false, false, NULL); 30 | 31 | -- 32 | -- test pgfincore 33 | -- 34 | select from pgfincore('test', true); 35 | select from pgfincore('test'); 36 | 37 | -- 38 | -- test DONTNEED, WILLNEED 39 | -- 40 | select from pgfadvise_willneed('test'); 41 | select from pgfadvise_dontneed('test'); 42 | 43 | -- 44 | -- test PGFADVISE flags 45 | -- 46 | select from pgfadvise_sequential('test'); 47 | select from pgfadvise_random('test'); 48 | select from pgfadvise_normal('test'); 49 | 50 | -- 51 | -- tests drawers 52 | -- 53 | select NULL || pgfincore_drawer(databit) from pgfincore('test','main',true); 54 | --------------------------------------------------------------------------------