├── .gitignore
├── LICENSE.txt
├── README.md
├── debian
│   ├── changelog
│   ├── compat
│   ├── control
│   ├── postgresql-metrics.install
│   ├── postgresql-metrics.links
│   ├── postgresql-metrics.service
│   ├── postgresql-metrics.triggers
│   └── rules
├── etc
│   └── postgresql-metrics
│       ├── default
│       │   └── postgresql-metrics.yml
│       └── postgresql-metrics.yml
├── postgresql_metrics
│   ├── __init__.py
│   ├── common.py
│   ├── default_metrics.py
│   ├── localhost_postgres_stats.py
│   ├── metrics_gatherer.py
│   ├── metrics_logic.py
│   ├── postgres_queries.py
│   └── prepare_db.py
├── requirements.txt
└── setup.py
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | build/ 4 | dist/ 5 | *.egg-info/ 6 | .installed.cfg 7 | *.egg 8 | 9 | # Installer logs 10 | pip-log.txt 11 | pip-delete-this-directory.txt 12 | 13 | *.log 14 | 15 | .idea/* 16 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014-2015 Spotify AB 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spotify PostgreSQL Metrics 2 | 3 | Service to extract and provide metrics on your PostgreSQL database. 4 | 5 | This is a command line (CLI) tool that can be called to extract 6 | statistics and create metrics from your PostgreSQL database cluster. 7 | The CLI can also run in a long-running process mode, which will periodically 8 | send the gathered metrics forward.
9 | 10 | The default metrics format is a [Metrics 2.0](http://metrics20.org/) 11 | compatible JSON, which is created by a set of functions listed in 12 | the configuration file. 13 | 14 | The extracted metrics can be printed out as direct output of 15 | the CLI tool, or sent out of the host the postgresql-metrics 16 | process is running on using [FFWD](https://github.com/spotify/ffwd). 17 | 18 | The [FFWD](https://github.com/spotify/ffwd) format is 19 | plain JSON sent over a UDP socket, so any 20 | UDP socket endpoint that understands JSON can consume the metrics. 21 | 22 | 23 | ## Prerequisites 24 | 25 | The versions mentioned below are tested to work, but the code should 26 | work on many unlisted versions as well. Just open an issue or send 27 | a pull request with missing prerequisites, if you test and confirm 28 | that postgresql-metrics works on other versions of the mentioned 29 | technologies. 30 | 31 | * Python 3.5 or later 32 | * PostgreSQL 9.3 or later 33 | * Debian based distribution with systemd (packaging requirement only) 34 | 35 | 36 | ## Building and Installing 37 | 38 | You can build a Debian package by running the following in the project root: 39 | 40 | ``` 41 | dpkg-buildpackage -us -uc -b 42 | ``` 43 | 44 | Notice that postgresql-metrics includes by default a systemd service to 45 | run as a long-running process, pushing metrics to FFWD as they are gathered. 46 | You need to stop the long-running process after installing the package in order to configure it. 47 | 48 | ``` 49 | sudo systemctl stop postgresql-metrics 50 | ``` 51 | 52 | If you want to debug the process, you can run 53 | *sudo journalctl -u postgresql-metrics* to see the systemd service's log. 54 | 55 | ### Edit Configuration 56 | 57 | Edit the configuration in */etc/postgresql-metrics/postgresql-metrics.yml* 58 | and in */etc/postgresql-metrics/default/postgresql-metrics.yml*. Notice that 59 | the configuration in the default folder will be overwritten value by value 60 | from the configuration in the configuration root. 61 | 62 | If you are not interested in using the default configuration overwriting 63 | functionality, just delete one of the configurations mentioned above, and 64 | keep using a single configuration file. 65 | 66 | Edit at least the values under the *postgres* section in the configuration to 67 | match your PostgreSQL cluster setup. Remember also to list the *databases* 68 | you want to gather metrics from. By database in this context we mean 69 | a database name you created within your PostgreSQL cluster. 70 | 71 | ### Prepare Database 72 | 73 | Before starting to extract the defined metrics, you need to set up your 74 | database cluster using the *prepare-db* CLI call. This will create 75 | the required extensions for your database, and a few functions that are 76 | used by the statistics gathering queries from the metrics process. 77 | The configured metrics user will also be granted access to the created 78 | statistics functions and views. 79 | 80 | You need to provide an administrator user to the *prepare-db* call, which 81 | the tool will prompt you for. You don't need to provide credentials if 82 | you are running the *prepare-db* with a local user that is configured to be 83 | trusted locally by the PostgreSQL cluster (in *pg_hba.conf*), and is 84 | a superuser, like the default *postgres* user created by some distribution 85 | packages (e.g. Debian). You can do the prepare-db call e.g.
as follows: 86 | 87 | ``` 88 | sudo su -c "postgresql-metrics prepare-db" postgres 89 | ``` 90 | 91 | It is safe to call the *prepare-db* multiple times for the same database 92 | (the call is idempotent). 93 | 94 | ### Grant Access for Metrics User 95 | 96 | In addition to granting access to the statistics gathering functions and views 97 | within your PostgreSQL cluster (previous step), you also need to add access 98 | for the metrics user in the host-based access file (*pg_hba.conf*). 99 | 100 | Add one line per database you are monitoring at the end of the *pg_hba.conf* 101 | file for your cluster: 102 | 103 | ``` 104 | host my_database_name postgresql_metrics_user 127.0.0.1/32 md5 # metrics user access 105 | ``` 106 | 107 | Replace *my_database_name* and *postgresql_metrics_user* with the values you 108 | configured in the postgresql-metrics configuration in the **Edit Configuration** 109 | step above. 110 | 111 | You need to reload (or restart) your server after editing *pg_hba.conf* for 112 | the changes to take effect. 113 | 114 | ### Getting Metrics 115 | 116 | After you have postgresql-metrics configured and the database prepared, 117 | you can print out all the metrics that will be extracted from your database 118 | by calling: 119 | 120 | ``` 121 | postgresql-metrics all 122 | ``` 123 | 124 | You need to call the command above as a user that has access to the WAL 125 | directory under PostgreSQL, or gathering the WAL file amount metric will fail. 126 | A single failed metric call will not prevent the rest of the gathering process. 127 | 128 | You can also start the long-running process again, if using systemd: 129 | 130 | ``` 131 | sudo systemctl start postgresql-metrics 132 | ``` 133 | 134 | ## Explaining the Gathered Metrics 135 | 136 | This section explains the metrics we gather using this tool. 137 | 138 | Notice that there are many system-specific metrics that you should gather 139 | in addition to the Postgres-specific metrics, for example: 140 | 141 | * CPU usage 142 | * Network usage, sent / received bytes per related network interface 143 | * Memory usage 144 | * Disk I/O operations 145 | * I/O await times 146 | * Disk usage and free space left 147 | 148 | 149 | ### Database Specific Metrics 150 | 151 | Called once per configured database inside your Postgres cluster. 152 | 153 | * **get_stats_disk_usage_for_database**: 154 | This metric shows the size of each of your databases in bytes. 155 | Don't forget to measure the total disk usage of the disk your data 156 | directory resides in as well. 157 | 158 | * **get_stats_tx_rate_for_database**: 159 | Shows the rate of transactions executed per second since the last call 160 | of this function. The rate of executed rollbacks is also shown separately. 161 | 162 | * **get_stats_seconds_since_last_vacuum_per_table**: 163 | This metric shows the amount of time passed in seconds since the last 164 | vacuum was run, per table in your database. 165 | 166 | * **get_stats_oldest_transaction_timestamp**: 167 | This metric shows how long the longest running transaction has been open 168 | in your database. This should usually be close to zero, but sometimes, 169 | for example when an administrator forgets to close a maintenance connection, 170 | you will see this value going up. Long-running transactions are bad for 171 | your database, so fix the issue as soon as you see this metric increase.
172 | 173 | * **get_stats_index_hit_rates**: 174 | This metric shows you the amount of reads hitting the table indexes 175 | versus the amount of reads requiring a sequential scan through the table. 176 | Depending on your table, the amount of data, and the created indexes, 177 | the index hit rate varies. You should understand your data well enough 178 | to know when a high index hit rate is desirable and when it is not. 179 | 180 | * **get_stats_table_bloat**: 181 | This metric shows the amount of wasted space in a database table due 182 | to the MVCC process. Deletes and updates to the table just mark 183 | the obsolete data as free, but do not really delete it. Vacuums do free 184 | some of this wasted space, but to get rid of table bloat entirely you must 185 | re-create the table with VACUUM FULL. 186 | The current implementation of the table bloat metric is rather heavy, so you 187 | might want to disable it in case you see issues with it. 188 | 189 | 190 | ### Database Cluster (Global) Metrics 191 | 192 | Called once per Postgres cluster. 193 | 194 | * **get_stats_client_connections**: 195 | This metric shows the number of connections open to the database 196 | at the moment. The actual metrics gathering connections should be visible 197 | here as well. Notice that having more than a hundred connections open is 198 | usually a bad thing. Consider using a connection pooler, like pgbouncer. 199 | 200 | * **get_stats_lock_statistics**: 201 | This metric shows locks being waited upon by queries, and the number of 202 | locks granted. In general, having any query waiting for locks for an 203 | extended period of time is a sign of problems, such as heavy lock contention. 204 | 205 | * **get_stats_heap_hit_statistics**: 206 | This metric shows the amount of reads hitting the memory buffers on your 207 | cluster, and also the amount of reads hitting the disk (or disk caches on 208 | the operating system). The heap hit ratio is also calculated based on 209 | these values. 210 | 211 | Notice that the read amounts are not actual read queries, but the number 212 | of blocks read. You will get a good idea of the amount of reads hitting your 213 | database when comparing these values with the transaction rate. 214 | 215 | * **get_stats_replication_delays**: 216 | This metric shows, for each slave, the number of bytes of replication delay 217 | behind the master. If the slave and the master are in a synchronous state, 218 | the replication delay is zero. 219 | 220 | * **get_stats_wal_file_amount**: 221 | This metric shows the number of files in your database cluster's WAL 222 | directory (pg_wal or pg_xlog). If the WAL file amount suddenly starts to 223 | increase, you probably have issues with your WAL archiving process, which 224 | might lead to the disk filling up, and your database cluster crashing. 225 | 226 | * **get_xid_remaining_ratio, get_multixact_remaining_ratio, get_multixact_members_remaining_ratio**: 227 | These metrics show the corresponding remaining % of transaction ids ("xid"), multixact ids ("mxid"), 228 | and multixact members that are available for Postgres to use before exhaustion. 229 | Useful for ensuring that vacuuming is working as intended for your Postgres instance. 230 | 231 | * **get_multixact_members_per_mxid**: 232 | This metric emits the number of multixact members there are per multixact ID. A larger number means 233 | that multixact member exhaustion will happen more quickly (as can 234 | be seen in **get_multixact_members_remaining_ratio**).
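
All of the metrics above share the Metrics 2.0 style envelope built by *default_metrics.py*: a type, a key namespace, the measured value, and a set of describing attributes. As an illustrative example (the value here is made up), a client connections metric is sent to FFWD as JSON along these lines:

```
{"type": "metric", "key": "postgresql", "value": 42,
 "attributes": {"what": "client-connections", "unit": "connection"}}
```

Since the transport is plain JSON over UDP, one datagram per metric, a minimal sketch of a debugging consumer is shown below. It assumes the default FFWD host and port from the configuration (127.0.0.1:19000), and that no actual FFWD agent is bound to that port while you run it:

```
import json
import socket

# Bind to the address postgresql-metrics pushes metrics to by default.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.bind(("127.0.0.1", 19000))
while True:
    # Each metric arrives as a single JSON-encoded UDP datagram.
    data, _addr = sock.recvfrom(65535)
    print(json.loads(data.decode("utf-8")))
```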
235 | 236 | 237 | ## Short Overview of Python Modules 238 | 239 | * **common.py**: Contains code common for the whole package, like logging and 240 | configuration parsing. 241 | * **default_metrics.py**: Responsible for turning statistics values into 242 | the default metrics JSON format. 243 | * **localhost_postgres_stats.py**: Functions for statistics extraction from 244 | the local Postgres data directory. These calls are always global from 245 | a database cluster perspective. 246 | * **metrics_gatherer.py**: Functions for calling the statistics extraction 247 | functions and converting the results into the correct metrics format. Called 248 | from metrics_logic.py. 249 | * **metrics_logic.py**: Contains the CLI tool, initialization, and 250 | the scheduling logic. 251 | * **postgres_queries.py**: Functions for statistics extraction from 252 | the Postgres database. 253 | * **prepare_db.py**: Code for preparing your databases for the metrics 254 | gathering process. 255 | 256 | 257 | ## How to Add More Metrics 258 | 259 | If you want to add more metrics into the postgresql-metrics tool, you can do it 260 | by making the following changes to the source: 261 | 262 | 1. If you gather the metric using a Postgres SQL query, add the code into 263 | *postgres_queries.py*, and if you gather the metric by accessing the local 264 | Postgres data directory, add the code into *localhost_postgres_stats.py*. 265 | 266 | 2. Write a function for formatting your gathered metric values into the desired 267 | format, as is done in **default_metrics.py**. You can either expand 268 | the default metrics, or write your own format into another module. 269 | 270 | 3. Write a function into **metrics_gatherer.py**, which will call the metric 271 | extraction functionality you wrote in the first step, and then the metric 272 | value formatting function you wrote in the previous step. 273 | 274 | 4. Add the name of your metrics gatherer function, written in the previous 275 | step, into the *postgresql-metrics* configuration file, with the desired time 276 | interval for calling the metric gathering function. Notice that you need to 277 | add the function into the correct list of functions depending on whether 278 | you gather a metric that covers your whole database cluster, or a metric 279 | that targets a single database in your cluster. Data directory based 280 | metrics must be 'global' metrics. 281 | 282 | 5. Update this README with an explanation of what your new metric is about. 283 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | postgresql-metrics (0.3.3) unstable; urgency=low 2 | 3 | * Support cascade replication in get_stats_replication_delays 4 | 5 | -- Richard Mahlberg Fri, 9 Jul 2021 06:25:00 +0000 6 | 7 | postgresql-metrics (0.3.2) unstable; urgency=low 8 | 9 | * Add metrics for seeing how close the postgres instance is to 10 | xid, mxid and member mxid exhaustion.
11 | - get_xid_remaining_ratio 12 | - get_multixact_remaining_ratio 13 | - get_multixact_members_remaining_ratio 14 | - get_multixact_members_per_mxid 15 | 16 | -- Andreas Scherman Thu, 11 Mar 2021 11:00:00 +0000 17 | 18 | postgresql-metrics (0.3.1) unstable; urgency=low 19 | 20 | * Make `sec-since-oldest-xact-start` metric database-specific 21 | * Fix building issue with psycopg2 22 | 23 | -- Volodymyr Pavlenko Thu, 25 Feb 2021 10:25:00 +0000 24 | 25 | postgresql-metrics (0.3.0) unstable; urgency=low 26 | 27 | * Fix for PostgreSQL versions >= 10 28 | * Migrate to using Python3 29 | 30 | -- Hannu Varjoranta Mon, 28 Jan 2019 11:00:00 +0000 31 | 32 | postgresql-metrics (0.2.6) unstable; urgency=low 33 | 34 | * Wrap replication status metric to function 35 | * Non superuser can access replication status metric 36 | 37 | -- Hannu Varjoranta Fri, 2 Nov 2018 14:04:00 +0000 38 | 39 | postgresql-metrics (0.2.5) unstable; urgency=low 40 | 41 | * Report replication status from WAL receivers 42 | 43 | -- Hannu Varjoranta Mon, 29 Oct 2018 16:32:00 +0000 44 | 45 | postgresql-metrics (0.2.4) unstable; urgency=low 46 | 47 | * Support for PostgreSQL 10 48 | 49 | -- Richard Mahlberg Thu, 16 Nov 2017 09:42:30 +0000 50 | 51 | postgresql-metrics (0.2.3) unstable; urgency=low 52 | 53 | * Fix some metrics conversion issues 54 | 55 | -- Hannu Varjoranta Fri, 19 May 2017 11:28:33 +0000 56 | 57 | postgresql-metrics (0.2.2) unstable; urgency=low 58 | 59 | * fix bug in replication delay metric 60 | * add support for syslog based logging 61 | 62 | -- Hannu Varjoranta Mon, 14 Sep 2015 08:04:11 +0000 63 | 64 | postgresql-metrics (0.2.1) unstable; urgency=low 65 | 66 | * Add support for syslog logging 67 | 68 | -- Hannu Varjoranta Tue, 8 Sep 2015 09:34:58 +0000 69 | 70 | postgresql-metrics (0.2.0) unstable; urgency=low 71 | 72 | * Initial open sourced release 73 | 74 | -- Hannu Varjoranta Fri, 21 Aug 2015 11:05:01 +0000 75 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: postgresql-metrics 2 | Section: non-free/net 3 | Priority: extra 4 | Maintainer: Hannu Varjoranta 5 | Build-Depends: 6 | python3 (>= 3.5), 7 | python3-dev, 8 | libpq-dev, 9 | debhelper (>= 9), 10 | dh-virtualenv (>= 1.0), 11 | dh-systemd (>= 1.5) 12 | Standards-Version: 3.9.5 13 | X-Python-Version: >= 3.5 14 | 15 | Package: postgresql-metrics 16 | Architecture: any 17 | Pre-Depends: dpkg (>= 1.16.1), python3 (>= 3.5), ${misc:Pre-Depends} 18 | Description: Simple service to provide metrics for your PostgreSQL database 19 | -------------------------------------------------------------------------------- /debian/postgresql-metrics.install: -------------------------------------------------------------------------------- 1 | etc/postgresql-metrics/default/postgresql-metrics.yml 2 | etc/postgresql-metrics/postgresql-metrics.yml 3 | -------------------------------------------------------------------------------- /debian/postgresql-metrics.links: -------------------------------------------------------------------------------- 1 | opt/venvs/postgresql-metrics/bin/postgresql-metrics usr/bin/postgresql-metrics 2 | -------------------------------------------------------------------------------- /debian/postgresql-metrics.service: 
-------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=postgresql-metrics process 3 | StartLimitInterval=100 4 | StartLimitBurst=10 5 | 6 | [Service] 7 | Type=simple 8 | ExecStart=/usr/bin/postgresql-metrics long-running-ffwd 9 | Restart=on-failure 10 | RestartSec=5 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | 15 | -------------------------------------------------------------------------------- /debian/postgresql-metrics.triggers: -------------------------------------------------------------------------------- 1 | interest-noawait /usr/bin/python3 2 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --with python-virtualenv --with systemd --python python3 --preinstall=no-manylinux1 5 | -------------------------------------------------------------------------------- /etc/postgresql-metrics/default/postgresql-metrics.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Postgres database cluster to connect to for gathering metrics 3 | postgres: 4 | host: "127.0.0.1" 5 | port: 5432 6 | user: "postgres" 7 | password: "postgres" 8 | 9 | # List all the databases in your cluster you want to get stats from 10 | databases: [] 11 | 12 | # If data_dir is empty, the PGDATA env variable is checked as well 13 | data_dir: 14 | 15 | log: 16 | # Notice that simple CLI calls will always log only to stderr, independent 17 | # of these configuration values. 18 | log_to_stderr: false 19 | log_to_syslog: false 20 | log_to_file: true 21 | # Log level, one of: critical, error, warning, notice, info, debug 22 | log_level: info 23 | rotate_file_log: true 24 | # Default file rotate size is 10 MB. 25 | file_rotate_max_size: 10485760 26 | filename: /var/log/postgresql-metrics/postgresql-metrics.log 27 | # Syslog facility to use, if syslog logging is enabled. 28 | syslog_facility: user 29 | 30 | # FFWD is used for pushing metrics out from the long running process 31 | ffwd: 32 | host: "127.0.0.1" 33 | port: 19000 34 | 35 | # You need to have a correspondingly named metrics function in metrics_gatherer.py 36 | # to include it in any of the DB function sections below. 37 | # 38 | # Each entry must be a tuple with the function name, and a time interval in seconds 39 | # to call that metrics function. 40 | # 41 | # db_functions: Functions called once per each database in the cluster. 42 | db_functions: 43 | - ["get_stats_disk_usage_for_database", 180] 44 | - ["get_stats_tx_rate_for_database", 60] 45 | - ["get_stats_seconds_since_last_vacuum_per_table", 60] 46 | - ["get_stats_oldest_transaction_timestamp", 180] 47 | - ["get_stats_index_hit_rates", 60] 48 | # table bloat is a heavy query, which might take many minutes to execute for huge tables 49 | # 43200 seconds = 12*60*60 seconds = 12 hours 50 | - ["get_stats_table_bloat", 43200] 51 | # replication status relies on `pg_stat_wal_receiver`, which is only available on postgres 9.6+ 52 | # - ["get_stats_incoming_replication_status", 30] 53 | 54 | # global_db_functions: Functions called once per the whole database cluster.
55 | global_db_functions: 56 | - ["get_stats_client_connections", 60] 57 | - ["get_stats_lock_statistics", 60] 58 | - ["get_stats_heap_hit_statistics", 60] 59 | - ["get_stats_replication_delays", 60] 60 | - ["get_stats_wal_file_amount", 180] 61 | - ["get_multixact_members_per_mxid", 60] 62 | - ["get_multixact_members_remaining_ratio", 60] 63 | - ["get_multixact_remaining_ratio", 60] 64 | - ["get_xid_remaining_ratio", 60] 65 | -------------------------------------------------------------------------------- /etc/postgresql-metrics/postgresql-metrics.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Notice that this configuration will overwrite the configuration 3 | # in the default folder, keeping the unchanged values from the default configuration. 4 | postgres: 5 | host: "127.0.0.1" 6 | user: "postgresql_metrics_user" 7 | password: "postgresql_metrics_password" 8 | databases: 9 | - postgres 10 | 11 | # Notice that if you overwrite a list here, like db_functions, 12 | # you will overwrite only the values that have the same function name. 13 | # There is no need to copy everything from the default configuration. 14 | -------------------------------------------------------------------------------- /postgresql_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /postgresql_metrics/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This module defines some basic common application functionality, like logging.
16 | """ 17 | 18 | import os 19 | 20 | import logbook 21 | import yaml 22 | 23 | 24 | def get_logger(logger_name="postgresql-metrics"): 25 | return logbook.Logger(logger_name) 26 | 27 | 28 | def figure_out_log_level(given_level): 29 | if isinstance(given_level, str): 30 | return logbook.lookup_level(given_level.strip().upper()) 31 | else: 32 | return given_level 33 | 34 | 35 | def init_logging_stderr(log_level='notset', bubble=False): 36 | handler = logbook.StderrHandler(level=figure_out_log_level(log_level), bubble=bubble) 37 | handler.push_application() 38 | get_logger().debug("stderr logging initialized") 39 | 40 | 41 | def init_logging_file(filename, log_level='notset', rotate_log=True, rotate_max_size=10485760, 42 | bubble=True): 43 | log_dir = os.path.dirname(filename) 44 | if not os.path.exists(log_dir): 45 | os.makedirs(log_dir) 46 | if rotate_log is True: 47 | handler = logbook.RotatingFileHandler(filename, level=figure_out_log_level(log_level), 48 | max_size=int(rotate_max_size), bubble=bubble) 49 | else: 50 | handler = logbook.FileHandler(filename, level=figure_out_log_level(log_level), 51 | bubble=bubble) 52 | handler.push_application() 53 | get_logger().debug("file based logging initialized in directory: " + log_dir) 54 | 55 | 56 | def init_logging_syslog(log_level='notset', facility='local0', bubble=True): 57 | handler = logbook.SyslogHandler('postgresql-metrics', facility=facility, 58 | level=figure_out_log_level(log_level), bubble=bubble) 59 | handler.push_application() 60 | get_logger().debug("syslog logging initialized") 61 | 62 | 63 | def merge_configs(to_be_merged, default): 64 | """Merges two configuration dictionaries by overwriting values with 65 | same keys, with the priority on values given on the 'left' side, so 66 | the to_be_merged dict. 67 | 68 | Notice that with lists in the configuration, it skips from the default 69 | (right side) the tuples in that which already exist in the left side 70 | to_be_merged list. This is used to be able to override time intervals for 71 | default values in the configuration. 72 | 73 | Example: 74 | In [1]: x = [["get_stats_disk_usage_for_database", 180], 75 | ["get_stats_tx_rate_for_database", 500]] 76 | In [2]: y = [["get_stats_seconds_since_last_vacuum_per_table", 60], 77 | ["get_stats_tx_rate_for_database", 60]] 78 | In [3]: merge_configs(x, y) 79 | Out[3]: 80 | [['get_stats_disk_usage_for_database', 180], 81 | ['get_stats_tx_rate_for_database', 500], 82 | ['get_stats_seconds_since_last_vacuum_per_table', 60]] 83 | """ 84 | if isinstance(to_be_merged, dict) and isinstance(default, dict): 85 | for k, v in default.items(): 86 | if k not in to_be_merged: 87 | to_be_merged[k] = v 88 | else: 89 | to_be_merged[k] = merge_configs(to_be_merged[k], v) 90 | elif isinstance(to_be_merged, list) and isinstance(default, list): 91 | same_keys = set() 92 | for x in to_be_merged: 93 | for y in default: 94 | if isinstance(x, (list, set, tuple)) and isinstance(y, (list, set, tuple)) and len( 95 | x) > 0 and len(y) > 0 and x[0] == y[0]: 96 | same_keys.add(x[0]) 97 | for y in default: 98 | if not isinstance(y, (list, set, tuple)) or y[0] not in same_keys: 99 | to_be_merged.append(y) 100 | return to_be_merged 101 | 102 | 103 | def find_and_parse_config(config_path): 104 | """Finds the service configuration file and parses it. 105 | Checks also a directory called default, to check for default configuration values, 106 | that will be overwritten by the actual configuration found on given path. 
107 | """ 108 | config_filename = os.path.basename(config_path) 109 | config_root = os.path.dirname(config_path) 110 | default_root = os.path.join(config_root, 'default') 111 | config_dict = {} 112 | for config_dir in (default_root, config_root): 113 | current_path = os.path.join(config_dir, config_filename) 114 | if os.path.isfile(current_path): 115 | with open(current_path, 'r') as f: 116 | read_config_dict = yaml.safe_load(f) 117 | config_dict = merge_configs(read_config_dict, config_dict) 118 | return config_dict 119 | -------------------------------------------------------------------------------- /postgresql_metrics/default_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This module defines the default output format for the gathered metrics. 16 | 17 | This metrics format follows the Metrics 2.0 conventions: 18 | http://metrics20.org/ 19 | """ 20 | 21 | DEFAULT_METRIC_NAMESPACE = 'postgresql' 22 | DEFAULT_METRIC_TYPE = 'metric' 23 | 24 | 25 | def create_default_metric(value, attributes=None): 26 | metric = { 27 | 'type': DEFAULT_METRIC_TYPE, 28 | 'key': DEFAULT_METRIC_NAMESPACE, 29 | 'value': value, 30 | } 31 | if attributes: 32 | metric['attributes'] = dict(attributes) 33 | return metric 34 | 35 | 36 | def metric_client_connections(value): 37 | return create_default_metric(value, 38 | {'what': 'client-connections', 39 | 'unit': 'connection'}) 40 | 41 | 42 | def metric_database_size(database_name, value): 43 | return create_default_metric(value, 44 | {'what': 'database-size', 45 | 'database': database_name, 46 | 'unit': 'B'}) 47 | 48 | 49 | def metric_transaction_rate(database_name, value): 50 | return create_default_metric(value, 51 | {'what': 'transaction-rate', 52 | 'type': 'transactions', 53 | 'database': database_name, 54 | 'unit': 'transaction'}) 55 | 56 | 57 | def metric_rollbacks_rate(database_name, value): 58 | return create_default_metric(value, 59 | {'what': 'transaction-rollbacks', 60 | 'type': 'transactions', 61 | 'database': database_name, 62 | 'unit': 'transaction'}) 63 | 64 | 65 | def metric_seconds_since_last_vacuum(database_name, table_name, value): 66 | return create_default_metric(value, 67 | {'what': 'last-vacuum', 68 | 'database': database_name, 69 | 'table': table_name, 70 | 'unit': 's'}) 71 | 72 | 73 | def metric_blocks_read_from_disk(database_name, value): 74 | return create_default_metric(float(value), 75 | {'what': 'blocks-read-from-disk', 76 | 'type': 'heap-reads', 77 | 'database': database_name, 78 | 'unit': 'blocks'}) 79 | 80 | 81 | def metric_blocks_read_from_buffer(database_name, value): 82 | return create_default_metric(float(value), 83 | {'what': 'blocks-read-from-buffer', 84 | 'type': 'heap-reads', 85 | 'database': database_name, 86 | 'unit': 'blocks'}) 87 | 88 | 89 | def metric_blocks_heap_hit_ratio(database_name, value): 90 | return create_default_metric(float(value), 91 | 
{'what': 'blocks-heap-hit-ratio', 92 | 'database': database_name, 93 | 'unit': 'buffer_hit%'}) 94 | 95 | 96 | def metric_locks_granted(locktype, value): 97 | return create_default_metric(value, 98 | {'what': 'locks_granted', 99 | 'type': 'locks', 100 | 'locktype': locktype, 101 | 'unit': 'lock'}) 102 | 103 | 104 | def metric_locks_waiting(locktype, value): 105 | return create_default_metric(value, 106 | {'what': 'locks_waiting', 107 | 'type': 'locks', 108 | 'locktype': locktype, 109 | 'unit': 'lock'}) 110 | 111 | 112 | def metric_sec_since_oldest_xact_start(database_name, value): 113 | return create_default_metric(value, 114 | {'what': 'sec-since-oldest-xact-start', 115 | 'database': database_name, 116 | 'unit': 's'}) 117 | 118 | 119 | def metric_xid_remaining_ratio(value): 120 | return create_default_metric(value, 121 | {'what': 'xid-remaining', 122 | 'unit': '%'}) 123 | 124 | 125 | def metric_multixact_remaining_ratio(value): 126 | return create_default_metric(value, 127 | {'what': 'mxid-remaining', 128 | 'unit': '%'}) 129 | 130 | 131 | def metric_multixact_members_per_mxid(value): 132 | return create_default_metric(value, 133 | {'what': 'multixact-members-per-mxid', 134 | 'unit': 'members/id'}) 135 | 136 | 137 | def metric_multixact_members_remaining_ratio(value): 138 | return create_default_metric(value, 139 | {'what': 'multixact-members-remaining', 140 | 'unit': '%'}) 141 | 142 | def metric_wal_file_amount(value): 143 | return create_default_metric(value, 144 | {'what': 'wal-file-amount', 145 | 'unit': 'file'}) 146 | 147 | 148 | def metric_table_bloat(database_name, table_name, value): 149 | return create_default_metric(float(value), 150 | {'what': 'table-bloat', 151 | 'database': database_name, 152 | 'table': table_name, 153 | 'unit': 'bloat%'}) 154 | 155 | 156 | def metric_index_hit_ratio(database_name, table_name, value): 157 | return create_default_metric(float(value), 158 | {'what': 'index-hit', 159 | 'database': database_name, 160 | 'table': table_name, 161 | 'unit': 'index_hit%'}) 162 | 163 | 164 | def metric_replication_delay_bytes(client_addr, value): 165 | return create_default_metric(value, 166 | {'what': 'replication-delay-bytes', 167 | 'slave': client_addr, 168 | 'unit': 'B'}) 169 | 170 | 171 | def metric_incoming_replication_running(replication_host, value): 172 | return create_default_metric(value, 173 | {'what': 'incoming-replication-running', 174 | 'master': replication_host, 175 | 'unit': 'msg'}) 176 | -------------------------------------------------------------------------------- /postgresql_metrics/localhost_postgres_stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This module contains code for statistics extraction that is based 16 | on access to local Postgres data directory. 
17 | """ 18 | 19 | import os 20 | import re 21 | 22 | from postgresql_metrics.common import get_logger 23 | 24 | LOG = get_logger() 25 | 26 | 27 | def get_multixact_member_files(data_dir): 28 | try: 29 | members_dir = os.path.join(data_dir, "pg_multixact", "members") 30 | if os.path.isdir(members_dir): 31 | return len([f for f in os.listdir(members_dir) if os.path.isfile(os.path.join(members_dir, f))]) 32 | else: 33 | LOG.exception(f"Missing pg_multixact/members directory in data_dir: {data_dir}") 34 | except OSError: 35 | LOG.exception('Failed accessing multixact member files in: {data_dir}. Is data dir readable by user?') 36 | return 0 37 | 38 | 39 | def get_amount_of_wal_files(data_dir): 40 | amount_of_wal_files = 0 41 | try: 42 | if data_dir and os.path.isdir(data_dir): 43 | wal_dir = os.path.join(data_dir, 'pg_wal') 44 | if not os.path.isdir(wal_dir): 45 | wal_dir = os.path.join(data_dir, 'pg_xlog') 46 | 47 | # each WAL file is named as 24-character hexadecimal number 48 | for possible_wal_file_name in os.listdir(wal_dir): 49 | if re.match('^[0-9A-F]{24}$', possible_wal_file_name): 50 | amount_of_wal_files += 1 51 | except OSError: 52 | LOG.exception('Failed accessing WAL files. Is data dir readable by user?') 53 | return amount_of_wal_files 54 | -------------------------------------------------------------------------------- /postgresql_metrics/metrics_gatherer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This module provides all the metrics in correct format for output. 16 | 17 | Steps for adding statistic values as metrics: 18 | 1) Write function for extracting the statistics value: 19 | * extract from DB, then write it in postgres_queries.py 20 | * extract from local Postgres data directory, 21 | then write it in localhost_postgres_stats.py 22 | 2) Write a function into default_metrics.py for transferring a statistic into a metric. 23 | 3) Write a function in this module to call both of the above defined functions, 24 | returning the metrics in correct form (always in a list, see below). 25 | 4) Add the name of the function in this module into the configuration, 26 | and define the interval the metric should be called. 
27 | """ 28 | 29 | from postgresql_metrics.default_metrics import ( 30 | metric_client_connections, 31 | metric_database_size, 32 | metric_transaction_rate, 33 | metric_rollbacks_rate, 34 | metric_seconds_since_last_vacuum, 35 | metric_blocks_read_from_disk, 36 | metric_blocks_read_from_buffer, 37 | metric_blocks_heap_hit_ratio, 38 | metric_locks_granted, 39 | metric_locks_waiting, 40 | metric_sec_since_oldest_xact_start, 41 | metric_table_bloat, 42 | metric_index_hit_ratio, 43 | metric_replication_delay_bytes, 44 | metric_wal_file_amount, 45 | metric_incoming_replication_running, 46 | metric_multixact_members_per_mxid, 47 | metric_multixact_remaining_ratio, 48 | metric_xid_remaining_ratio, 49 | metric_multixact_members_remaining_ratio, 50 | ) 51 | 52 | from postgresql_metrics.localhost_postgres_stats import get_amount_of_wal_files, get_multixact_member_files 53 | 54 | from postgresql_metrics.postgres_queries import ( 55 | get_client_connections_amount, 56 | get_disk_usage_for_database, 57 | get_transaction_rate_for_database, 58 | get_seconds_since_last_vacuum_per_table, 59 | get_heap_hit_statistics, 60 | get_lock_statistics, 61 | get_oldest_transaction_timestamp, 62 | get_table_bloat, 63 | get_index_hit_rates, 64 | get_replication_delays, 65 | get_tables_with_oids_for_current_db, 66 | get_wal_receiver_status, 67 | get_max_mxid_age, 68 | get_max_xid_age, 69 | ) 70 | 71 | MEMBERS_PER_MEMBER_FILE = 52352 72 | MAX_MULTIXACT_MEMBERS = 2**32 73 | WRAPAROUND_LIMIT = (2**32/2) - 1 74 | 75 | # Notice that all functions here are expected to return a list of metrics. 76 | # Notice also that the names of these functions should match the configuration. 77 | 78 | 79 | def get_stats_client_connections(_data_dir, db_connection): 80 | client_amount = get_client_connections_amount(db_connection) 81 | return [metric_client_connections(client_amount)] 82 | 83 | 84 | def get_stats_disk_usage_for_database(_data_dir, db_connection): 85 | db_size = get_disk_usage_for_database(db_connection) 86 | return [metric_database_size(db_size[0], db_size[1])] 87 | 88 | 89 | def get_stats_tx_rate_for_database(_data_dir, db_connection): 90 | db_name, tx_rate, tx_rollbacks = get_transaction_rate_for_database(db_connection) 91 | if tx_rate is not None: 92 | return [metric_transaction_rate(db_name, tx_rate), 93 | metric_rollbacks_rate(db_name, tx_rollbacks)] 94 | else: 95 | return [] 96 | 97 | 98 | def get_stats_seconds_since_last_vacuum_per_table(_data_dir, db_connection): 99 | last_vacuums_data = get_seconds_since_last_vacuum_per_table(db_connection) 100 | metrics = [] 101 | for db_name, table_name, seconds_since in last_vacuums_data: 102 | metrics.append(metric_seconds_since_last_vacuum(db_name, table_name, seconds_since)) 103 | return metrics 104 | 105 | 106 | def get_stats_heap_hit_statistics(_data_dir, db_connection): 107 | db_name, heap_read, heap_hit, heap_hit_ratio = get_heap_hit_statistics(db_connection) 108 | metrics = [] 109 | if heap_hit_ratio is not None: 110 | metrics.append(metric_blocks_read_from_disk(db_name, heap_read)) 111 | metrics.append(metric_blocks_read_from_buffer(db_name, heap_hit)) 112 | metrics.append(metric_blocks_heap_hit_ratio(db_name, heap_hit_ratio)) 113 | return metrics 114 | 115 | 116 | def get_stats_lock_statistics(_data_dir, db_connection): 117 | locks_by_type, [total_locks_waiting, total_locks_granted] = get_lock_statistics(db_connection) 118 | metrics = [] 119 | for lock_type, [locks_waiting, locks_granted] in locks_by_type.items(): 120 | metrics.append(metric_locks_granted(lock_type, 
locks_granted)) 121 | metrics.append(metric_locks_waiting(lock_type, locks_waiting)) 122 | metrics.append(metric_locks_granted("total", total_locks_granted)) 123 | metrics.append(metric_locks_waiting("total", total_locks_waiting)) 124 | return metrics 125 | 126 | 127 | def get_stats_oldest_transaction_timestamp(_data_dir, db_connection): 128 | db_name, sec_since_oldest_xact_start = get_oldest_transaction_timestamp(db_connection) 129 | metrics = [] 130 | if sec_since_oldest_xact_start is not None: 131 | metrics.append(metric_sec_since_oldest_xact_start(db_name, sec_since_oldest_xact_start)) 132 | return metrics 133 | 134 | 135 | def get_stats_table_bloat(_data_dir, db_connection): 136 | tables_with_oids = get_tables_with_oids_for_current_db(db_connection) 137 | metrics = [] 138 | for table_oid, table_name in tables_with_oids: 139 | db_name, table_bloat_percentage = get_table_bloat(db_connection, table_oid) 140 | if db_name: 141 | metrics.append(metric_table_bloat(db_name, table_name, table_bloat_percentage)) 142 | return metrics 143 | 144 | 145 | def get_stats_index_hit_rates(_data_dir, db_connection): 146 | index_hit_rates = get_index_hit_rates(db_connection) 147 | metrics = [] 148 | for db_name, table_name, index_hit_ratio in index_hit_rates: 149 | if index_hit_ratio is not None: 150 | metrics.append(metric_index_hit_ratio(db_name, table_name, index_hit_ratio)) 151 | return metrics 152 | 153 | 154 | def get_stats_replication_delays(_data_dir, db_connection): 155 | replication_delays = get_replication_delays(db_connection) 156 | metrics = [] 157 | for client_addr, delay_in_bytes in replication_delays: 158 | metrics.append(metric_replication_delay_bytes(client_addr, delay_in_bytes)) 159 | return metrics 160 | 161 | 162 | def _get_multixact_members(data_dir): 163 | return get_multixact_member_files(data_dir) * MEMBERS_PER_MEMBER_FILE 164 | 165 | 166 | def get_multixact_members_per_mxid(data_dir, db_connection): 167 | members = _get_multixact_members(data_dir) 168 | mxid_age = get_max_mxid_age(db_connection) 169 | if not mxid_age: 170 | return [] 171 | members_per_id = round(members / mxid_age, 2) 172 | return [metric_multixact_members_per_mxid(members_per_id)] 173 | 174 | 175 | def get_multixact_members_remaining_ratio(data_dir, _db_connection): 176 | members = _get_multixact_members(data_dir) 177 | ratio = round(members / MAX_MULTIXACT_MEMBERS, 2) 178 | percentage_remaining = (1.0 - ratio) * 100 179 | return [metric_multixact_members_remaining_ratio(percentage_remaining)] 180 | 181 | 182 | def get_multixact_remaining_ratio(_data_dir, db_connection): 183 | mxid_age = get_max_mxid_age(db_connection) 184 | if not mxid_age: 185 | return [] 186 | ratio = round(mxid_age / WRAPAROUND_LIMIT, 2) 187 | percentage_remaining = (1.0 - ratio) * 100 188 | return [metric_multixact_remaining_ratio(percentage_remaining)] 189 | 190 | 191 | def get_xid_remaining_ratio(_data_dir, db_connection): 192 | xid_age = get_max_xid_age(db_connection) 193 | if not xid_age: 194 | return [] 195 | ratio = round(xid_age / WRAPAROUND_LIMIT, 2) 196 | percentage_remaining = (1.0 - ratio) * 100 197 | return [metric_xid_remaining_ratio(percentage_remaining)] 198 | 199 | 200 | def get_stats_wal_file_amount(data_dir, _db_connection): 201 | return [metric_wal_file_amount(get_amount_of_wal_files(data_dir))] 202 | 203 | 204 | def get_stats_incoming_replication_status(_data_dir, db_connection): 205 | return [metric_incoming_replication_running(host, is_streaming) 206 | for host, is_streaming in get_wal_receiver_status(db_connection)] 
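
# An illustrative sketch only (hypothetical, not part of this module or the
# default configuration): a new gatherer added per the steps in the module
# docstring would follow the same (data_dir, db_connection) -> list of metrics
# shape as the functions above, assuming a query helper `get_temp_file_count`
# were added to postgres_queries.py and a formatter `metric_temp_file_count`
# to default_metrics.py:
#
# def get_stats_temp_file_count(_data_dir, db_connection):
#     db_name, temp_file_count = get_temp_file_count(db_connection)
#     return [metric_temp_file_count(db_name, temp_file_count)]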
207 | -------------------------------------------------------------------------------- /postgresql_metrics/metrics_logic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | This module contains the CLI and the logic for gathering and 17 | forwarding Postgres metrics into a metrics gatherer using Spotify FFWD. 18 | 19 | https://github.com/spotify/ffwd 20 | 21 | Notice that the README.md in the repository root contains short descriptions 22 | of all the Python modules within this project. 23 | """ 24 | import argparse 25 | import json 26 | import os 27 | import socket 28 | import time 29 | 30 | from postgresql_metrics import metrics_gatherer 31 | from postgresql_metrics.postgres_queries import ( 32 | get_db_name_from_connection, 33 | get_db_connection, 34 | get_major_version 35 | ) 36 | from postgresql_metrics.prepare_db import prepare_databases_for_metrics 37 | from postgresql_metrics.common import ( 38 | init_logging_file, 39 | init_logging_stderr, 40 | init_logging_syslog, 41 | get_logger, 42 | find_and_parse_config 43 | ) 44 | 45 | LOG = get_logger() 46 | 47 | DEFAULT_CONFIG_PATH = "/etc/postgresql-metrics/postgresql-metrics.yml" 48 | 49 | 50 | # LAST_RUN_TIMES_FOR_STATS is a dict of dicts, i.e. key pointing to a key pointing to a value: 51 | # database name -> stats function pointer -> last run timestamp 52 | # This means that we have separate last run timestamps per database and per stats function. 53 | LAST_RUN_TIMES_FOR_STATS = {} 54 | 55 | DEFAULT_FFWD_PORT = 19000 56 | DEFAULT_FFWD_HOST = '127.0.0.1' 57 | 58 | 59 | def push_to_ffwd(metric_dicts, ffwd_addr, data_formatter=json.dumps): 60 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 61 | try: 62 | for metric in metric_dicts: 63 | data = data_formatter(metric) 64 | LOG.debug('send UDP packet to {} with data:\n{}', ffwd_addr, data) 65 | s.sendto(bytes(data, 'UTF-8'), ffwd_addr) 66 | finally: 67 | s.close() 68 | 69 | 70 | def _is_time_to_call_stats_func_and_update_ts(database_name, metrics_func, run_interval_sec): 71 | """Check if it is time to schedule a new metrics gathering call, 72 | and assume the call will be made immediately if yes. 73 | This means that the call timestamp for the given database_name and metrics_func 74 | is updated within this function. 75 | """ 76 | last_run_timestamp = LAST_RUN_TIMES_FOR_STATS.get(database_name, {}).get(metrics_func, 0) 77 | if time.time() - last_run_timestamp > run_interval_sec: 78 | if database_name not in LAST_RUN_TIMES_FOR_STATS: 79 | LAST_RUN_TIMES_FOR_STATS[database_name] = {} 80 | LAST_RUN_TIMES_FOR_STATS[database_name][metrics_func] = time.time() 81 | return True 82 | return False 83 | 84 | 85 | def _call_all_db_functions(db_stats_functions, db_parameters, schedule=False, db_name=None): 86 | """Iterates through all given statistics functions, calling them with the given parameters.
87 |     The db_parameters tuple contains the Postgres data directory path and a database
88 |     connection; each statistics function uses whichever of the two it needs.
89 |     """
90 |     metrics = []
91 |     for (db_metrics_func, interval_s) in db_stats_functions:
92 |         if schedule:
93 |             if not db_name:
94 |                 # DB name is given only when we want database-specific scheduling.
95 |                 db_name = "__cluster_global__"
96 |             is_call_required = \
97 |                 _is_time_to_call_stats_func_and_update_ts(db_name, db_metrics_func, interval_s)
98 |         else:
99 |             is_call_required = True
100 |         if is_call_required:
101 |             try:
102 |                 LOG.debug('calling stats function {}', db_metrics_func.__name__)
103 |                 metrics.extend(db_metrics_func(*db_parameters))
104 |             except Exception:
105 |                 LOG.exception('failed calling stats function: ' + db_metrics_func.__name__)
106 |     return metrics
107 |
108 |
109 | def get_stats_functions_from_conf(func_key_name, conf):
110 |     """Finds the statistics functions named in the configuration, and ensures
111 |     that the callables are found in metrics_gatherer.py."""
112 |     stats_functions = []
113 |     if func_key_name in conf and conf[func_key_name] is not None:
114 |         for func_name, call_interval in conf[func_key_name]:
115 |             stats_func = getattr(metrics_gatherer, func_name, None)
116 |             if not stats_func or not callable(stats_func):
117 |                 raise Exception("statistics function '" + func_name + "' configured under key '"
118 |                                 + func_key_name + "' not found in metrics_gatherer")
119 |             stats_functions.append((stats_func, int(call_interval)))
120 |     return stats_functions
121 |
122 |
123 | def get_all_stats_functions_from_conf(conf):
124 |     db_functions = get_stats_functions_from_conf('db_functions', conf)
125 |     global_db_functions = get_stats_functions_from_conf('global_db_functions', conf)
126 |     # `data_dir_functions` is deprecated, but it is still read to preserve backwards compatibility
127 |     data_dir_functions = get_stats_functions_from_conf('data_dir_functions', conf)
128 |     if data_dir_functions:
129 |         LOG.warn("data_dir_functions field in config is deprecated -- consider moving functions to global_db_functions")
130 |     all_global_db_functions = data_dir_functions + global_db_functions
131 |     return db_functions, all_global_db_functions
132 |
133 |
134 | def get_all_metrics_now(db_connections, conf):
135 |     """Get all the metrics immediately without any scheduling.
136 |     First gets the global stats with the first available database connection,
137 |     and then gets the rest per database.
138 |     """
139 |     db_functions, global_db_functions = get_all_stats_functions_from_conf(conf)
140 |     data_dir = figure_out_postgres_data_dir(db_connections[0], conf)
141 |
142 |     all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]))
143 |     for db_connection in db_connections:
144 |         all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection)))
145 |     return all_metrics
146 |
147 |
148 | def get_all_metrics_scheduled(db_connections, conf):
149 |     """Get all the metrics in a scheduled manner, not calling all the functions every time.
150 |     First gets the global stats with the first available database connection,
151 |     and then gets the rest per database.
152 | """ 153 | db_functions, global_db_functions = get_all_stats_functions_from_conf(conf) 154 | data_dir = figure_out_postgres_data_dir(db_connections[0], conf) 155 | 156 | all_metrics = _call_all_db_functions(global_db_functions, (data_dir, db_connections[0]), schedule=True) 157 | for db_connection in db_connections: 158 | db_name = get_db_name_from_connection(db_connection) 159 | all_metrics.extend(_call_all_db_functions(db_functions, (data_dir, db_connection), 160 | schedule=True, db_name=db_name)) 161 | return all_metrics 162 | 163 | 164 | def run_long_running_ffwd(conf): 165 | db_connections = get_db_connections_with_conf(conf) 166 | ffwd_address = (DEFAULT_FFWD_HOST, DEFAULT_FFWD_PORT) 167 | if conf and conf.get('ffwd'): 168 | ffwd_address = (conf['ffwd'].get('host', DEFAULT_FFWD_HOST), 169 | int(conf['ffwd'].get('port', DEFAULT_FFWD_PORT))) 170 | try: 171 | LOG.info("starting a long running statistics polling loop with {} database(s)", 172 | len(db_connections)) 173 | while True: 174 | try: 175 | # Notice that the scheduling is separate from this few second sleep, 176 | # but as the granularity is in tens of seconds, few seconds interval is enough. 177 | time.sleep(5.0) 178 | db_connections = confirm_connections_work(conf, db_connections) 179 | metrics = get_all_metrics_scheduled(db_connections, conf) 180 | if metrics: 181 | LOG.info("sending {} metrics to ffwd...", len(metrics)) 182 | push_to_ffwd(metrics, ffwd_address) 183 | except (KeyboardInterrupt, SystemExit): 184 | LOG.warn('*** keyboard interrupt / system exit ***') 185 | raise 186 | except Exception: 187 | LOG.exception('metrics check failed') 188 | finally: 189 | for db_connection in db_connections: 190 | if not db_connection.closed: 191 | db_connection.close() 192 | 193 | 194 | def confirm_connections_work(conf, db_connections): 195 | """Call this to confirm that all connections are still alive before using them. 
196 | Will recreate any closed connections.""" 197 | confirmed_connections = [] 198 | for db_connection in db_connections: 199 | if db_connection.closed: 200 | db_name = get_db_name_from_connection(db_connection) 201 | LOG.warn("database connection is closed to db '{}', reconnecting", db_name) 202 | confirmed_connections.append(connect_to_single_db_with_conf(conf, db_name)) 203 | else: 204 | confirmed_connections.append(db_connection) 205 | return confirmed_connections 206 | 207 | 208 | def connect_to_single_db_with_conf(conf, database_name): 209 | LOG.info("open database connection to {}:{}, user '{}', database '{}'", 210 | conf['postgres']['host'], conf['postgres']['port'], 211 | conf['postgres']['user'], database_name) 212 | return get_db_connection(database_name, 213 | conf['postgres']['user'], 214 | conf['postgres']['password'], 215 | host=conf['postgres']['host'], 216 | port=int(conf['postgres']['port'])) 217 | 218 | 219 | def get_db_connections_with_conf(conf): 220 | connections = [] 221 | if 'databases' in conf['postgres']: 222 | if not conf['postgres']['databases']: 223 | raise Exception("no target databases defined in configuration") 224 | for database_name in conf['postgres']['databases']: 225 | connections.append(connect_to_single_db_with_conf(conf, database_name)) 226 | elif 'database' in conf['postgres']: 227 | # this is here just for backward compatibility, before the databases option handled above 228 | connections.append(connect_to_single_db_with_conf(conf, conf['postgres']['database'])) 229 | if not connections: 230 | raise Exception("could not connect to database with configuration:\n" + str(conf)) 231 | return connections 232 | 233 | 234 | def figure_out_postgres_data_dir(db_connection, conf): 235 | data_dir = conf['postgres']['data_dir'] 236 | if not data_dir: 237 | db_version = get_major_version(db_connection) 238 | data_dir = "/var/lib/postgresql/{0}/main".format(db_version) 239 | if not os.path.isdir(data_dir): 240 | LOG.debug("data directory '{}' doesn't exist", data_dir) 241 | data_dir = None 242 | else: 243 | LOG.debug('using postgres data directory: {}', data_dir) 244 | return data_dir 245 | 246 | 247 | DESCRIPTION = """Spotify PostgreSQL Metrics 248 | This tool fetches metrics from a Postgres database cluster, 249 | and returns the results in Metrics 2.0 compatible JSON format. 250 | 251 | You can run the 'long-running-ffwd' as a background process that keeps 252 | sending the gathered metrics into FFWD as configured, or you can call 253 | this CLI tool directly for simply printing out the metrics for other 254 | purposes. 255 | 256 | Run the prepare-db command to prepare your monitored databases in 257 | the Postgres cluster for the statistics gathering. You need to run 258 | the prepare-db command with database super-user credentials. 259 | """ 260 | 261 | USAGE = """Usage: postgresql-metrics 262 | 263 | can be: 264 | all Show all available metrics 265 | long-running-ffwd Run in infinite loop, sending metrics to FFWD 266 | prepare-db Create required users, extensions, and functions for metrics. 
267 | """ 268 | 269 | 270 | def main(): 271 | parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) 272 | parser.add_argument("command", help="the command to run") 273 | parser.add_argument("-c", "--config-path", default=DEFAULT_CONFIG_PATH, 274 | help="configuration path, checks also folder 'default' on given path [{}]" 275 | .format(DEFAULT_CONFIG_PATH)) 276 | 277 | args = parser.parse_args() 278 | 279 | conf = find_and_parse_config(args.config_path) 280 | if 'postgres' not in conf: 281 | raise Exception("failed parsing configuration from: " + args.config_path) 282 | log_level = conf.get('log', {}).get('log_level', 'debug') 283 | 284 | if args.command == 'all': 285 | init_logging_stderr(log_level) 286 | db_connections = get_db_connections_with_conf(conf) 287 | get_all_metrics_now(db_connections, conf) 288 | print("# sleep 5 s to get diffs on derivative metrics") 289 | time.sleep(5.0) 290 | for metric in get_all_metrics_now(db_connections, conf): 291 | print(metric) 292 | 293 | elif args.command == 'long-running-ffwd': 294 | if conf['log']['log_to_stderr'] is True: 295 | init_logging_stderr(log_level) 296 | if conf['log']['log_to_file'] is True: 297 | init_logging_file(conf['log']['filename'], log_level, 298 | conf['log']['rotate_file_log'], conf['log']['file_rotate_max_size']) 299 | if conf['log']['log_to_syslog'] is True: 300 | init_logging_syslog(log_level, facility=conf['log']['syslog_facility']) 301 | run_long_running_ffwd(conf) 302 | 303 | elif args.command == 'prepare-db': 304 | init_logging_stderr(log_level) 305 | prepare_databases_for_metrics(conf) 306 | 307 | 308 | if __name__ == '__main__': # if this script is called from command line 309 | main() 310 | -------------------------------------------------------------------------------- /postgresql_metrics/postgres_queries.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This module contains code for statistics extraction that is based 16 | on having a connection to a Postgres database, and running queries through it. 
17 | """ 18 | 19 | import psycopg2 20 | import re 21 | 22 | from postgresql_metrics.common import get_logger 23 | 24 | LOG = get_logger() 25 | 26 | 27 | # contains mappings of metric-name: (last_timestamp, last_value) 28 | # used to derive metric value diffs between the current and the previous value 29 | DERIVE_DICT = dict() 30 | 31 | # regex used to extra host from conninfo string 32 | CONNINFO_HOST_RE = re.compile(r'($|\s)host=(?P.*?)(^|\s)') 33 | 34 | 35 | def get_db_connection(database, username, password, host='127.0.0.1', port=5432, 36 | connect_timeout=10): 37 | connection = psycopg2.connect(user=username, 38 | password=password, 39 | host=host, 40 | port=int(port), 41 | database=database, 42 | connect_timeout=connect_timeout) 43 | connection.autocommit = True 44 | return connection 45 | 46 | 47 | def get_db_name_from_connection(connection): 48 | """example dsn: dbname=varjodb user=varjo password=xxxxxxxx host=127.0.0.1 49 | This works also for closed connection. 50 | """ 51 | for dsn_part in connection.dsn.split(): 52 | key, value = dsn_part.split('=') 53 | if key.strip() == 'dbname': 54 | return value.strip() 55 | return None 56 | 57 | 58 | def get_metric_diff(db_name, metric_name, current_time, current_value): 59 | derive_dict_key = db_name + "_" + metric_name 60 | diff = None 61 | if derive_dict_key in DERIVE_DICT: 62 | last_time, last_value = DERIVE_DICT[derive_dict_key] 63 | seconds_since_last_check = int((current_time - last_time).total_seconds()) 64 | if seconds_since_last_check == 0: 65 | diff = 0 66 | else: 67 | diff = float(current_value - last_value) / seconds_since_last_check 68 | DERIVE_DICT[derive_dict_key] = (current_time, current_value) 69 | return diff 70 | 71 | 72 | def query(cursor, sql, params=None): 73 | """accepts a database connection or cursor""" 74 | if type(cursor) == psycopg2._psycopg.connection: 75 | cursor = cursor.cursor() 76 | LOG.debug('QUERY "{}" {}', sql, params) 77 | try: 78 | if params: 79 | cursor.execute(sql, params) 80 | else: 81 | cursor.execute(sql) 82 | results = cursor.fetchall() 83 | except Exception: 84 | LOG.exception("failed calling the database") 85 | results = [] 86 | LOG.debug('QUERY RESULT: {}', results) 87 | return results 88 | 89 | 90 | def get_tables_with_oids_for_current_db(conn): 91 | tables = [] 92 | results = query(conn, 93 | "SELECT oid, relname FROM pg_class WHERE relkind = 'r' " 94 | "AND relname NOT LIKE 'pg_%' AND relname NOT LIKE 'sql_%'") 95 | for result in results: 96 | table_oid, table_name = result 97 | tables.append((table_oid, table_name)) 98 | return tables 99 | 100 | 101 | def get_client_connections_amount(conn): 102 | results = query(conn, 'SELECT count(*) FROM pg_stat_activity') 103 | if results: 104 | return results[0][0] 105 | return None 106 | 107 | 108 | def get_disk_usage_for_database(conn): 109 | sql = ("SELECT datname, pg_database_size(datname) FROM pg_database " 110 | "WHERE datname = current_database()") 111 | results = query(conn, sql) 112 | if results: 113 | return results[0] 114 | return None 115 | 116 | 117 | def get_major_version(conn): 118 | """Get the major version part of the PostgreSQL version, i.e. 
119 |     results = query(conn, r"SELECT substring(version() from $$(\d+\.\d+)\.\d+$$)")
120 |     if results:
121 |         return results[0][0]
122 |     return None
123 |
124 |
125 | def get_transaction_rate_for_database(conn):
126 |     sql = ("SELECT current_database(), datname, now(), xact_commit + xact_rollback, xact_rollback "
127 |            "FROM pg_stat_database WHERE datname = current_database()")
128 |     results = query(conn, sql)
129 |     if not results or None in results[0]:
130 |         LOG.error("Fetching transactions got an empty or 'None' result set")
131 |         return None, None, None
132 |     db_name, dat_name, time_now, transactions_now, rollbacks_now = results[0]
133 |     recent_transactions = get_metric_diff(db_name, 'transactions', time_now, transactions_now)
134 |     recent_rollbacks = get_metric_diff(db_name, 'rollbacks', time_now, rollbacks_now)
135 |     return dat_name, recent_transactions, recent_rollbacks
136 |
137 |
138 | def get_seconds_since_last_vacuum_per_table(conn):
139 |     """Returns a list of tuples: (db_name, table_name, seconds_since_last_vacuum),
140 |     where seconds_since_last_vacuum is 0 if no vacuum was ever done (stays flat at zero)"""
141 |     sql = ("SELECT current_database(), relname, now(), last_vacuum, last_autovacuum "
142 |            "FROM pg_stat_user_tables")
143 |     results = query(conn, sql)
144 |     table_last_vacuum_list = []
145 |     for db_name, table_name, time_now, last_vacuum, last_autovacuum in results:
146 |         latest_vacuum = None
147 |         if last_vacuum or last_autovacuum:
148 |             latest_vacuum = max([x for x in (last_vacuum, last_autovacuum) if x])
149 |         seconds_since_last_vacuum = int((time_now - (latest_vacuum or time_now)).total_seconds())
150 |         table_last_vacuum_list.append((db_name, table_name, seconds_since_last_vacuum))
151 |     return table_last_vacuum_list
152 |
153 |
154 | def get_heap_hit_statistics(conn):
155 |     sql = ("SELECT current_database(), now(), sum(heap_blks_read), sum(heap_blks_hit) "
156 |            "FROM pg_statio_user_tables")
157 |     results = query(conn, sql)
158 |     if not results or None in results[0]:
159 |         LOG.error("fetching heap_hit_statistics got empty results: {}", str(results))
160 |         return None, None, None, None
161 |     db_name, time_now, heap_read_now, heap_hit_now = results[0]
162 |     recent_heap_read = get_metric_diff(db_name, 'heap_read', time_now, heap_read_now)
163 |     recent_heap_hit = get_metric_diff(db_name, 'heap_hit', time_now, heap_hit_now)
164 |     recent_heap_hit_ratio = None
165 |     if recent_heap_read is not None:
166 |         if recent_heap_hit == 0:
167 |             recent_heap_hit_ratio = 0
168 |         else:
169 |             recent_heap_hit_ratio = recent_heap_hit / float(recent_heap_hit + recent_heap_read)
170 |     return db_name, recent_heap_read, recent_heap_hit, recent_heap_hit_ratio
171 |
172 |
173 | def get_lock_statistics(conn):
174 |     sql = "SELECT locktype, granted, count(*) FROM pg_locks GROUP BY locktype, granted"
175 |     results = query(conn, sql)
176 |     total = [0, 0]
177 |     lock_stats = {}
178 |     for lock_type, granted, count in results:
179 |         if lock_type not in lock_stats:
180 |             lock_stats[lock_type] = [0, 0]
181 |         lock_stats[lock_type][granted] = count  # 'granted' is a boolean used as a list index: waiting = 0, granted = 1
182 |         total[granted] += count
183 |     return [lock_stats, total]
184 |
185 |
186 | def get_oldest_transaction_timestamp(conn):
187 |     sql = ("SELECT datname, now(), xact_start FROM pg_stat_activity "
188 |            "WHERE xact_start IS NOT NULL AND datname=current_database() "
189 |            "ORDER BY xact_start ASC LIMIT 1")
190 |     results = query(conn, sql)
191 |     if results:
192 |         db_name, time_now, xact_start = results[0]
193 |         seconds_since_oldest_xact_start = int((time_now - (xact_start or time_now)).total_seconds())
194 |         return db_name, seconds_since_oldest_xact_start
195 |     return None, None
196 |
197 |
198 | def get_max_mxid_age(conn):
199 |     # `mxid_age` is only available on postgres 9.5 and newer (9.5 == server_version 90500)
200 |     if conn.server_version < 90500:
201 |         LOG.error("Unable to check mxid_age on versions of postgres below 9.5")
202 |         return None
203 |     sql = "SELECT max(mxid_age(relminmxid)) FROM pg_class WHERE relminmxid <> '0'"
204 |     results = query(conn, sql)
205 |     if not results or results[0][0] is None:
206 |         return None
207 |     mxid_age, = results[0]
208 |     return int(mxid_age)
209 |
210 |
211 | def get_max_xid_age(conn):
212 |     sql = "SELECT max(age(datfrozenxid)) FROM pg_database"
213 |     results = query(conn, sql)
214 |     if not results or results[0][0] is None:
215 |         return None
216 |     xid_age, = results[0]
217 |     return int(xid_age)
218 |
219 |
220 | def get_replication_delays(conn):
221 |     sql = ("SELECT client_addr, "
222 |            "pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS bytes_diff "
223 |            "FROM public.pg_stat_repl")
224 |     if is_in_recovery(conn):
225 |         # pg_current_xlog_location cannot be called on a replica;
226 |         # use pg_last_xlog_receive_location for monitoring cascading replication
227 |         sql = sql.replace("pg_current_xlog_location", "pg_last_xlog_receive_location")
228 |     if conn.server_version >= 100000:  # PostgreSQL 10 and higher renamed xlog/location to wal/lsn
229 |         sql = sql.replace('_xlog', '_wal')
230 |         sql = sql.replace('_location', '_lsn')
231 |     all_delays = []
232 |     results = query(conn, sql)
233 |     for result_row in results:
234 |         client_addr = result_row[0]
235 |         bytes_diff = int(result_row[1])
236 |         all_delays.append((client_addr, bytes_diff))
237 |     return all_delays
238 |
239 |
240 | def get_table_bloat(conn, table_oid):
241 |     """Based on the pgstattuple extension, so you need to run CREATE EXTENSION before using this.
242 |     Check the function get_tables_with_oids_for_current_db to see how to get table oids.
243 | """ 244 | results = query(conn, "SELECT current_database, dead_tuple_percent " 245 | "FROM pgstattuple_for_table_oid(%s)", [table_oid]) 246 | if results: 247 | db_name, dead_tuple_percent = results[0] 248 | return db_name, dead_tuple_percent / 100.0 249 | return None, None 250 | 251 | 252 | def get_index_hit_rates(conn): 253 | sql = ("SELECT current_database() as db_name, relname as table_name, " 254 | "idx_scan as index_hit, seq_scan as index_miss " 255 | "FROM pg_stat_user_tables") 256 | results = query(conn, sql) 257 | index_hit_rates = [] 258 | LOG.debug(results) 259 | for db_name, table_name, index_hit, index_miss in results: 260 | if index_hit is not None and index_miss is not None: 261 | if index_hit == 0: 262 | recent_ratio = 0 263 | else: 264 | recent_ratio = index_hit / float(index_miss + index_hit) 265 | index_hit_rates.append((db_name, table_name, recent_ratio)) 266 | else: 267 | index_hit_rates.append((db_name, table_name, None)) 268 | return index_hit_rates 269 | 270 | 271 | def get_wal_receiver_status(conn): 272 | sql = ("SELECT conninfo, CASE WHEN status = 'streaming' THEN 1 ELSE 0 END " 273 | "FROM public.stat_incoming_replication") 274 | results = query(conn, sql) 275 | host_replication_status = [] 276 | for conn_info, status in results: 277 | host = CONNINFO_HOST_RE.search(conn_info).groupdict().get('host', 'UNKNOWN') 278 | host_replication_status.append((host, status)) 279 | return host_replication_status 280 | 281 | 282 | def is_in_recovery(conn): 283 | return query(conn, "SELECT pg_is_in_recovery()")[0][0] 284 | -------------------------------------------------------------------------------- /postgresql_metrics/prepare_db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This module contains code for preparing each monitored database in the cluster 16 | for the functionality required in postgresql-metrics project. 17 | 18 | This includes creating an appropriate metrics user, enabling required extensions, 19 | and creating required functions and views. 
20 | """ 21 | import getpass 22 | 23 | import psycopg2 24 | from psycopg2 import sql 25 | 26 | from postgresql_metrics.postgres_queries import get_db_connection 27 | from postgresql_metrics.common import get_logger 28 | 29 | LOG = get_logger("postgresql-metrics-prepare-db") 30 | 31 | REPLICATION_STATS_VIEW = 'public.pg_stat_repl' 32 | PGSTATTUPLES_FUNC_NAME = 'pgstattuple_for_table_oid' 33 | PGSTATTUPLES_FUNC = PGSTATTUPLES_FUNC_NAME + '(BIGINT)' 34 | 35 | PGVERSION_WAL_RECEIVER = 90600 36 | INCOMING_REPLICATION_STATS_VIEW = "stat_incoming_replication" 37 | 38 | 39 | def query_user_for_superuser_credentials(): 40 | username = input("Provide a Postgres role name with superuser privileges " 41 | "in the configured cluster: ") 42 | password = getpass.getpass("Give the password: ") 43 | return username, password 44 | 45 | 46 | def connect_as_super_user(db_name, conf): 47 | db_connection = None 48 | try: 49 | db_connection = psycopg2.connect(database=db_name) 50 | db_connection.autocommit = True 51 | except psycopg2.OperationalError: 52 | LOG.info("could not connect as local superuser with current user, credentials required") 53 | 54 | if not db_connection: 55 | superuser, password = query_user_for_superuser_credentials() 56 | db_connection = get_db_connection(db_name, superuser, password, 57 | host=conf['postgres']['host'], 58 | port=int(conf['postgres']['port'])) 59 | 60 | if not db_connection or db_connection.closed: 61 | raise Exception("failed connecting the database: " + db_name) 62 | 63 | return db_connection 64 | 65 | 66 | def check_if_database_is_slave(db_connection): 67 | """Returns True if the queried database is a slave node, 68 | i.e. is in recovery mode streaming data from master. 69 | """ 70 | with db_connection.cursor() as c: 71 | c.execute("SELECT pg_is_in_recovery()") 72 | result = c.fetchone() 73 | return bool(result) and result[0] 74 | 75 | 76 | def check_if_role_exists(db_connection, role_name): 77 | with db_connection.cursor() as c: 78 | c.execute("SELECT rolname FROM pg_roles WHERE rolname=%s", [role_name]) 79 | result = c.fetchone() 80 | return bool(result) and result[0] == role_name 81 | 82 | 83 | def check_if_role_has_db_privilege(db_connection, role_name, db_name, privilege): 84 | with db_connection.cursor() as c: 85 | c.execute("SELECT * FROM has_database_privilege(%s, %s, %s)", 86 | [role_name, db_name, privilege]) 87 | result = c.fetchone() 88 | return bool(result) and result[0] 89 | 90 | 91 | def check_if_role_has_table_privilege(db_connection, role_name, table_name, privilege): 92 | with db_connection.cursor() as c: 93 | c.execute("SELECT * FROM has_table_privilege(%s, %s, %s)", 94 | [role_name, table_name, privilege]) 95 | result = c.fetchone() 96 | return bool(result) and result[0] 97 | 98 | 99 | def check_if_role_has_function_privilege(db_connection, role_name, function_name, privilege): 100 | with db_connection.cursor() as c: 101 | c.execute("SELECT * FROM has_function_privilege(%s, %s, %s)", 102 | [role_name, function_name, privilege]) 103 | result = c.fetchone() 104 | return bool(result) and result[0] 105 | 106 | 107 | def check_if_replication_stats_view_exists(db_connection): 108 | with db_connection.cursor() as c: 109 | c.execute("SELECT table_name FROM information_schema.tables " 110 | "WHERE table_name='pg_stat_repl'") 111 | result = c.fetchone() 112 | return bool(result) and result[0] == 'pg_stat_repl' 113 | 114 | 115 | def check_if_pgstattuples_extension_exists(db_connection): 116 | with db_connection.cursor() as c: 117 | c.execute("SELECT proname 
FROM pg_proc WHERE proname=%s", [PGSTATTUPLES_FUNC_NAME])
118 |         result = c.fetchone()
119 |         return bool(result) and result[0] == PGSTATTUPLES_FUNC_NAME
120 |
121 |
122 | def create_role_with_login(db_connection, metrics_user, metrics_user_password):
123 |     LOG.info("creating role '{}' with login privilege", metrics_user)
124 |     with db_connection.cursor() as c:
125 |         c.execute(sql.SQL("CREATE ROLE {} WITH PASSWORD %s LOGIN").format(
126 |             sql.Identifier(metrics_user)), [metrics_user_password])
127 |
128 |
129 | def create_replication_stats_view(db_connection):
130 |     LOG.info("creating view {}", REPLICATION_STATS_VIEW)
131 |     func_sql = """CREATE OR REPLACE FUNCTION public.pg_stat_repl()
132 |                   RETURNS SETOF pg_catalog.pg_stat_replication AS $$
133 |                   BEGIN
134 |                   RETURN QUERY(SELECT * FROM pg_catalog.pg_stat_replication);
135 |                   END$$ LANGUAGE plpgsql SECURITY DEFINER;"""
136 |     with db_connection.cursor() as c:
137 |         c.execute(func_sql)
138 |         c.execute("CREATE VIEW " + REPLICATION_STATS_VIEW
139 |                   + " AS SELECT * FROM public.pg_stat_repl()")
140 |
141 | def create_pgstattuples_extension(db_connection):
142 |     LOG.info("creating extension pgstattuple with access function {}", PGSTATTUPLES_FUNC)
143 |     func_sql = "CREATE OR REPLACE FUNCTION " + PGSTATTUPLES_FUNC + """
144 |         RETURNS TABLE (current_database NAME, table_len BIGINT, tuple_count BIGINT,
145 |                        tuple_len BIGINT, tuple_percent FLOAT, dead_tuple_count BIGINT,
146 |                        dead_tuple_len BIGINT, dead_tuple_percent FLOAT, free_space BIGINT,
147 |                        free_percent FLOAT) AS $$
148 |         BEGIN
149 |         RETURN QUERY(SELECT current_database(), * FROM pgstattuple($1));
150 |         END$$ LANGUAGE plpgsql SECURITY DEFINER;"""
151 |     with db_connection.cursor() as c:
152 |         c.execute("CREATE EXTENSION IF NOT EXISTS pgstattuple;")
153 |         c.execute(func_sql)
154 |
155 | def check_if_incoming_replication_status_view_exists(db_connection):
156 |     with db_connection.cursor() as c:
157 |         c.execute("SELECT table_name FROM information_schema.tables "
158 |                   "WHERE table_name=%s", (INCOMING_REPLICATION_STATS_VIEW,))
159 |         result = c.fetchone()
160 |         return bool(result) and result[0] == INCOMING_REPLICATION_STATS_VIEW
161 |
162 | def create_incoming_replication_status_view(db_connection):
163 |     LOG.info("creating view {}", INCOMING_REPLICATION_STATS_VIEW)
164 |     func_sql = """CREATE OR REPLACE FUNCTION public.stat_incoming_replication()
165 |                   RETURNS SETOF pg_catalog.pg_stat_wal_receiver AS $$
166 |                   BEGIN
167 |                   RETURN QUERY(SELECT * FROM pg_catalog.pg_stat_wal_receiver);
168 |                   END$$ LANGUAGE plpgsql SECURITY DEFINER;"""
169 |     view_sql = "CREATE OR REPLACE VIEW public.{0} AS SELECT * FROM {0}()".format(
170 |         INCOMING_REPLICATION_STATS_VIEW)
171 |     with db_connection.cursor() as c:
172 |         c.execute(func_sql)
173 |         c.execute(view_sql)
174 |
175 | def prepare_databases_for_metrics(conf):
176 |     """Tries first to connect to the localhost database as the default user,
177 |     which works if the local user is set up as a local Postgres superuser.
178 |     If this fails, queries for Postgres superuser credentials.
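
    A minimal sketch of the configuration this function expects (values are
    hypothetical; only the keys read by this module are shown):

        postgres:
          host: 127.0.0.1
          port: 5432
          user: postgresql_metrics_user
          password: secret
          databases:
            - exampledb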
179 | """ 180 | metrics_user = conf['postgres']['user'] 181 | metrics_user_password = conf['postgres']['password'] 182 | LOG.info("prepare databases for metrics user '{}'", metrics_user) 183 | 184 | db_names = [] 185 | if 'databases' in conf['postgres']: 186 | db_names = conf['postgres']['databases'] 187 | elif 'database' in conf['postgres']: 188 | db_names = [conf['postgres']['database']] 189 | 190 | for db_name in db_names: 191 | LOG.info("connecting to database '{}' as super user", db_name) 192 | db_connection = connect_as_super_user(db_name, conf) 193 | 194 | if check_if_database_is_slave(db_connection): 195 | LOG.info("database is a slave, run prepare-db on master") 196 | break 197 | 198 | if not check_if_role_exists(db_connection, metrics_user): 199 | create_role_with_login(db_connection, metrics_user, metrics_user_password) 200 | else: 201 | LOG.info("role already exists: {}", metrics_user) 202 | 203 | if not check_if_role_has_db_privilege(db_connection, metrics_user, db_name, 'connect'): 204 | LOG.info("grant connect privilege to user '{}' for database: {}", 205 | metrics_user, db_name) 206 | with db_connection.cursor() as c: 207 | c.execute("GRANT CONNECT ON database " + db_name + " TO " + metrics_user) 208 | else: 209 | LOG.info("role '{}' already has connect privilege to database: {}", 210 | metrics_user, db_name) 211 | 212 | if not check_if_replication_stats_view_exists(db_connection): 213 | create_replication_stats_view(db_connection) 214 | else: 215 | LOG.info("replication stats view already exists") 216 | 217 | if not check_if_role_has_table_privilege(db_connection, metrics_user, 218 | REPLICATION_STATS_VIEW, 'select'): 219 | LOG.info("grant select privilege to user '{}' for relation: {}", 220 | metrics_user, REPLICATION_STATS_VIEW) 221 | with db_connection.cursor() as c: 222 | c.execute("GRANT SELECT ON " + REPLICATION_STATS_VIEW + " TO " + metrics_user) 223 | else: 224 | LOG.info("role '{}' already has select privilege to relation: {}", 225 | metrics_user, REPLICATION_STATS_VIEW) 226 | 227 | if not check_if_pgstattuples_extension_exists(db_connection): 228 | create_pgstattuples_extension(db_connection) 229 | else: 230 | LOG.info("pgstattuples extension already exists") 231 | 232 | if not check_if_role_has_function_privilege(db_connection, metrics_user, 233 | PGSTATTUPLES_FUNC, 'execute'): 234 | LOG.info("grant execute privilege to user '{}' for function: {}", 235 | metrics_user, PGSTATTUPLES_FUNC) 236 | with db_connection.cursor() as c: 237 | c.execute("GRANT EXECUTE ON FUNCTION " + PGSTATTUPLES_FUNC + " TO " 238 | + metrics_user) 239 | else: 240 | LOG.info("role '{}' already has execute privilege to function: {}", 241 | metrics_user, PGSTATTUPLES_FUNC) 242 | 243 | if db_connection.server_version >= PGVERSION_WAL_RECEIVER: 244 | if not check_if_incoming_replication_status_view_exists(db_connection): 245 | create_incoming_replication_status_view(db_connection) 246 | else: 247 | LOG.info("incoming replication status view already exists") 248 | 249 | if not check_if_role_has_table_privilege(db_connection, metrics_user, 250 | INCOMING_REPLICATION_STATS_VIEW, 'select'): 251 | LOG.info("grant select privilege to user '{}' for relation: {}", 252 | metrics_user, INCOMING_REPLICATION_STATS_VIEW) 253 | with db_connection.cursor() as c: 254 | c.execute(sql.SQL("GRANT SELECT ON {} TO {}").format( 255 | sql.Identifier(INCOMING_REPLICATION_STATS_VIEW), 256 | sql.Identifier(metrics_user))) 257 | else: 258 | LOG.info("role '{}' already has select privilege to relation: {}", 259 | 
                 metrics_user, INCOMING_REPLICATION_STATS_VIEW)
260 |         else:
261 |             LOG.info("skipping setup for incoming replication view, requires Postgres version >= {}",
262 |                      PGVERSION_WAL_RECEIVER)
263 |
264 |     LOG.info("database '{}' prepared for metrics user: {}", db_name, metrics_user)
265 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wheel>=0.32.3
2 | setuptools>=40.8.0
3 | psycopg2-binary>=2.7
4 | logbook>=0.10.1
5 | pip>=9
6 | pyyaml>=3.11
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from setuptools import find_packages
15 | from setuptools import setup
16 |
17 |
18 | def requirements(f):
19 |     with open(f, 'r') as req_file:  # close the file instead of leaking the handle
20 |         reqs = req_file.read().splitlines()
21 |     return [r for r in reqs if not r.strip().startswith('#')]
22 |
23 |
24 | setup(name='postgresql-metrics',
25 |       version='0.3.3',
26 |       author=u'Hannu Varjoranta',
27 |       author_email='hannu.varjoranta@spotify.com',
28 |       url='https://github.com/spotify/postgresql-metrics',
29 |       description='Simple service to provide metrics for your PostgreSQL database',
30 |       packages=find_packages(),
31 |       install_requires=requirements('requirements.txt'),
32 |       entry_points={
33 |           'console_scripts': [
34 |               'postgresql-metrics=postgresql_metrics.metrics_logic:main',
35 |           ]}
36 |       )
37 |
--------------------------------------------------------------------------------
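A minimal end-to-end sketch of driving the collection programmatically, mirroring
what the 'all' CLI command in metrics_logic.py does (the config path is hypothetical,
and two passes are needed because derivative metrics only produce values on a diff):

    import time

    from postgresql_metrics.common import find_and_parse_config
    from postgresql_metrics.metrics_logic import (
        get_all_metrics_now,
        get_db_connections_with_conf,
    )

    conf = find_and_parse_config('/etc/postgresql-metrics/postgresql-metrics.yml')
    db_connections = get_db_connections_with_conf(conf)
    get_all_metrics_now(db_connections, conf)  # first pass primes derivative metrics
    time.sleep(5.0)
    for metric in get_all_metrics_now(db_connections, conf):
        print(metric)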