├── AUTHORS ├── ChangeLog ├── LICENSE ├── README.rst ├── TODO.md ├── contrib ├── Docker │ ├── Dockerfile │ └── conf_files │ │ ├── haproxystats.conf │ │ └── supervisor │ │ ├── haproxystats-process.ini │ │ └── haproxystats-pull.ini ├── ansible-playbook │ ├── group_vars │ │ └── all │ ├── hosts │ ├── main-playbook.yml │ └── roles │ │ ├── create-newuser │ │ └── tasks │ │ │ └── main.yml │ │ ├── haproxy-socket-dir-permission │ │ └── tasks │ │ │ └── main.yml │ │ ├── haproxystats-config-file │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── haproxystats.conf.j2 │ │ ├── install-dependency-package │ │ └── tasks │ │ │ └── main.yml │ │ ├── install-haproxystats │ │ └── tasks │ │ │ └── main.yml │ │ └── systemd-files │ │ ├── tasks │ │ └── main.yml │ │ └── templates │ │ ├── haproxystats-process.service.j2 │ │ └── haproxystats-pull.service.j2 ├── nagios │ ├── check_haproxystats_process.sh │ ├── check_haproxystats_process_number_of_procs.sh │ ├── check_haproxystats_pull.sh │ └── check_haproxystats_queue_size.py ├── puppet │ ├── manifests │ │ ├── init.pp │ │ └── params.pp │ └── templates │ │ ├── defaults.conf.erb │ │ ├── exclude_backend.conf.erb │ │ ├── exclude_frontend.conf.erb │ │ ├── haproxystats-process-monit-check.sh.erb │ │ ├── process-systemd-overwrites.conf.erb │ │ ├── process.conf.erb │ │ ├── pull-systemd-overwrites.conf.erb │ │ └── pull.conf.erb ├── systemd │ ├── haproxystats-process.service │ └── haproxystats-pull.service └── tcp_server.py ├── haproxystats-architecture.png ├── haproxystats.conf ├── haproxystats ├── __init__.py ├── metrics.py ├── process.py ├── pull.py └── utils.py ├── requirements.txt ├── setup.cfg └── setup.py /AUTHORS: -------------------------------------------------------------------------------- 1 | Christian Rovner 2 | Hossein 3 | Igor Vuk 4 | Jose Pedro Oliveira 5 | Marcin Deranek 6 | Marcin Deranek 7 | Patrick Kaeding 8 | Pavlos Parissis 9 | Pavlos Parissis 10 | hos7ein 11 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | CHANGES 2 | ======= 3 | 4 | 0.5.2 5 | ----- 6 | 7 | * RELEASE 0.5.2 version 8 | * Use the network target on the Systemd unit files 9 | * Calculate average only when actual connections have been made 10 | * Update binary path in systemd file for haproxystats-process service 11 | * Update binary path in systemd file for haproxystats-pull service 12 | * Update python packages for CentOS distribution 13 | 14 | 0.5.1 15 | ----- 16 | 17 | * RELEASE 0.5.1 version 18 | * Fix runtime error when specifying a custom list of metrics 19 | * Check liveness of workers and exit if at least one is dead 20 | * Fix formatting in README for group-namespace option 21 | 22 | 0.5.0 23 | ----- 24 | 25 | * RELEASE 0.5.0 version 26 | * Support grouping metrics by frontend, backend and server names 27 | 28 | 0.4.2 29 | ----- 30 | 31 | * RELEASE 0.4.2 version 32 | * Add missing comma 33 | 34 | 0.4.1 35 | ----- 36 | 37 | * RELEASE 0.4.1 version 38 | * Add supporting of ubuntu hosts and update Ansible playbook 39 | * Also report about total requests for backend 40 | * update readme file 41 | * update Ansible Playbook and readme file 42 | 43 | 0.4.0 44 | ----- 45 | 46 | * RELEASE 0.4.0 version 47 | * Handle the absent of daemon's metric 48 | * It's a counter, so should be in different section 49 | * Update the list of supported metrics 50 | * Use int rather string in the default values 51 | * Use int rather string in default value 52 | * Switch to 
correct section name for pycodestyle 53 | * PEP257 compatible docstrings 54 | * Switch from yarl.UR to urlparse from standard Python library 55 | * Update reference of UNIX sockets to include TCP 56 | * PEP257 compatible docstring 57 | * Simplify the return logic of two functions 58 | * Add configuration\_check\_for\_servers() 59 | * PEP257 compatible docstrings 60 | * Disable pylint warning for too-many-branches 61 | * Change the log severity to info for task report 62 | * Add support for pulling statistics from TCP sockets 63 | * Make that code more readable 64 | * PEP257 compatible docstrinigs 65 | * Fix typo in README 66 | 67 | 0.3.15 68 | ------ 69 | 70 | * RELEASE 0.3.15 version 71 | * Add dcon and dses in the list of FRONTEND\_METRICS 72 | * Add ctime in the list of SERVER\_AVG\_METRICS 73 | * Add chkdown in the list of server metrics 74 | * Add \`slim\` to server metrics, in addition to frontends and backends 75 | * Mention in README about ansible and move code under contrib 76 | * fix update readme file 77 | * - Add Ansible Playbook For Fast Deploy haproxystats On CentOS 7.x - Update README File 78 | * Fix a typo in README.rst 79 | * Place metric back in the queue in case of OSError exc 80 | * Log a message when graphite dispatcher queue is full 81 | * Make docstrings compatible with EP257 82 | 83 | 0.3.14 84 | ------ 85 | 86 | * RELEASE 0.3.14 version 87 | * Fix wrong calculation for Uptime\_sec 88 | 89 | 0.3.13 90 | ------ 91 | 92 | * RELEASE 0.3.13 version 93 | * Convert Idle\_pct to CpuUsagePct 94 | 95 | 0.3.12 96 | ------ 97 | 98 | * RELEASE 0.3.12 version 99 | * Fix yet another regression introduced by 9e78b6918 100 | * Fix indentation issues 101 | * Drop unused columns to safe memory 102 | * Fix performance regression introduced by 9e78b6918 103 | 104 | 0.3.11 105 | ------ 106 | 107 | * RELEASE 0.3.11 version 108 | * Compute standard deviation and percentiles for Idle\_pct 109 | * Sanitize values in pxname and svname columns 110 | * Simplify the way we exclude frontends/backends 111 | * Add TotalServer metrics for backend, close #4 112 | * Update URI references for HAProxy documentation 113 | 114 | 0.3.10 115 | ------ 116 | 117 | * RELEASE 0.3.10 version 118 | * Make sure we cancel get() coroutine when times out 119 | 120 | 0.3.9 121 | ----- 122 | 123 | * RELEASE 0.3.9 version 124 | * Avoid replacing '.' 
with '\_' in namespace setting 125 | * Avoid crashing when daemon statistics are inconsistent 126 | * Do a comparison to None singleton with is not 127 | * Rename variable epoch to timestamp 128 | * Place keywords in the same line 129 | 130 | 0.3.8 131 | ----- 132 | 133 | * RELEASE 0.3.8 version 134 | * Declare failure when zero UNIX socket are found 135 | * Rephrase various comments and log messages 136 | * Check queue length when it is safe to do 137 | * Remove shebang and add vim file encoding 138 | * Remove '%' prompt character from instructions 139 | * Move external software under contrib directory 140 | * Fix a typo in README 141 | * Add more keywords in setup.cfg 142 | * Update README 143 | * Ignore pep8 W503 144 | * Increase readability of code 145 | * Remove rst file extension for LICENSE 146 | * Add rst file extension for LICENSE 147 | * Update TODO 148 | * Add missing colons to declare a block 149 | * Update development instructions in README 150 | * Add instructions to build a development environemnt 151 | * Update TODO 152 | 153 | 0.3.7 154 | ----- 155 | 156 | * RELEASE 0.3.7 version 157 | * Include average metrics when processing per daemon 158 | * Remove redundant string replacement 159 | * Provide number of metrics which are sent 160 | * Fix a silly bug in conditional statement 161 | * Update docstrings/comments 162 | * Mention that WallClockTime includes sending time 163 | * Add/update docstrings and fix pylint warnings 164 | * Fix a but where we skip CompressBpsIn metric 165 | * Mention in documentation about SslConnPercentage 166 | * Calculate percentage also for SslCurrConns 167 | 168 | 0.3.6 169 | ----- 170 | 171 | * RELEASE 0.3.6 version 172 | * Produce statistics for haproxystats-process 173 | * Fix a typo for option name in the puppet module 174 | 175 | 0.3.5 176 | ----- 177 | 178 | * RELEASE 0.3.5 version 179 | * Remove check for accessibility for dst-dir and tmp-dst-dir 180 | * Disable some pylint checks 181 | * Calculate percentages for HAProxy workers 182 | * Use correct name 183 | * Add a missing comma 184 | * Update installation instructions 185 | 186 | 0.3.4 187 | ----- 188 | 189 | * RELEASE 0.3.4 version 190 | * Add items in TODO 191 | * Add home-page in setup.cfg 192 | * Make sure configuration has valid metric names 193 | * Remove done tasks from TODO 194 | 195 | 0.3.3 196 | ----- 197 | 198 | * RELEASE 0.3.3 version 199 | * Add a tcp\_server which simulates graphite-relay 200 | * Add support for per process statistics 201 | * Make it more clear that we perform aggregation 202 | 203 | 0.3.2 204 | ----- 205 | 206 | * RELEASE 0.3.2 version 207 | * Make sure we can \_\_init\_\_ method of parent class 208 | * Remove unnecessary fallback 209 | * Catch the case where config is in invalid format 210 | * Perform a sanity check on configuration 211 | * Disable pylint warning at the correct line 212 | * Add sanity checks for parameters in puppet module 213 | * Add queue-size parameter in the example config 214 | * Update puppet module 215 | 216 | 0.3.1 217 | ----- 218 | 219 | * RELEASE 0.3.1 version 220 | * Document queue-size parameter of pull section 221 | * Rephrase the paragraph in Queuing system 222 | * Use fallback to True for prefix-hostname 223 | * User hyphen rather underscore 224 | * Use correct key name for queue-size parameter 225 | * Add connect and write timeout to defaults/README 226 | 227 | 0.3.0 228 | ----- 229 | 230 | * RELEASE 0.3.0 version 231 | * Add support for excluding frontends and backends 232 | * Revert "Replace in README UTF-8 characters 
with plain text" 233 | * Refactor the loglevel part 234 | * Disable pylint checks for few cases 235 | * Make load\_file\_content to skip commented out lines 236 | * Catch the case when incoming directory disappears 237 | * Disable few pylint checks 238 | * Remove unnecessary fallback 239 | * Add aggr-server-metrics parameter in the defaults 240 | 241 | 0.2.1 242 | ----- 243 | 244 | * RELEASE 0.2.1 version 245 | * Increase severity to info, to easier debugging 246 | * Replace in README UTF-8 characters with plain text 247 | * Provide more accurate sleep time to avoid interval drifting 248 | 249 | 0.2.0 250 | ----- 251 | 252 | * RELEASE 0.2.0 version 253 | * Rephrase few sentences and fix spelling mistakes 254 | 255 | 0.1.16 256 | ------ 257 | 258 | * RELEASE 0.1.16 version 259 | * Compute averages rather sum for act/bck metrics 260 | * Update README 261 | * Remove README under nagios 262 | * Update nagios and monit checks 263 | * Update puppet code 264 | * Add minimum version for pyinotify in requirements 265 | * Update requirements.txt 266 | * Add a function to load content of file to a list 267 | * Catch OSError when pandas parses csv file 268 | 269 | 0.1.15 270 | ------ 271 | 272 | * RELEASE 0.1.15 version 273 | * Handle TCP socket failures in a better way 274 | * Remove unused variable 275 | * Rename decorator, retries --> retry\_on\_failures 276 | * Add docstrings in functions 277 | * Remove log\_hook as it is not needed anymore 278 | * Rearrange comments 279 | * Replace a for loop with a list comprehension 280 | * Remove unnecessary dependencies from Unit files 281 | 282 | 0.1.14 283 | ------ 284 | 285 | * RELEASE 0.1.14 286 | * Catch the case when FD is closed from outside 287 | * Update on puppet recipe 288 | * Add puppet classes 289 | * Add monit configuration 290 | * Updates on nagios checks 291 | * Update TODO 292 | 293 | 0.1.13 294 | ------ 295 | 296 | * RELEASE 0.1.13 297 | * Make configurable the aggregation of server's stats 298 | * Make metric names configurable 299 | * Rearrange metric name lists 300 | * Add nagios checks 301 | 302 | 0.1.12 303 | ------ 304 | 305 | * RELEASE 0.1.12 306 | * Report only the relevant sections for each program 307 | 308 | 0.1.11 309 | ------ 310 | 311 | * RELEASE 0.1.11 version 312 | * Fix regression introduced by 5d1003ca 313 | 314 | 0.1.10 315 | ------ 316 | 317 | * RELEASE 0.1.10 version 318 | * Use float rather int for measuring wall clock 319 | * Report wallclock time for processing statistics 320 | * Add support for connect and write timeouts 321 | * Remove metrics from utils.py 322 | * Move metrics to a separate module 323 | * Update docstrings/comments 324 | 325 | 0.1.9 326 | ----- 327 | 328 | * RELEASE 0.1.9 version 329 | * Remove unnecessary server metrics 330 | * Disable computation of stats across all backends 331 | 332 | 0.1.8 333 | ----- 334 | 335 | * RELEASE 0.1.8 version 336 | * Introduce queue-size for pull program 337 | * Log exception when we fail to send data to graphite 338 | * Avoid leaking FD when connection timeout 339 | * Log when we close TCP connection to graphite 340 | * Rewrite comment 341 | 342 | 0.1.7 343 | ----- 344 | 345 | * RELEASE 0.1.7 version 346 | * Set timeout on TCP socket for graphite handler 347 | 348 | 0.1.6 349 | ----- 350 | 351 | * RELEASE 0.1.6 version 352 | * Avoid a crash if incoming directory can't be removed 353 | * Log version on startup 354 | 355 | 0.1.5 356 | ----- 357 | 358 | * RELEASE 0.1.5 359 | * Catch few more Connection exceptions on close 360 | * Report wall clock time for pulling 
statistics 361 | * Emit an error when we fail to remove temporary dir 362 | * Add retry logic on connections to UNIX sockets 363 | * Warn if tasks are canceled when reach pull-timeout 364 | * More reasonable values for pull retry logic 365 | * Update TODO 366 | * Fix a regression introduced by b61ce6bcc 367 | * Remove unused variable 368 | * Remove unnecessary space 369 | * Rename parameter signal to signalname 370 | * Rearrange log messages when parsing sites stats 371 | * Rearrange log messages when parsing daemon stats 372 | * Error if Pandas data frame is empty 373 | * Prevent passing empty data to Pandas 374 | * Reorder log messages and change severities 375 | * Warn if data directory doesn't have any files 376 | 377 | 0.1.4 378 | ----- 379 | 380 | * RELEASE 0.1.4 381 | * Catch socket.timeout 382 | * Log filenames in debug mode 383 | 384 | 0.1.3 385 | ----- 386 | 387 | * RELEASE 0.1.3 388 | * Remove old data file from temporary directory 389 | 390 | 0.1.2 391 | ----- 392 | 393 | * RELEASE 0.1.2 394 | * Introduce pull-timeout 395 | * Keep isolate shutdown and write\_file functions 396 | * Catch timeout during the connection 397 | 398 | 0.1.1 399 | ----- 400 | 401 | * RELEASE 0.1.1 402 | * Rework the retries decorator 403 | 404 | 0.1.0 405 | ----- 406 | 407 | * RELEASE 0.1.0 408 | * Shield workers from death due to errors from dispatchers 409 | 410 | 0.0.5 411 | ----- 412 | 413 | * RELEASE 0.0.5 version 414 | * Don't try to process empty Pandas data frame 415 | 416 | 0.0.4 417 | ----- 418 | 419 | * RELEASE 0.0.4 version 420 | * Break early when STOP item is fetched from queue 421 | * Don't enable local-store by default 422 | * Perform a clean shutdown in case no tasks are running 423 | * Wait for socket files to be created 424 | * Don't proceed if watched dir isn't created 425 | * Introduce a configuration parameter for process 426 | * Add systemd unit files 427 | 428 | 0.0.3 429 | ----- 430 | 431 | * RELEASE 0.0.3 version 432 | * Don't include 'haproxy' in graphite path 433 | * Fix typo in the section name 434 | 435 | 0.0.2 436 | ----- 437 | 438 | * RELEASE 0.0.2 version 439 | * Fix a directory pathname in the defaults 440 | * Insert also 'haproxy' in the graphite namespace 441 | * Add requirements file 442 | 443 | 0.0.1 444 | ----- 445 | 446 | * RELEASE 0.0.1 version 447 | * Change suffix for README in setup.cfg 448 | * The 1st functional version of the program:-) 449 | * Initial commit 450 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. README.rst 2 | 3 | ============ 4 | haproxystats 5 | ============ 6 | 7 | *A HAProxy statistics collection program* 8 | 9 | .. contents:: 10 | 11 | Introduction 12 | ------------ 13 | 14 | **haproxystats** is a statistics collector for `HAProxy`_ load balancer which 15 | processes various statistics and pushes them to graphing systems (Graphite). 16 | It is designed to satisfy the following requirements: 17 | 18 | #. Fast and configurable processing of HAProxy statistics 19 | #. Perform aggregation when HAProxy runs in multiprocess (nbproc > 1) 20 | #. Pull statistics at very low intervals (10secs) 21 | #. Flexible dispatching of statistics to different systems (Graphite, kafka) 22 | 23 | The main design characteristic is the split between pulling the statistics and 24 | processing them. This provides the ability to pull data as frequently 25 | as possible without worrying about the impact on processing time. It also 26 | reduces the risk of losing data in case of trouble during the processing phase. 27 | 28 | It runs locally on each load balancer node, offering a decentralized setup for 29 | the processing phase, but it can be easily extended in the future to have a 30 | centralized setup for the processing phase. In that centralized setup it will 31 | be possible to perform aggregation on a cluster level as well. 32 | Until then users can deploy `carbon-c-relay`_ for aggregation. 33 | 34 | Because of this design haproxystats comes with two programs: 35 | **haproxystats-pull** and **haproxystats-process**. The former pulls 36 | statistics from HAProxy via `stats socket`_ and it uses the `asyncio`_ framework 37 | from Python to achieve high concurrency and low footprint. The latter 38 | processes the statistics and pushes them to various destinations. It utilizes 39 | `Pandas`_ for data analysis and the multiprocess framework from Python. 40 | 41 | haproxystats requires Python 3.4, docopt and Pandas to be available in the 42 | system. 43 | 44 | How haproxystats works 45 | ---------------------- 46 | 47 | 48 | .. image:: haproxystats-architecture.png 49 | 50 | 51 | haproxystats-pull sends `info`_ and `stat`_ commands to all haproxy processes 52 | in order to collect statistics for the daemon and for all 53 | frontends/backends/servers. Data returned from each process and for each 54 | command is stored in individual files which are saved under one directory. The 55 | time (seconds since the epoch) of retrieval is used to name that directory. 56 | haproxystats-process watches for changes on the parent directory and when a 57 | directory is created it adds its full path to the queue. Multiple workers pick 58 | up items (directories) from the queue and process statistics from those 59 | directories. 
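Concretely, the data haproxystats-pull gathers is what HAProxy returns for the ``show info`` and
``show stat`` commands on its stats socket. As a rough illustration (this is not the project's code,
which is built on asyncio; the socket path is only an example), the same exchange can be reproduced
with a few lines of Python::

    import socket

    def fetch(socket_file, command):
        """Send one command to a HAProxy stats socket and return the raw reply."""
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.connect(socket_file)
            sock.sendall(command.encode() + b'\n')
            data = b''
            while True:
                chunk = sock.recv(4096)
                if not chunk:
                    break
                data += chunk
        return data.decode()

    # one pair of files per HAProxy process, e.g. admin1.sock_info / admin1.sock_stat
    print(fetch('/run/haproxy/admin1.sock', 'show info'))
    print(fetch('/run/haproxy/admin1.sock', 'show stat'))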
60 | 61 | haproxystats-pull 62 | ################# 63 | 64 | haproxystats-pull leverages the `asyncio`_ framework from Python by utilizing 65 | coroutines to multiplex I/O access over several `stats socket`_, which are 66 | simple UNIX and TCP sockets. 67 | 68 | The actual task of storing the data to the file system is off-loaded to a very 69 | light `pool of threads`_ in order to avoid blocking the coroutines during the 70 | disk IO phase. 71 | 72 | haproxystats-pull manages the *incoming* directory and makes sure directories 73 | are created with correct names. It also suspends the collection when the number 74 | of directories under the *incoming* directory exceeds a threshold. This avoids 75 | filling up the disk when haproxystats-process is unavailable for sometime. 76 | This an example of directory structure: 77 | 78 | .. code-block:: bash 79 | 80 | incoming 81 | ├── 1457298067 82 | │   ├── admin1.sock_info 83 | │   ├── admin1.sock_stat 84 | │   ├── admin2.sock_info 85 | │   ├── admin2.sock_stat 86 | │   ├── admin3.sock_info 87 | │   ├── admin3.sock_stat 88 | │   ├── admin4.sock_info 89 | │   └── admin4.sock_stat 90 | └── 1457298072 91 | ├── admin1.sock_info 92 | ├── admin1.sock_stat 93 | ├── admin2.sock_info 94 | ├── admin2.sock_stat 95 | ├── admin3.sock_info 96 | ├── admin3.sock_stat 97 | ├── admin4.sock_info 98 | └── admin4.sock_stat 99 | 100 | haproxystats-process 101 | #################### 102 | 103 | haproxystats-process is a multiprocess program. The parent process uses the 104 | Linux kernel's `inotify`_ API to watch for changes in *incoming* directory. 105 | 106 | It receives an event when a directory is either created or moved in *incoming* 107 | directory. The event contains the absolute path name of that directory. It 108 | maintains an internal queue in which it puts directory names. Multiple child 109 | processes pick directory names from the queue and process the data. 110 | 111 | Its worker dispatches statistics to various destinations. The directories are 112 | removed from *incoming* directory when all statistics are successfully 113 | processed. 114 | 115 | When haproxystats-process starts it scans the *incoming* directory 116 | for new directories and processes them instantly, so you don't lose statistics 117 | if haproxystats-process is unavailable for sometime. 118 | 119 | Dispatchers 120 | ########### 121 | 122 | haproxystats-process currently supports 2 different dispatchers. 123 | 124 | 1. **Graphite** 125 | 126 | Pushes statistics to a Graphite system via a local or remote carbon-relay. 127 | The recommended method is to use `carbon-c-relay`_. It is very fast and capable 128 | of handling millions of metrics per second. This dispatcher utilizes an internal 129 | queue to store metrics which are failed to be sent to Graphite. 130 | 131 | An example of graphite namespace:: 132 | 133 | ..haproxy.frontend.. 134 | ..haproxy.backend.. 135 | ..haproxy.backend..server. 136 | ..haproxy.server.. 137 | ..haproxy.daemon. 138 | ..haproxy.haproxystats.. 139 | 140 | 2. **local-store** 141 | 142 | Stores statistics in the local disk. Use it only for debugging purposes. 143 | 144 | Statistics for HAProxy 145 | ###################### 146 | 147 | In addition the statistics that are exposed by HAProxy, haproxystats provides 148 | the following statistics. 149 | 150 | HAProxy process 151 | ~~~~~~~~~~~~~~~ 152 | 153 | HAProxy exposes Idle_pct and haproxystats-process converts it to CPU 154 | utilization without removing Idle_pct metric. 
This avoids the usage of 155 | scale(-1) and offset(100) functions on graphite:: 156 | 157 | CpuUsagePct CPU utilization in percentage 158 | 159 | The following metrics are calculated only when HAProxy is configured with more 160 | than 1 processes (nbproc > 1):: 161 | 162 | 25PercentileCpuUsagePct 25th percentile of CpuUsagePct across all processes 163 | 50PercentileCpuUsagePct 50th percentile -//- 164 | 75PercentileCpuUsagePct 75th percentile -//- 165 | 95PercentileCpuUsagePct 95th percentile -//- 166 | 99PercentileCpuUsagePct 99th percentile -//- 167 | StdCpuUsagePct standard deviation -//- 168 | 169 | Queuing system 170 | ############## 171 | 172 | The *incoming* directory together with the inotify API provides a simple 173 | queueing system which is used as a communication channel between 174 | haproxystats-pull and haproxystats-process programs. 175 | 176 | There isn't any feedback mechanism in place, thus haproxystats-pull monitors 177 | the number of directories before it pulls data from HAProxy and suspends its 178 | job when the number of directories exceeds a threshold. 179 | 180 | See **queue-size** parameter of **pull** section. 181 | 182 | Statistics for haproxystats 183 | ########################### 184 | 185 | **haproxystats** provides statistics for the time it takes to process, 186 | calculate and send HAProxy metrics. By default provides the following list 187 | of metric names with values in seconds:: 188 | 189 | loadbalancers.lb-01.haproxy.haproxystats.WallClockTimeHAProxy 190 | loadbalancers.lb-01.haproxy.haproxystats.WallClockTimeFrontends 191 | loadbalancers.lb-01.haproxy.haproxystats.WallClockTimeBackends 192 | loadbalancers.lb-01.haproxy.haproxystats.WallClockTimeServers 193 | loadbalancers.lb-01.haproxy.haproxystats.WallClockTimeAllStats 194 | 195 | It also provides the number of metrics which are send to graphite:: 196 | 197 | loadbalancers.lb-01.haproxy.haproxystats.MetricsHAProxy 198 | loadbalancers.lb-01.haproxy.haproxystats.MetricsFrontend 199 | loadbalancers.lb-01.haproxy.haproxystats.MetricsBackend 200 | loadbalancers.lb-01.haproxy.haproxystats.MetricsServer 201 | 202 | Configuration 203 | ------------- 204 | 205 | haproxystats uses the popular `INI`_ format for its configuration file. 206 | This is an example configuration file (/etc/haproxystats.conf):: 207 | 208 | 209 | [DEFAULT] 210 | loglevel = info 211 | retries = 2 212 | timeout = 1 213 | interval = 2 214 | 215 | [paths] 216 | base-dir = /var/lib/haproxystats 217 | 218 | [pull] 219 | loglevel = info 220 | socket-dir = /run/haproxy 221 | retries = 1 222 | timeout = 0.1 223 | interval = 0.5 224 | pull-timeout = 2 225 | pull-interval = 10 226 | dst-dir = ${paths:base-dir}/incoming 227 | tmp-dst-dir = ${paths:base-dir}/incoming.tmp 228 | workers = 8 229 | queue-size = 360 230 | 231 | [process] 232 | src-dir = ${paths:base-dir}/incoming 233 | workers = 4 234 | per-process-metrics = false 235 | 236 | [graphite] 237 | server = 127.0.0.1 238 | port = 3002 239 | retries = 3 240 | interval = 1.8 241 | connect-timeout = 1.0 242 | write-timeout = 1.0 243 | delay = 10 244 | backoff = 2 245 | namespace = loadbalancers 246 | prefix-hostname = true 247 | fqdn = true 248 | queue-size = 1000000 249 | 250 | #[local-store] 251 | #dir = ${paths:base-dir}/local-store 252 | 253 | All the above settings are optional as haproxystats comes with default values 254 | for all of them. Thus, both programs can be started without supplying any 255 | configuration. 
256 | 257 | DEFAULT section 258 | ############### 259 | 260 | Settings in this section can be overwritten in other sections. 261 | 262 | * **loglevel** Defaults to **info** 263 | 264 | Log level to use, possible values are: debug, info, warning, error, critical 265 | 266 | * **retries** Defaults to **2** 267 | 268 | Number of times to retry a connection after a failure. Used by haproxystats-pull 269 | and haproxystats-process when they open a connection to a UNIX/TCP socket and 270 | Graphite respectively. 271 | 272 | * **timeout** Defaults to **1** (seconds) 273 | 274 | Time to wait for establishing a connection. Used by haproxystats-pull and 275 | haproxystats-process when they open a connection to a UNIX/TCP socket and Graphite 276 | respectively. 277 | 278 | * **interval** Defaults to **2** 279 | 280 | Time to wait before trying to open a connection. Used by haproxystats-pull and 281 | haproxystats-process when they retry a connection to a UNIX/TCP socket and Graphite 282 | respectively. 283 | 284 | paths section 285 | ############# 286 | 287 | * **base-dir** Defaults to **/var/lib/haproxystats** 288 | 289 | The directory to use as the base of the directory structure. 290 | 291 | pull section 292 | ############ 293 | 294 | * **socket-dir** Unset by default 295 | 296 | A directory with HAProxy socket files. 297 | 298 | * **servers** Unset by default 299 | 300 | A list of servers to pull statistics from. You define a server by passing a URL, 301 | here some examples:: 302 | 303 | tcp://127.0.0.1:5555 304 | tcp://foo.bar.com:4444 305 | tcp://[fe80::3f2f:46b3:ef0c:a420]:4444 306 | unix:///run/haproxy.sock 307 | 308 | Only TCP and UNIX schemes are supported and the port for TCP servers **must** 309 | be set. For UNIX scheme you can only pass a file and not a directory, but 310 | **socket-dir** option can be set as well, so you can use a directory and UNIX 311 | socket files at the same time. You can use comma as separator to pass multiple 312 | servers:: 313 | 314 | servers = unix:///run/haproxy.sock,tcp://127.0.0.1:555,tcp://127.0.0.1:556 315 | 316 | * **buffer-limit** Defaults to **6291456** (bytes) 317 | 318 | At most size bytes are read and returned from the sockets. Setting too low and 319 | it will slow down the retrieval of statistics. 320 | Only values greater than or equal to 1 are accepted. 321 | 322 | * **retries** Defaults to **1** 323 | 324 | Number of times to reconnect to UNIX/TCP socket after a failure. 325 | 326 | * **timeout** Defaults to **0.1** (seconds) 327 | 328 | Time to wait for establishing a connection to UNIX/TCP socket. There is no need to 329 | set it higher than few ms as haproxy accepts a connection within 1-2ms. 330 | 331 | * **interval** Defaults to **0.5** (seconds) 332 | 333 | Time to wait before trying to reconnect to UNIX/TCP socket after a failure. Tune it 334 | based on the duration of the reload process of haproxy. haproxy reloads within 335 | few ms but in some environments with hundreds different SSL certificates it can 336 | take a bit more. 337 | 338 | * **pull-interval** Defaults to **10** (seconds) 339 | 340 | How often to pull statistics from HAProxy. A value of *1* second can overload 341 | the haproxy processes in environments with thousands backends/servers. 342 | 343 | * **pull-timeout** Defaults to **2** (seconds) 344 | 345 | Total time to wait for the pull process to finish. Should be always less than 346 | **pull-interval**. 
347 | 348 | * **dst-dir** Defaults **/var/lib/haproxystats/incoming** 349 | 350 | A directory to store statistics retrieved by HAProxy. 351 | 352 | * **tmp-dst-dir** Defaults **/var/lib/haproxystats/incoming.tmp** 353 | 354 | A directory to use as temporary storage location before directories are moved 355 | to **dst-dir**. haproxystats-pull stores statistics for each process under 356 | that directory and only when data from all haproxy processes are successfully 357 | retrieved they are moved to **dst-dir**. Make sure **dst-dir** and 358 | **tmp-dst-dir** are on the same file system, so the move of the directories 359 | become a rename which is a quick and atomic operation. 360 | 361 | * **workers** Defaults to **8** 362 | 363 | Number of threads to use for writing statistics to disk. These are very 364 | light threads and don't consume a lot of resources. Shouldn't be set higher 365 | than the number of haproxy processes. 366 | 367 | * **queue-size** Defaults to **360** 368 | 369 | Suspend the pulling of statistics when the number of directories in **dst-dir** 370 | exceeds this limit. 371 | 372 | process section 373 | ############### 374 | 375 | * **src-dir** Defaults **/var/lib/haproxystats/incoming** 376 | 377 | 378 | A directory to watch for changes. It should point to the same directory as 379 | the **dst-dir** option from *pull* section. 380 | 381 | * **workers** Defaults to **4** 382 | 383 | Number of workers to use for processing statistics. These are real processes 384 | which can consume a fair bit of CPU. 385 | 386 | * **frontend-metrics** Unset by default 387 | 388 | A list of frontend metric names separated by space to process. By default all 389 | statistics are processed and this overwrites the default selection. 390 | 391 | haproxystats-process emits an error and refuses to start if metrics aren't 392 | valid HAProxy metrics. Check the list of valid metrics in Chapter 9.1 of 393 | `management`_ documentation of HAProxy. 394 | 395 | * **backend-metrics** Unset by default 396 | 397 | A list of backend metric names separated by space to process. By default all 398 | statistics are processed and this overwrites the default selection. 399 | 400 | haproxystats-process emits an error and refuses to start if metrics aren't 401 | valid HAProxy metrics. Check the list of valid metrics in Chapter 9.1 of 402 | `management`_ documentation of HAProxy. 403 | 404 | * **server-metrics** Unset by default 405 | 406 | A list of server metric names separated by space to process. By default all 407 | statistics are processed and this overwrites the default selection. 408 | 409 | haproxystats-process emits an error and refuses to start if metrics aren't 410 | valid HAProxy metrics. Check the list of valid metrics in Chapter 9.1 of 411 | `management`_ documentation of HAProxy. 412 | 413 | * **aggr-server-metrics** Defaults to **false** 414 | 415 | Aggregates server's statistics across all backends. 416 | 417 | * **exclude-frontends** Unset by default 418 | 419 | A file which contains one frontend name per line for which processing is 420 | skipped. 421 | 422 | * **exclude-backends** Unset by default 423 | 424 | A file which contains one backend name per line for which processing is 425 | skipped. 426 | 427 | * **per-process-metrics** Defaults to **false** 428 | 429 | HAProxy daemon provides statistics and by default **haproxystat-process** 430 | aggregates those statistics when HAProxy runs in multiprocess mode 431 | (nbproc > 1). 
432 | 433 | Set this to **true** to get those statistics also per process as well. 434 | This is quite useful for monitoring purposes where someone wants to monitor 435 | sessions per process in order to see if traffic is evenly distributed to all 436 | processes by the kernel. 437 | 438 | It is also useful in setups where configuration for frontends and backends is 439 | unevenly spread across all processes, for instance processes 1-4 manage SSL 440 | frontends and processes 5-7 manage noSSL frontends. 441 | 442 | This adds another path in Graphite under haproxy space:: 443 | 444 | loadbalancers.lb-01.haproxy.daemon.process.. 445 | 446 | * **calculate-percentages** Defaults to **false** 447 | 448 | Calculates percentages for a selection of metrics for HAProxy daemon. When 449 | **per-process-metrics** is set to **true** the calculation happens also per 450 | HAProxy process. This adds the following metric names:: 451 | 452 | ConnPercentage 453 | ConnRatePercentage 454 | SslRatePercentage 455 | SslConnPercentage 456 | 457 | Those metrics can be used for alerting when the current usage on connections 458 | is very close the configured limit. 459 | 460 | * **liveness-check-interval** Defaults to **10** (seconds) 461 | 462 | How often to check if all workers are alive and trigger a termination if at 463 | least one is dead. 464 | 465 | graphite section 466 | ################ 467 | 468 | This dispatcher **is enabled** by default and it can't be disabled. 469 | 470 | * **server** Defaults to **127.0.0.1** 471 | 472 | Graphite server to connect to. 473 | 474 | * **port** Defaults to **3002** 475 | 476 | Graphite port to connect to. 477 | 478 | * **retries** Defaults to **3** 479 | 480 | Number of times to reconnect to Graphite after a failure. 481 | 482 | * **interval** Defaults to **1.8** (seconds) 483 | 484 | Time to wait before trying to reconnect to Graphite after a failure. 485 | 486 | * **connect-timeout** Defaults to **1** (seconds) 487 | 488 | Time to wait for establishing a connection to Graphite relay. 489 | 490 | * **write-timeout** Defaults to **1** (seconds) 491 | 492 | Time to wait on sending data to Graphite relay. 493 | 494 | * **delay** Defaults to **10** (seconds) 495 | 496 | How long to wait before trying to connect again after number of retries has 497 | exceeded the threshold set in **retries**. During the delay period metrics are 498 | stored in the queue of the dispatcher, see **queue-size**. 499 | 500 | * **backoff** Defaults to **2** 501 | 502 | A simple exponential backoff to apply for each retry. 503 | 504 | * **namespace** Defaults to **loadbalancers** 505 | 506 | A top level graphite namespace. 507 | 508 | * **prefix-hostname** Defaults to **true** 509 | 510 | Insert the hostname of the load balancer in the Graphite namespace, example:: 511 | 512 | loadbalancers.lb-01.haproxy. 513 | 514 | * **fqdn** Defaults to **true** 515 | 516 | Use FQDN or short name in the graphite namespace 517 | 518 | * **queue-size** Defaults to **1000000** 519 | 520 | haproxystats-process uses a queue to store metrics which failed to be sent due 521 | to a connection error/timeout. This is a First In First Out queueing system. 522 | When the queue reaches the limit, the oldest items are removed to free space. 523 | 524 | * **group-namespace** Unset by default. 525 | 526 | group graphite metrics by patterns. 
When a frontend, backend or server matches a given pattern, the metric will be prefixed by this
namespace, plus a configurable group name which must be specified in the **frontend-groups**,
**backend-groups** or **server-groups** sections. These sections consist of group names and their
corresponding regular expressions that will be matched against frontend, backend or server names
(depending on the section).

For example:

Let's assume our metrics look something like::

    loadbalancers.lb-01.haproxy.frontend.foo-001.
    loadbalancers.lb-01.haproxy.frontend.foo-002.
    ...
    loadbalancers.lb-01.haproxy.frontend.bar-001.
    loadbalancers.lb-01.haproxy.frontend.bar-002.
    ...

And we want them to be grouped like this::

    loadbalancers.lb-01.haproxy.flavor.abc.frontend.foo-001.
    loadbalancers.lb-01.haproxy.flavor.abc.frontend.foo-002.
    ...
    loadbalancers.lb-01.haproxy.flavor.xyz.frontend.bar-001.
    loadbalancers.lb-01.haproxy.flavor.xyz.frontend.bar-002.
    ...

The configuration should contain these settings::

    [graphite]
    group-namespace = flavor

    [frontend-groups]
    abc = ^foo-
    xyz = ^bar-

Note that if the **group-namespace** setting is specified, then at least one of the
**frontend-groups**, **backend-groups** or **server-groups** sections must be specified as well.

Also note that if frontend, backend or server names contain dots, these will be converted to
underscores for graphite -- because dots are graphite's namespace separator. The patterns will have
to take this into account.

* **group-namespace-double-writes** Unset by default.

Boolean; required only if **group-namespace** is specified. If True, send to graphite the original
metric as well as the grouped metrics. If False, send only the grouped metrics.
(See **group-namespace**.)

frontend-groups, backend-groups, and server-groups sections
###########################################################

Specify the patterns to match against frontend, backend and/or server names, to group graphite
metrics and give them a variable prefix. See **group-namespace**.

These sections are optional, unless **group-namespace** is set.

local-store section
###################

This dispatcher **isn't** enabled by default.

The primary use of the local-store dispatcher is to debug/troubleshoot possible problems with the
processing and/or with Graphite. There isn't any clean-up process in place, thus you need to remove
the files after they are created. Don't leave it enabled for more than 1 hour as it can easily fill
up the disk in environments with hundreds of frontends/backends and thousands of servers.

* **dir** Defaults to **/var/lib/haproxystats/local-store**

A directory to store statistics after they have been processed. The current format is compatible
with Graphite.

Systemd integration
-------------------

haproxystats-pull and haproxystats-process are simple programs which are not daemonized and output
logging messages to stdout. This is by design as it simplifies the code. Daemonization and logging
are off-loaded to systemd, which has everything we need for that job.
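For a rough idea of what such integration looks like, a unit of the following shape is enough to run
haproxystats-pull under systemd (a sketch only; the binary path and user shown here are assumptions)::

    # illustrative only; see contrib/systemd for the real unit files
    [Unit]
    Description=haproxystats-pull
    After=network.target

    [Service]
    User=haproxystats
    ExecStart=/usr/bin/haproxystats-pull -f /etc/haproxystats.conf
    Restart=on-failure

    [Install]
    WantedBy=multi-user.target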
Ready-made service files for both programs are available under the contrib/systemd directory. These
are functional systemd Unit files which are used in production.

The order in which these two programs start doesn't matter and there isn't any soft or hard
dependency between them.

Furthermore, these programs don't need to run as root. It is highly recommended to create a
dedicated user to run them. You need to add that user to the *haproxy* group and adjust the socket
configuration of haproxy to allow write access for the group; see the example configuration below::

    stats socket /run/haproxy/sock1 user haproxy group haproxy mode 660 level admin process 1
    stats socket /run/haproxy/sock2 user haproxy group haproxy mode 660 level admin process 2
    stats socket /run/haproxy/sock3 user haproxy group haproxy mode 660 level admin process 3

The systemd Unit files use the haproxystats user, which has to be created prior to running the
haproxystats programs.

Graceful shutdown
-----------------

In an effort to reduce the loss of statistics, both programs support graceful shutdown. When
*SIGHUP* or *SIGTERM* signals are sent they perform a clean exit. When a signal is sent to
haproxystats-process it may take some time for the program to exit, as it waits for all workers to
empty the queue.

Puppet module
-------------

A Puppet module is available under the contrib directory which provides classes for configuring both
programs.

Because haproxystats-process is a CPU-bound program, CPU affinity is configured using systemd. By
default it pins the workers to the last CPUs.

You should take care of pinning haproxy processes to other CPUs in order to avoid
haproxystats-process *stealing* CPU cycles from haproxy. In production servers you usually pin the
first 80% of the CPUs to haproxy processes and leave the rest of the CPUs for other processes. The
default template of the Puppet module enforces this logic.

haproxystats-pull is a single-threaded program which doesn't use a lot of CPU cycles and by default
is assigned to the last CPU.

Ansible Playbook
----------------

An Ansible playbook is available under the contrib directory. For installation instructions please
read the Installation chapter of this document.

Nagios checks
-------------

Several Nagios checks are provided for monitoring purposes; they can be found under the
contrib/nagios directory.

* check_haproxystats_process_number_of_procs.sh

Monitors the number of processes of the haproxystats-process program. Systemd monitors only the
parent process and this check helps to detect cases where some worker(s) die unexpectedly.

* check_haproxystats_process.sh

A wrapper around the systemctl tool to detect a dead parent process.

* check_haproxystats_pull.sh

A wrapper around the systemctl tool to check if haproxystats-pull is running.

* check_haproxystats_queue_size.py

Checks the size of the *incoming* directory queue, which is consumed by haproxystats-process, and
alerts when it exceeds a threshold.
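The logic of the queue-size check is simple enough to sketch in a few lines of Python; the thresholds
and paths below are made up for illustration and the shipped script remains the authoritative
version::

    #!/usr/bin/env python3
    """Illustrative queue-size check; thresholds and paths are assumptions."""
    import os
    import sys

    INCOMING = '/var/lib/haproxystats/incoming'
    WARN, CRIT = 100, 300  # hypothetical thresholds

    try:
        # each directory under incoming is one pending batch of statistics
        size = sum(1 for entry in os.scandir(INCOMING) if entry.is_dir())
    except OSError as exc:
        print('UNKNOWN: {}'.format(exc))
        sys.exit(3)

    if size >= CRIT:
        print('CRITICAL: queue size {}'.format(size))
        sys.exit(2)
    elif size >= WARN:
        print('WARNING: queue size {}'.format(size))
        sys.exit(1)
    print('OK: queue size {}'.format(size))
    sys.exit(0)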
682 | 683 | 684 | Starting the programs 685 | --------------------- 686 | 687 | :: 688 | 689 | haproxystats-pull -f ./haproxystats.conf 690 | 691 | :: 692 | 693 | haproxystats-process -f ./haproxystats.conf 694 | 695 | Usage:: 696 | 697 | % haproxystats-pull -h 698 | Pulls statistics from HAProxy daemon over UNIX socket(s) 699 | 700 | Usage: 701 | haproxystats-pull [-f ] [-p | -P] 702 | 703 | Options: 704 | -f, --file configuration file with settings 705 | [default: /etc/haproxystats.conf] 706 | -p, --print show default settings 707 | -P, --print-conf show configuration 708 | -h, --help show this screen 709 | -v, --version show version 710 | 711 | 712 | % haproxystats-process -h 713 | Processes statistics from HAProxy and pushes them to Graphite 714 | 715 | Usage: 716 | haproxystats-process [-f ] [-d ] [-p | -P] 717 | 718 | Options: 719 | -f, --file configuration file with settings 720 | [default: /etc/haproxystats.conf] 721 | -d, --dir directory with additional configuration files 722 | -p, --print show default settings 723 | -P, --print-conf show configuration 724 | -h, --help show this screen 725 | -v, --version show version 726 | 727 | 728 | Development 729 | ----------- 730 | I would love to hear what other people think about **haproxystats** and provide 731 | feedback. Please post your comments, bug reports and wishes on my `issues page 732 | `_. 733 | 734 | How to setup a development environment 735 | ###################################### 736 | 737 | Install HAProxy:: 738 | 739 | % sudo apt-get install haproxy 740 | 741 | Use a basic HAProxy configuration in multiprocess mode:: 742 | 743 | global 744 | log 127.0.0.1 len 2048 local2 745 | chroot /var/lib/haproxy 746 | stats socket /run/haproxy/admin1.sock mode 666 level admin process 1 747 | stats socket /run/haproxy/admin2.sock mode 666 level admin process 2 748 | stats socket /run/haproxy/admin3.sock mode 666 level admin process 3 749 | stats socket /run/haproxy/admin4.sock mode 666 level admin process 4 750 | # allow read/write access to anyone----------^ 751 | stats timeout 30s 752 | user haproxy 753 | group haproxy 754 | daemon 755 | nbproc 4 756 | cpu-map 1 0 757 | cpu-map 2 1 758 | cpu-map 3 1 759 | cpu-map 4 0 760 | 761 | defaults 762 | log global 763 | mode http 764 | timeout connect 5000 765 | timeout client 50000 766 | timeout server 50000 767 | 768 | frontend frontend_proc1 769 | bind 0.0.0.0:81 process 1 770 | default_backend backend_proc1 771 | 772 | frontend frontend_proc2 773 | bind 0.0.0.0:82 process 2 774 | default_backend backend_proc1 775 | 776 | frontend frontend1_proc34 777 | bind :83 process 3 778 | bind :83 process 4 779 | default_backend backend1_proc34 780 | 781 | backend backend_proc1 782 | bind-process 1 783 | default-server inter 1000s 784 | option httpchk GET / HTTP/1.1\r\nHost:\ .com\r\nUser-Agent:\ HAProxy 785 | server member1_proc1 10.189.224.169:80 weight 100 check fall 2 rise 3 786 | server member2_proc1 10.196.70.109:80 weight 100 check fall 2 rise 3 787 | server bck_all_srv1 10.196.70.109:88 weight 100 check fall 2 rise 3 788 | 789 | backend backend1_proc34 790 | bind-process 3,4 791 | default-server inter 1000s 792 | option httpchk GET / HTTP/1.1\r\nHost:\ .com\r\nUser-Agent:\ HAProxy 793 | server bck1_proc34_srv1 10.196.70.109:80 check fall 2 inter 5s rise 3 794 | server bck1_proc34_srv2 10.196.70.109:80 check fall 2 inter 5s rise 3 795 | server bck_all_srv1 10.196.70.109:80 check fall 2 inter 5s rise 3 796 | 797 | backend backend_proc2 798 | bind-process 2 799 | default-server inter 1000s 800 
| option httpchk GET / HTTP/1.1\r\nHost:\ .com\r\nUser-Agent:\ HAProxy
801 | server bck_proc2_srv1_proc2 127.0.0.1:8001 check fall 2 inter 5s rise 3
802 | server bck_proc2_srv2_proc2 127.0.0.1:8002 check fall 2 inter 5s rise 3
803 | server bck_proc2_srv3_proc2 127.0.0.1:8003 check fall 2 inter 5s rise 3
804 | server bck_proc2_srv4_proc2 127.0.0.1:8004 check fall 2 inter 5s rise 3
805 | 
806 | Start HAProxy and check that it is up::
807 | 
808 | sudo systemctl start haproxy.service;systemctl status -l haproxy.service
809 | 
810 | Create a Python virtual environment using the virtualenvwrapper tool::
811 | 
812 | mkvirtualenv --python=`which python3` haproxystats-dev
813 | 
814 | **Do not** exit the *haproxystats-dev* virtual environment.
815 | 
816 | Clone the project; if you are planning to contribute, fork it on
817 | GitHub and clone your fork instead::
818 | 
819 | mkdir ~/repo;cd ~/repo
820 | git clone https://github.com/unixsurfer/haproxystats
821 | 
822 | Install the necessary libraries::
823 | 
824 | cd haproxystats
825 | pip install -U pbr setuptools
826 | pip install -r ./requirements.txt
827 | 
828 | Start a TCP server which acts as a Graphite relay and listens on 127.0.0.1:39991::
829 | 
830 | python3 ./contrib/tcp_server.py
831 | 
832 | Install haproxystats::
833 | 
834 | python setup.py install
835 | 
836 | Create the necessary directory structure::
837 | 
838 | mkdir -p ./var/var/lib/haproxystats
839 | mkdir -p ./var/etc
840 | mkdir -p ./var/etc/haproxystats.d
841 | 
842 | Adjust the following configuration and save it in ./var/etc/haproxystats.conf::
843 | 
844 | [DEFAULT]
845 | loglevel = debug
846 | retries = 2
847 | timeout = 1
848 | interval = 2
849 | 
850 | [paths]
851 | base-dir = /home//repo/haproxystats/var/var/lib/haproxystats
852 | 
853 | [pull]
854 | socket-dir = /run/haproxy
855 | retries = 1
856 | timeout = 0.1
857 | interval = 0.5
858 | pull-timeout = 10
859 | pull-interval = 10
860 | dst-dir = ${paths:base-dir}/incoming
861 | tmp-dst-dir = ${paths:base-dir}/incoming.tmp
862 | workers = 8
863 | 
864 | [process]
865 | src-dir = ${paths:base-dir}/incoming
866 | workers = 2
867 | calculate-percentages = true
868 | per-process-metrics = true
869 | 
870 | [graphite]
871 | server = 127.0.0.1
872 | port = 39991
873 | retries = 3
874 | interval = 0.8
875 | timeout = 0.9
876 | delay = 10
877 | backoff = 2
878 | namespace = loadbalancers
879 | prefix_hostname = true
880 | fqdn = true
881 | queue-size = 1000
882 | 
883 | [local-store]
884 | dir = ${paths:base-dir}/local-store
885 | 
886 | Start haproxystats-pull and haproxystats-process in two different terminals::
887 | 
888 | haproxystats-pull -f var/etc/haproxystats.conf
889 | haproxystats-process -f var/etc/haproxystats.conf
890 | 
891 | Exit from the *haproxystats-dev* virtual environment::
892 | 
893 | deactivate
894 | 
895 | **Start hacking and don't forget to make a Pull Request**
896 | 
897 | Installation
898 | ------------
899 | 
900 | Use pip::
901 | 
902 | pip install haproxystats
903 | 
904 | From Source::
905 | 
906 | sudo python setup.py install
907 | 
908 | Build (source) RPMs::
909 | 
910 | python setup.py clean --all; python setup.py bdist_rpm
911 | 
912 | Build a source archive for manual installation::
913 | 
914 | python setup.py sdist
915 | 
916 | Use Ansible Playbook:
917 | 
918 | To deploy haproxystats with the Ansible playbook, go to the contrib/ansible-playbook
919 | directory::
920 | 
921 | cd contrib/ansible-playbook
922 | 
923 | Then enter your HAProxy server's IP address in the hosts file::
924 | 
925 | vi hosts
926 | 
927 | After that, set the information for 
your environment in the group_vars/all variables file::
928 | 
929 | vi group_vars/all
930 | 
931 | Now run the Ansible playbook with this command::
932 | 
933 | ansible-playbook -i hosts main-playbook.yml
934 | 
935 | Once the Ansible playbook completes successfully, you can control haproxystats-pull and haproxystats-process via systemd::
936 | 
937 | systemctl start haproxystats-pull.service
938 | 
939 | systemctl start haproxystats-process.service
940 | 
941 | Use Docker:
942 | 
943 | To build the haproxystats Docker image, clone the project and run the commands below; the only prerequisite is having Docker on your machine::
944 | 
945 | cd haproxystats/contrib/Docker
946 | 
947 | docker build --tag [Your-name]/haproxystats .
948 | 
949 | Launch a container:
950 | 
951 | To launch a haproxystats Docker container, use this command::
952 | 
953 | docker run -d --restart always --name [container_name] --hostname=[container_hostname] -v [path_of_haproxystats_config_file]:/etc/haproxystats/haproxystats.conf -v [path_of_haproxy_socket_files]:[path_of_socket-dir] [Your-name]/haproxystats
954 | 
955 | Notes:
956 | 
957 | - [container_name]: The name of the container; choose any name you wish.
958 | - [container_hostname]: The hostname of the container; choose any hostname you wish.
959 | - [path_of_haproxystats_config_file]: The path to your haproxystats configuration file.
960 | - [path_of_haproxy_socket_files]: The path to your HAProxy socket files.
961 | - [path_of_socket-dir]: The path of the HAProxy socket files inside the container, as set in the haproxystats.conf file.
962 | 
963 | 
964 | For example::
965 | 
966 | docker run -d --restart always --name haproxystats --hostname=haproxystats -v /opt/haproxystats/haproxystats.conf:/etc/haproxystats/haproxystats.conf -v /var/lib/haproxy:/run/haproxy hos7ein/haproxystats
967 | 
968 | 
969 | How to make a release
970 | ---------------------
971 | 
972 | #. Bump the version in haproxystats/__init__.py
973 | 
974 | #. Commit the above change with::
975 | 
976 | git commit -av -m'RELEASE 0.1.3 version'
977 | 
978 | #. Create a signed tag; pbr will use it for the version number::
979 | 
980 | git tag -s 0.1.3 -m 'bump release'
981 | 
982 | #. Create the source distribution archive (the archive will be placed in the
983 | **dist** directory)::
984 | 
985 | python setup.py sdist
986 | 
987 | #. pbr updates the ChangeLog file and we want to squash this change into the
988 | previous commit, thus run::
989 | 
990 | git commit -av --amend
991 | 
992 | #. Move the current tag to the last commit::
993 | 
994 | git tag -fs 0.1.3 -m 'bump release'
995 | 
996 | #. Push the changes::
997 | 
998 | git push;git push --tags
999 | 
1000 | #. Upload to the Python Package Index::
1001 | 
1002 | twine upload -s -p dist/*
1003 | 
1004 | 
1005 | Contributors
1006 | ------------
1007 | 
1008 | The following people have contributed to the project with feedback and code reviews:
1009 | 
1010 | - Károly Nagy https://github.com/charlesnagy
1011 | 
1012 | - Dan Achim https://github.com/danakim
1013 | 
1014 | Licensing
1015 | ---------
1016 | 
1017 | Apache 2.0
1018 | 
1019 | Acknowledgement
1020 | ---------------
1021 | This program was originally developed for Booking.com. With approval
1022 | from Booking.com, the code was generalised and published as Open Source
1023 | on GitHub, for which the author would like to express his gratitude.
1024 | 1025 | Contacts 1026 | -------- 1027 | 1028 | **Project website**: https://github.com/unixsurfer/haproxystats 1029 | 1030 | **Author**: Pavlos Parissis 1031 | 1032 | .. _HAProxy: http://www.haproxy.org/ 1033 | .. _stats socket: http://cbonte.github.io/haproxy-dconv/1.6/management.html#9.2 1034 | .. _carbon-c-relay: https://github.com/grobian/carbon-c-relay 1035 | .. _Pandas: http://pandas.pydata.org/ 1036 | .. _asyncio: https://docs.python.org/3/library/asyncio.html 1037 | .. _inotify: http://linux.die.net/man/7/inotify 1038 | .. _stat: http://cbonte.github.io/haproxy-dconv/1.6/management.html#show%20stat 1039 | .. _info: http://cbonte.github.io/haproxy-dconv/1.6/management.html#show%20info 1040 | .. _pool of threads: https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor 1041 | .. _INI: https://en.wikipedia.org/wiki/INI_file 1042 | .. _carbon-c-relay: https://github.com/grobian/carbon-c-relay 1043 | .. _management: http://cbonte.github.io/haproxy-dconv/1.6/management.html#9.1 1044 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unixsurfer/haproxystats/3ef4b3cacada9b6ed52dcc7726d8dad81a821ed1/TODO.md -------------------------------------------------------------------------------- /contrib/Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.12 2 | 3 | # ---------------- # 4 | # Installation # 5 | # ---------------- # 6 | 7 | # Install and setup all prerequisites 8 | RUN apk add --no-cache gcc g++ python3 py3-pip python3-dev supervisor &&\ 9 | wget -c -O /requirements.txt https://raw.githubusercontent.com/unixsurfer/haproxystats/master/requirements.txt &&\ 10 | pip3 install --requirement /requirements.txt &&\ 11 | pip3 install haproxystats &&\ 12 | mkdir -p /etc/haproxystats /var/lib/haproxy /var/log/supervisor &&\ 13 | rm -rf /var/cache/apk/* &&\ 14 | rm -rf /requirements.txt 15 | 16 | 17 | COPY ./conf_files/supervisor/ /etc/supervisor.d/ 18 | 19 | 20 | # -------- # 21 | # Run! 
# 22 | # -------- # 23 | 24 | CMD ["/usr/bin/supervisord", "--nodaemon", "--configuration", "/etc/supervisord.conf"] -------------------------------------------------------------------------------- /contrib/Docker/conf_files/haproxystats.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | loglevel = info 3 | retries = 2 4 | timeout = 1 5 | interval = 2 6 | 7 | [paths] 8 | base-dir = /var/lib/haproxystats 9 | 10 | [pull] 11 | loglevel = info 12 | socket-dir = /var/lib/haproxy 13 | retries = 1 14 | timeout = 1 15 | interval = 1 16 | pull-timeout = 0.5 17 | pull-interval = 10 18 | dst-dir = ${paths:base-dir}/incoming 19 | tmp-dst-dir = ${paths:base-dir}/incoming.tmp 20 | workers = 8 21 | 22 | [process] 23 | src-dir = ${paths:base-dir}/incoming 24 | workers = 4 25 | 26 | [graphite] 27 | server = 127.0.0.1 28 | port = 3002 29 | retries = 2 30 | interval = 0.8 31 | delay = 10 32 | backoff = 2 33 | namespace = loadbalancers 34 | prefix_hostname = true 35 | fqdn = true 36 | queue-size = 1000000 37 | 38 | #[local-store] 39 | #dir = ${paths:base-dir}/local-store 40 | -------------------------------------------------------------------------------- /contrib/Docker/conf_files/supervisor/haproxystats-process.ini: -------------------------------------------------------------------------------- 1 | [program:haproxystats-process] 2 | command = /usr/bin/haproxystats-process -f /etc/haproxystats/haproxystats.conf 3 | stdout_logfile = /var/log/supervisor/%(program_name)s.log 4 | stderr_logfile = /var/log/supervisor/%(program_name)s.log 5 | autorestart = true -------------------------------------------------------------------------------- /contrib/Docker/conf_files/supervisor/haproxystats-pull.ini: -------------------------------------------------------------------------------- 1 | [program:haproxystats-pull] 2 | command = /usr/bin/haproxystats-pull -f /etc/haproxystats/haproxystats.conf 3 | stdout_logfile = /var/log/supervisor/%(program_name)s.log 4 | stderr_logfile = /var/log/supervisor/%(program_name)s.log 5 | autorestart = true 6 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/group_vars/all: -------------------------------------------------------------------------------- 1 | --- 2 | # Variables listed here are applicable to all host groups 3 | 4 | 5 | haproxy_socket_dir: /var/lib/haproxy 6 | graphite_server: 10.10.22.77 7 | graphite_port: 2003 8 | pull_workers: 4 9 | process_workers: 2 10 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/hosts: -------------------------------------------------------------------------------- 1 | [haproxy-servers] 2 | 10.10.22.70 3 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/main-playbook.yml: -------------------------------------------------------------------------------- 1 | # Installation haproxystats 2 | - hosts: haproxy-servers 3 | remote_user: root 4 | gather_facts: False 5 | roles: 6 | - install-dependency-package 7 | - install-haproxystats 8 | - create-newuser 9 | - haproxy-socket-dir-permission 10 | - systemd-files 11 | - haproxystats-config-file 12 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/create-newuser/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Create haproxystats group 3 | group: 4 | 
name: haproxystats 5 | state: present 6 | 7 | - name : Create new user for haproxystats service and add to haproxystats and haproxy group 8 | user: 9 | name: haproxystats 10 | group: haproxystats 11 | groups: haproxy 12 | append: yes 13 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/haproxy-socket-dir-permission/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Set permission for haproxy socket directory 3 | file: 4 | path: "{{ haproxy_socket_dir }}" 5 | state: directory 6 | mode: 0770 7 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/haproxystats-config-file/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make haproxystats config file 3 | template: 4 | src: haproxystats.conf.j2 5 | dest: /etc/haproxystats.conf 6 | backup: yes 7 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/haproxystats-config-file/templates/haproxystats.conf.j2: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | loglevel = info 3 | retries = 2 4 | timeout = 1 5 | interval = 2 6 | 7 | [paths] 8 | base-dir = /home/haproxystats 9 | 10 | [pull] 11 | loglevel = info 12 | socket-dir = {{ haproxy_socket_dir }} 13 | retries = 1 14 | timeout = 0.1 15 | interval = 0.5 16 | pull-timeout = 2 17 | pull-interval = 10 18 | dst-dir = ${paths:base-dir}/incoming 19 | tmp-dst-dir = ${paths:base-dir}/incoming.tmp 20 | workers = {{ pull_workers }} 21 | queue-size = 360 22 | 23 | [process] 24 | src-dir = ${paths:base-dir}/incoming 25 | workers = {{ process_workers }} 26 | per-process-metrics = false 27 | 28 | [graphite] 29 | server = {{ graphite_server }} 30 | port = {{ graphite_port }} 31 | retries = 3 32 | interval = 1.8 33 | connect-timeout = 1.0 34 | write-timeout = 1.0 35 | delay = 10 36 | backoff = 2 37 | namespace = loadbalancers 38 | prefix-hostname = true 39 | fqdn = true 40 | queue-size = 1000000 41 | 42 | #[local-store] 43 | #dir = ${paths:base-dir}/local-store 44 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/install-dependency-package/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Check out distribution and install python version 2 on Ubuntu hosts 3 | raw: DISTRO=$(awk '/^ID=/' /etc/*-release | awk -F'=' '{ print tolower($2) }') && if [ "$DISTRO" == "ubuntu" ] ; then apt-get -y update && apt-get install -y python-minimal; fi 4 | 5 | - name: enable gather_facts 6 | setup: 7 | 8 | #task on CentOS or RHEL distribution 9 | - name: Install or update EPEL repo 10 | yum: name=epel-release state=latest 11 | when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux' 12 | 13 | - name: Install python 3 and pip3 packages 14 | yum: 15 | name: "{{ packages }}" 16 | state: latest 17 | vars: 18 | packages: 19 | - python3-pip 20 | - gcc 21 | - gcc-c++ 22 | - python3-devel 23 | when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux' 24 | 25 | #task on Debian or Ubuntu distribution 26 | - name: Install python 3 and pip3 packages 27 | apt: 28 | name: "{{ packages }}" 29 | state: latest 30 | vars: 31 | packages: 32 | - python3 33 | - python3-pip 34 | - 
python3-setuptools 35 | - python-setuptools 36 | when: ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu' 37 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/install-haproxystats/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Update setuptools by pip3 3 | pip: 4 | name: 5 | - setuptools 6 | state: latest 7 | executable: pip3 8 | 9 | 10 | - name: Install haproxystats by pip3 11 | pip: 12 | name: 13 | - pbr 14 | - pandas 15 | - haproxystats 16 | state: latest 17 | executable: pip3 18 | when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux' 19 | 20 | 21 | - name: Install haproxystats by pip3 22 | pip: 23 | name: 24 | - haproxystats 25 | state: latest 26 | executable: pip3 27 | when: ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu' 28 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/systemd-files/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Make haproxystats-pull systemd file 3 | template: 4 | src: haproxystats-pull.service.j2 5 | dest: /etc/systemd/system/haproxystats-pull.service 6 | backup: yes 7 | 8 | - name: Make haproxystats-process systemd file 9 | template: 10 | src: haproxystats-process.service.j2 11 | dest: /etc/systemd/system/haproxystats-process.service 12 | backup: yes 13 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/systemd-files/templates/haproxystats-process.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Processes statistics from HAProxy and pushes them to Graphite 3 | After=network.target syslog.target 4 | Wants=network.target syslog.target 5 | Documentation=https://github.com/unixsurfer/haproxystats 6 | 7 | [Service] 8 | Type=simple 9 | KillMode=process 10 | Environment="CONFIG=/etc/haproxystats.conf" 11 | User=haproxystats 12 | Group=haproxystats 13 | ExecStart=/usr/local/bin/haproxystats-process -f $CONFIG 14 | TimeoutStartSec=3 15 | TimeoutStopSec=60 16 | Restart=on-failure 17 | 18 | [Install] 19 | WantedBy=multi-user.target 20 | -------------------------------------------------------------------------------- /contrib/ansible-playbook/roles/systemd-files/templates/haproxystats-pull.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Pulls statistics from HAProxy daemon over UNIX sockets 3 | After=network.target syslog.target 4 | Wants=network.target syslog.target 5 | Documentation=https://github.com/unixsurfer/haproxystats 6 | 7 | [Service] 8 | Type=simple 9 | Environment="CONFIG=/etc/haproxystats.conf" 10 | User=haproxystats 11 | Group=haproxystats 12 | ExecStart=/usr/local/bin/haproxystats-pull -f $CONFIG 13 | TimeoutStartSec=3 14 | TimeoutStopSec=6 15 | Restart=on-failure 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | -------------------------------------------------------------------------------- /contrib/nagios/check_haproxystats_process.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | osversion=$(/usr/local/bin/facter operatingsystemmajrelease) 3 | if [ "${osversion}" -lt 7 ]; then 4 | echo "OK: haproxystats-process doesn't run here as it only runs on CentOS version 7 and higher" 5 | exit 0 6 | fi 7 | message=$(systemctl is-active haproxystats-process.service) 8 | if [[ $? -ne 0 ]]; then 9 | echo "CRITICAL:" "${message}" 10 | exit 2 11 | else 12 | echo "OK:" "${message}" 13 | exit 0 14 | fi 15 | -------------------------------------------------------------------------------- /contrib/nagios/check_haproxystats_process_number_of_procs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | VERBOSE_ARG= 3 | 4 | while getopts ":v" opt; do 5 | case $opt in 6 | v) 7 | VERBOSE_ARG=" -vv" 8 | ;; 9 | esac 10 | done 11 | if [[ -x /opt/blue-python/3.4/bin/haproxystats-process && -r /etc/haproxystats.conf ]]; then 12 | WORKERS=$(/opt/blue-python/3.4/bin/haproxystats-process -f /etc/haproxystats.conf -P|grep workers |awk '{print $3}') 13 | if [ $? -ne 0 ]; then 14 | echo "OK: haproxystats-process doesn't run here" 15 | exit 0 16 | fi 17 | PROCESSES=$(($WORKERS+1)) 18 | msg=$(/usr/lib64/nagios/plugins/check_procs\ 19 | -c "${PROCESSES}":"${PROCESSES}"\ 20 | --ereg-argument-array='^/usr/local/bin/blue-python3.4 /opt/blue-python/3.4/bin/haproxystats-process -f /etc/haproxystats.conf$'\ 21 | -u haproxystats\ 22 | $VERBOSE_ARG) 23 | EXITCODE=$? 24 | if [[ ${EXITCODE} -ne 0 ]]; then 25 | echo "${msg}" "Number of processes must be ${PROCESSES} OPDOC: TBD" 26 | else 27 | echo "${msg}" 28 | fi 29 | exit ${EXITCODE} 30 | else 31 | echo "OK: haproxystats-process isn't installed here" 32 | exit 0 33 | fi 34 | -------------------------------------------------------------------------------- /contrib/nagios/check_haproxystats_pull.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | osversion=$(/usr/local/bin/facter operatingsystemmajrelease) 3 | if [ "${osversion}" -lt 7 ]; then 4 | echo "OK: haproxystats-pull doesn't run here as it only runs on CentOS version 7 and higher" 5 | exit 0 6 | fi 7 | message=$(systemctl is-active haproxystats-pull.service) 8 | if [[ $? 
-ne 0 ]]; then 9 | echo "CRITICAL:" "${message}" 10 | exit 2 11 | else 12 | echo "OK:" "${message}" 13 | exit 0 14 | fi 15 | -------------------------------------------------------------------------------- /contrib/nagios/check_haproxystats_queue_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | """Checks the size of the queue which is consumed by haproxystats-process 5 | 6 | Usage: 7 | check_haproxystats_queue_size [-f -o -w ] 8 | 9 | Options: 10 | -f, --file configuration file with settings 11 | [default: /etc/haproxystats.conf] 12 | -o, --ok OK threshold [default: 60] 13 | -w, --warning WARNING threshold [default: 120] 14 | -h, --help show this screen 15 | """ 16 | import os 17 | import sys 18 | from configparser import (ConfigParser, ExtendedInterpolation, NoSectionError, 19 | NoOptionError) 20 | from docopt import docopt 21 | 22 | 23 | def main(): 24 | """ 25 | main code 26 | """ 27 | args = docopt(__doc__) 28 | config = ConfigParser(interpolation=ExtendedInterpolation()) 29 | config.read(args['--file']) 30 | try: 31 | base_dir = config.get('pull', 'dst-dir') 32 | except (NoSectionError, NoOptionError) as exc: 33 | print('OK: missing configuration as I got: {e}'.format(e=exc)) 34 | sys.exit(0) 35 | std_msg = (": Queue location={b}, Thresholds OK={ok} WARNING={w} and any " 36 | "higher value is critical").format(b=base_dir, 37 | ok=args['--ok'], 38 | w=args['--warning']) 39 | try: 40 | dirs = [os.path.join(base_dir, x) for x in os.listdir(base_dir) if 41 | os.path.isdir(os.path.join(base_dir, x))] 42 | except (PermissionError, FileNotFoundError, OSError) as exc: 43 | print("UNKNOWN: can't check {d} due to {e}".format(d=base_dir, 44 | e=exc)) 45 | sys.exit(3) 46 | queue_size = len(dirs) 47 | if queue_size <= int(args['--ok']): 48 | print('OK: queue size {q}{s}'.format(q=queue_size, s=std_msg)) 49 | sys.exit(0) 50 | elif int(args['--ok']) < queue_size <= int(args['--warning']): 51 | print('WARNING: queue size {q}{s}'.format(q=queue_size, s=std_msg)) 52 | sys.exit(1) 53 | else: 54 | print('CRITICAL: queue size {q}{s}'.format(q=queue_size, s=std_msg)) 55 | sys.exit(2) 56 | 57 | # This is the standard boilerplate that calls the main() function. 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /contrib/puppet/manifests/init.pp: -------------------------------------------------------------------------------- 1 | # == Class: haproxystats 2 | # 3 | # A class to configure HAProxy statistics collection tool haproxystats. 4 | # See more information about haproxystats here 5 | # https://github.com/unixsurfer/haproxystats 6 | # 7 | # === Parameters 8 | # 9 | # Document parameters here. 
10 | # 11 | # [*sample_parameter*] 12 | # 13 | # === Examples 14 | # 15 | # class { 'haproxystats': 16 | # } 17 | # 18 | # === Actions 19 | # 20 | # - Create user and group haproxystats 21 | # 22 | # === Requires 23 | # 24 | # - 'haproxystats' user and group defined in profile_base::user 25 | # - syslog::activate{ 'haproxystats':} 26 | # - daemon-reload exec resource 27 | # exec { 28 | # 'systemd-daemon-reload': 29 | # refreshonly => true, 30 | # command => '/bin/systemctl daemon-reload', 31 | # logoutput => true; 32 | # } 33 | # === Authors 34 | # 35 | # Author Name 36 | # 37 | # === Copyright 38 | # 39 | # Copyright 2016 Pavlos Parissis 40 | # 41 | class haproxystats ( 42 | $package_name = $::haproxystats::params::package_name, 43 | $version = $::haproxystats::params::version, 44 | $enable = $::haproxystats::params::enable, 45 | $autostart = $::haproxystats::params::autostart, 46 | $enable_monit = $::haproxystats::params::enable_monit, 47 | $user = $::haproxystats::params::user, 48 | $group = $::haproxystats::params::group, 49 | $groups = $::haproxystats::params::groups, 50 | $log_rotate = $::haproxystats::params::log_rotate, 51 | $log_rotate_freq = $::haproxystats::params::log_rotate_freq, 52 | $default_loglevel = $::haproxystats::params::default_loglevel, 53 | $default_retries = $::haproxystats::params::default_retries, 54 | $default_timeout = $::haproxystats::params::default_timeout, 55 | $default_interval = $::haproxystats::params::default_interval, 56 | $paths_base_dir = $::haproxystats::params::paths_base_dir, 57 | $pull_loglevel = $::haproxystats::params::pull_loglevel, 58 | $pull_retries = $::haproxystats::params::pull_retries, 59 | $pull_timeout = $::haproxystats::params::pull_timeout, 60 | $pull_interval = $::haproxystats::params::pull_interval, 61 | $pull_socket_dir = $::haproxystats::params::pull_socket_dir, 62 | $pull_pull_timeout = $::haproxystats::params::pull_pull_timeout, 63 | $pull_pull_interval = $::haproxystats::params::pull_pull_interval, 64 | $pull_dst_dir = $::haproxystats::params::pull_dst_dir, 65 | $pull_tmp_dst_dir = $::haproxystats::params::pull_tmp_dst_dir, 66 | $pull_workers = $::haproxystats::params::pull_workers, 67 | $pull_queue_size = $::haproxystats::params::pull_queue_size, 68 | $pull_CPUAffinity = $::haproxystats::params::pull_CPUAffinity, 69 | $process_workers = $::haproxystats::params::process_workers, 70 | $process_src_dir = $::haproxystats::params::process_src_dir, 71 | $process_loglevel = $::haproxystats::params::process_loglevel, 72 | $process_CPUAffinity = $::haproxystats::params::process_CPUAffinity, 73 | $process_aggr_server_metrics = $::haproxystats::params::process_aggr_server_metrics, 74 | $process_per_process_metrics = $::haproxystats::params::process_per_process_metrics, 75 | $process_exclude_frontends = $::haproxystats::params::process_exclude_frontends, 76 | $process_exclude_backends = $::haproxystats::params::process_exclude_backends, 77 | $process_compute_percentages = $::haproxystats::params::process_compute_percentages, 78 | $graphite_server = $::haproxystats::params::graphite_server, 79 | $graphite_port = $::haproxystats::params::graphite_port, 80 | $graphite_retries = $::haproxystats::params::graphite_retries, 81 | $graphite_interval = $::haproxystats::params::graphite_interval, 82 | $graphite_connect_timeout = $::haproxystats::params::graphite_connect_timeout, 83 | $graphite_write_timeout = $::haproxystats::params::graphite_write_timeout, 84 | $graphite_delay = $::haproxystats::params::graphite_delay, 85 | $graphite_backoff = 
$::haproxystats::params::graphite_backoff, 86 | $graphite_queue_size = $::haproxystats::params::graphite_queue_size, 87 | $graphite_namespace = $::haproxystats::params::graphite_namespace, 88 | $graphite_prefix_hostname = $::haproxystats::params::graphite_prefix_hostname, 89 | $graphite_fqdn = $::haproxystats::params::graphite_fqdn, 90 | $local_store_enabled = $::haproxystats::params::local_store_enabled, 91 | $local_store_dir = $::haproxystats::params::local_store_dir, 92 | ) inherits haproxystats::params { 93 | 94 | validate_re($default_loglevel, [ 95 | '^debug$', 96 | '^info$', 97 | '^warning$', 98 | '^error$', 99 | '^critical$', 100 | ] 101 | ) 102 | if ! is_numeric($default_timeout) { 103 | fail("default_timeout must be a number") 104 | } 105 | if ! is_numeric($default_retries) { 106 | fail("default_retries must be a number") 107 | } 108 | if ! is_numeric($default_interval) { 109 | fail("default_interval must be a number") 110 | } 111 | validate_re($pull_loglevel, [ 112 | '^debug$', 113 | '^info$', 114 | '^warning$', 115 | '^error$', 116 | '^critical$', 117 | ] 118 | ) 119 | if ! is_numeric($pull_timeout) { 120 | fail("pull_timeout must be a number") 121 | } 122 | if ! is_numeric($pull_retries) { 123 | fail("pull_retries must be a number") 124 | } 125 | if ! is_numeric($pull_interval) { 126 | fail("pull_interval must be a number") 127 | } 128 | if ! is_numeric($pull_pull_interval) { 129 | fail("pull_pull_interval must be a number") 130 | } 131 | if ! is_numeric($pull_pull_timeout) { 132 | fail("pull_pull_timeout must be a number") 133 | } 134 | if ! is_numeric($pull_workers) { 135 | fail("pull_workers must be a number") 136 | } 137 | if ! is_numeric($pull_queue_size) { 138 | fail("pull_queue_size must be a number") 139 | } 140 | validate_re($process_loglevel, [ 141 | '^debug$', 142 | '^info$', 143 | '^warning$', 144 | '^error$', 145 | '^critical$', 146 | ] 147 | ) 148 | if ! is_numeric($process_workers) { 149 | fail("process_workers must be a number") 150 | } 151 | validate_bool($process_aggr_server_metrics) 152 | validate_bool($process_compute_percentages) 153 | validate_array($process_exclude_backends) 154 | validate_array($process_exclude_frontends) 155 | if ! is_numeric($graphite_port) { 156 | fail("graphite_port must be a number") 157 | } 158 | if ! is_numeric($graphite_retries) { 159 | fail("graphite_retries must be a number") 160 | } 161 | if ! is_numeric($graphite_interval) { 162 | fail("graphite_interval must be a number") 163 | } 164 | if ! is_numeric($graphite_connect_timeout) { 165 | fail("graphite_connect_timeout must be a number") 166 | } 167 | if ! is_numeric($graphite_write_timeout) { 168 | fail("graphite_write_timeout must be a number") 169 | } 170 | if ! is_numeric($graphite_delay) { 171 | fail("graphite_delay must be a number") 172 | } 173 | if ! is_numeric($graphite_backoff) { 174 | fail("graphite_backoff must be a number") 175 | } 176 | if ! 
is_numeric($graphite_queue_size) { 177 | fail("graphite_queue_size must be a number") 178 | } 179 | validate_bool($graphite_prefix_hostname) 180 | validate_bool($graphite_fqdn) 181 | validate_bool($local_store_enabled) 182 | 183 | $dotdir = '/etc/haproxystats.d' 184 | $exclude_frontends_filename = "${dotdir}/exclude_frontend.conf" 185 | $exclude_backends_filename = "${dotdir}/exclude_backend.conf" 186 | realize ( Group[$user] ) 187 | User <| title == "${user}" |> { 188 | groups => $groups, 189 | } 190 | 191 | package { 192 | $package_name: 193 | ensure => $version, 194 | } 195 | 196 | file { 197 | $paths_base_dir: 198 | ensure => directory, 199 | owner => $user, 200 | group => $group, 201 | require => [ 202 | User[$user], 203 | Group[$group] 204 | ], 205 | mode => '0755'; 206 | ['/etc/systemd/system/haproxystats-process.service.d', 207 | '/etc/systemd/system/haproxystats-pull.service.d']: 208 | ensure => directory, 209 | owner => root, 210 | group => root, 211 | mode => '0755', 212 | purge => true, 213 | recurse => true; 214 | '/etc/systemd/system/haproxystats-pull.service.d/overwrites.conf': 215 | ensure => file, 216 | owner => root, 217 | group => root, 218 | mode => '0444', 219 | content => template('haproxystats/pull-systemd-overwrites.conf.erb'), 220 | notify => [ 221 | Exec['systemd-daemon-reload'], 222 | Service['haproxystats-pull'], 223 | ]; 224 | '/etc/systemd/system/haproxystats-process.service.d/overwrites.conf': 225 | ensure => file, 226 | owner => root, 227 | group => root, 228 | mode => '0444', 229 | content => template('haproxystats/process-systemd-overwrites.conf.erb'), 230 | notify => [ 231 | Exec['systemd-daemon-reload'], 232 | Service['haproxystats-process'], 233 | ]; 234 | '/usr/local/bin/haproxystats-process-monit-check.sh': 235 | ensure => file, 236 | owner => root, 237 | group => root, 238 | mode => '0755', 239 | content => template('haproxystats/haproxystats-process-monit-check.sh.erb'); 240 | $dotdir: 241 | ensure => directory, 242 | owner => root, 243 | group => root, 244 | mode => '0755'; 245 | $exclude_frontends_filename: 246 | ensure => size($process_exclude_frontends) ? { 247 | 0 => absent, 248 | default => file, 249 | }, 250 | owner => root, 251 | group => root, 252 | mode => '0444', 253 | content => template('haproxystats/exclude_frontend.conf.erb'), 254 | notify => [ 255 | Service['haproxystats-process'], 256 | ]; 257 | $exclude_backends_filename: 258 | ensure => size($process_exclude_backends) ? 
{ 259 | 0 => absent, 260 | default => file, 261 | }, 262 | owner => root, 263 | group => root, 264 | mode => '0444', 265 | content => template('haproxystats/exclude_backend.conf.erb'), 266 | notify => [ 267 | Service['haproxystats-process'], 268 | ]; 269 | } 270 | concat { 271 | '/etc/haproxystats.conf': 272 | mode => 0444, 273 | owner => $user, 274 | group => $group, 275 | require => [Package[$package_name]], 276 | notify => [ 277 | Service['haproxystats-pull'], 278 | Service['haproxystats-process'], 279 | ]; 280 | } 281 | concat::fragment { 282 | 'defaults': 283 | target => '/etc/haproxystats.conf', 284 | order => '00', 285 | content => template('haproxystats/defaults.conf.erb'); 286 | 'pull': 287 | target => '/etc/haproxystats.conf', 288 | order => '01', 289 | content => template('haproxystats/pull.conf.erb'), 290 | notify => Service['haproxystats-pull']; 291 | 'process': 292 | target => '/etc/haproxystats.conf', 293 | order => '02', 294 | content => template('haproxystats/process.conf.erb'), 295 | notify => Service['haproxystats-process']; 296 | } 297 | service { 298 | 'haproxystats-pull': 299 | ensure => $enable, 300 | enable => $autostart, 301 | require => [ 302 | Package[$package_name], 303 | Concat['/etc/haproxystats.conf'], 304 | ]; 305 | 'haproxystats-process': 306 | ensure => $enable, 307 | enable => $autostart, 308 | require => [ 309 | Package[$package_name], 310 | Concat['/etc/haproxystats.conf'], 311 | ]; 312 | } 313 | syslog::activate{ 'haproxystats': 314 | rotate => $log_rotate, 315 | rotate_freq => $log_rotate_freq; 316 | } 317 | 318 | $real_enable_monit = $enable ? { 319 | false => false, 320 | 'stopped' => false, 321 | default => $enabled_monit, 322 | } 323 | monit::program { 324 | 'haproxystats-process': 325 | enabled => $real_enable_monit, 326 | scriptname => '/usr/local/bin/haproxystats-process-monit-check.sh' , 327 | email => 'foo@bar.com', 328 | tolerance => 2, 329 | priority => 'priority_1', 330 | nrestarts => 2, 331 | stop_timeout => 380, 332 | require => File['/usr/local/bin/haproxystats-process-monit-check.sh']; 333 | } 334 | } 335 | -------------------------------------------------------------------------------- /contrib/puppet/manifests/params.pp: -------------------------------------------------------------------------------- 1 | # - PRIVATE CLASS - 2 | class haproxystats::params { 3 | $package_name = 'blue-python34-haproxystats' 4 | $version = 'latest' 5 | $enable = true 6 | $autostart = true 7 | $enable_monit = true 8 | $user = 'haproxystats' 9 | $group = 'haproxystats' 10 | $groups = 'hapee' 11 | $log_rotate = 4 12 | $log_rotate_freq = 'daily' 13 | $default_loglevel = 'info' 14 | $default_retries = 2 15 | $default_timeout = 1 16 | $default_interval = 2 17 | $paths_base_dir = '/var/lib/haproxystats' 18 | $pull_loglevel = $default_loglevel 19 | $pull_retries = 2 20 | $pull_timeout = 1 21 | $pull_interval = 2 22 | $pull_socket_dir = '/run/hapee' 23 | $pull_pull_timeout = 8 24 | $pull_pull_interval = 10 25 | $pull_dst_dir = '${paths:base-dir}/incoming' 26 | $pull_tmp_dst_dir = '${paths:base-dir}/incoming.tmp' 27 | $pull_workers = 8 28 | $pull_queue_size = 360 29 | $pull_CPUAffinity = undef 30 | $process_workers = 2 31 | $process_loglevel = $default_loglevel 32 | $process_CPUAffinity = undef 33 | $process_aggr_server_metrics = false 34 | $process_per_process_metrics = false 35 | $process_src_dir = '${paths:base-dir}/incoming' 36 | $process_exclude_frontends = [] 37 | $process_exclude_backends = [] 38 | $process_calculate_percentages = false 39 | 
$graphite_server = hiera('graphite::host') 40 | $graphite_port = hiera('graphite::port') 41 | $graphite_retries = 2 42 | $graphite_interval = 1 43 | $graphite_connect_timeout = 2 44 | $graphite_write_timeout = 4 45 | $graphite_delay = 10 46 | $graphite_backoff = 2 47 | $graphite_queue_size = 10000 48 | $graphite_namespace = 'loadbalancers' 49 | $graphite_prefix_hostname = true 50 | $graphite_fqdn = true 51 | $local_store_enabled = false 52 | $local_store_dir = '${paths:base-dir}/local-store' 53 | } 54 | 55 | -------------------------------------------------------------------------------- /contrib/puppet/templates/defaults.conf.erb: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # 3 | # This file is managed by Puppet 4 | # any changes made locally will be lost. 5 | # 6 | # The master version of this file is created with concat module using 7 | # the following fragments: 8 | # puppet:///modules/haproxystats/templates/{defaults,pull,process}.conf.erb 9 | # 10 | ############################################################################# 11 | [DEFAULT] 12 | loglevel = <%= @default_loglevel%> 13 | retries = <%= @default_retries%> 14 | timeout = <%= @default_timeout%> 15 | interval = <%= @default_interval%> 16 | 17 | [paths] 18 | base-dir = <%= @paths_base_dir%> 19 | 20 | -------------------------------------------------------------------------------- /contrib/puppet/templates/exclude_backend.conf.erb: -------------------------------------------------------------------------------- 1 | #################################################################### 2 | # # 3 | # This file is managed by Puppet # 4 | # any changes made locally will be lost. # 5 | # # 6 | # puppet:///modules/haproxystats/templates/exclude_backend.conf.erb# 7 | # # 8 | #################################################################### 9 | <%- @process_exclude_backends.each do |val| -%> 10 | <%= val.to_s %> 11 | <%- end -%> 12 | -------------------------------------------------------------------------------- /contrib/puppet/templates/exclude_frontend.conf.erb: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # # 3 | # This file is managed by Puppet # 4 | # any changes made locally will be lost. # 5 | # # 6 | # puppet:///modules/haproxystats/templates/exclude_frontend.conf.erb # 7 | # # 8 | ###################################################################### 9 | <%- @process_exclude_frontends.each do |val| -%> 10 | <%= val.to_s %> 11 | <%- end -%> 12 | -------------------------------------------------------------------------------- /contrib/puppet/templates/haproxystats-process-monit-check.sh.erb: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #################################################################################### 3 | # This file is managed by Puppet # 4 | # any changes made locally will be lost. 
# 5 | # puppet:///modules/haproxystats/templates/haproxystats-process-monit-chech.sh.erb # 6 | # # 7 | #################################################################################### 8 | <%- 9 | _processes = @process_workers.to_i + 1 10 | -%> 11 | if [[ -x /opt/blue-python/3.4/bin/haproxystats-process && -r /etc/haproxystats.conf ]]; then 12 | msg=$(/usr/lib64/nagios/plugins/check_procs\ 13 | -c <%= _processes-%>:<%= _processes-%>\ 14 | --ereg-argument-array='^/usr/local/bin/blue-python3.4 /opt/blue-python/3.4/bin/haproxystats-process -f /etc/haproxystats.conf$'\ 15 | -u <%= @user-%> 16 | -vv) 17 | EXITCODE=$? 18 | if [[ ${EXITCODE} -ne 0 ]]; then 19 | echo "${msg}" "Number of processes must be <%= _processes-%> OPDOC: TBD" 20 | else 21 | echo "${msg}" 22 | fi 23 | exit ${EXITCODE} 24 | else 25 | echo "OK: haproxystats-process isn't installed here" 26 | exit 0 27 | fi 28 | -------------------------------------------------------------------------------- /contrib/puppet/templates/process-systemd-overwrites.conf.erb: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # 3 | # This file is managed by Puppet 4 | # any changes made locally will be lost. 5 | # 6 | # The master version of this file is at 7 | # puppet:///modules/haproxystats/templates/process-systemd-overwrites.conf.erb 8 | # 9 | ############################################################################# 10 | <%- 11 | _cpus = scope.lookupvar('::processorcount').to_i 12 | if @process_CPUAffinity 13 | _value = @process_CPUAffinity 14 | else 15 | if @process_workers.to_i >= _cpus 16 | _workers = _cpus 17 | else 18 | _workers = @process_workers.to_i 19 | end 20 | cpu_list = Array (0.._cpus-1) 21 | _value = cpu_list[-_workers..-1].join(' ') 22 | end 23 | -%> 24 | [Service] 25 | CPUAffinity = <%=_value%> 26 | -------------------------------------------------------------------------------- /contrib/puppet/templates/process.conf.erb: -------------------------------------------------------------------------------- 1 | <%- 2 | _cpus = scope.lookupvar('::processorcount').to_i 3 | if @process_workers.to_i >= _cpus 4 | _workers = _cpus 5 | else 6 | _workers = @process_workers.to_i 7 | end 8 | -%> 9 | [process] 10 | loglevel = <%= @process_loglevel %> 11 | workers = <%= _workers %> 12 | src-dir = <%= @process_src_dir %> 13 | aggr-server-metrics = <%= @process_aggr_server_metrics %> 14 | per-process-metrics = <%= @process_per_process_metrics %> 15 | <%- if @process_exclude_frontends.size > 0 -%> 16 | exclude-frontends = <%= @exclude_frontends_filename %> 17 | <%- end -%> 18 | <%- if @process_exclude_backends.size > 0 -%> 19 | exclude-backends = <%= @exclude_backends_filename %> 20 | <%- end -%> 21 | calculate-percentages = <%= @process_calculate_percentages %> 22 | 23 | [graphite] 24 | server = <%= @graphite_server %> 25 | port = <%= @graphite_port %> 26 | retries = <%= @graphite_retries %> 27 | interval = <%= @graphite_interval %> 28 | connect-timeout = <%= @graphite_connect_timeout %> 29 | write-timeout = <%= @graphite_write_timeout %> 30 | delay = <%= @graphite_delay %> 31 | backoff = <%= @graphite_backoff %> 32 | queue-size = <%= @graphite_queue_size %> 33 | namespace = <%= @graphite_namespace %> 34 | prefix-name = <%= @graphite_prefix_hostname %> 35 | fqdn = <%= @graphite_fqdn %> 36 | 37 | <%- if @local_store_enabled -%> 38 | [local-store] 39 | dir = <%= @local_store_dir %> 40 | <%- end -%> 41 | 
-------------------------------------------------------------------------------- /contrib/puppet/templates/pull-systemd-overwrites.conf.erb: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # 3 | # This file is managed by Puppet 4 | # any changes made locally will be lost. 5 | # 6 | # The master version of this file is at 7 | # puppet:///modules/haproxystats/templates/pull-systemd-overwrites.conf.erb 8 | # 9 | ############################################################################# 10 | <%- 11 | if @pull_CPUAffinity 12 | _value = @pull_CPUAffinity 13 | else 14 | cpu_list = Array (0..scope.lookupvar('::processorcount').to_i-1) 15 | _value = cpu_list[-1] 16 | end 17 | -%> 18 | [Service] 19 | CPUAffinity = <%=_value%> 20 | -------------------------------------------------------------------------------- /contrib/puppet/templates/pull.conf.erb: -------------------------------------------------------------------------------- 1 | [pull] 2 | loglevel = <%= @pull_loglevel %> 3 | socket-dir = <%= @pull_socket_dir %> 4 | retries = <%= @pull_retries %> 5 | timeout = <%= @pull_timeout %> 6 | interval = <%= @pull_interval %> 7 | pull-timeout = <%= @pull_pull_timeout %> 8 | pull-interval = <%= @pull_pull_interval %> 9 | dst-dir = <%= @pull_dst_dir %> 10 | tmp-dst-dir = <%= @pull_tmp_dst_dir %> 11 | workers = <%= @pull_workers %> 12 | queue-size = <%= @pull_queue_size %> 13 | 14 | -------------------------------------------------------------------------------- /contrib/systemd/haproxystats-process.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Processes statistics from HAProxy and pushes them to Graphite 3 | After=network.target syslog.target 4 | Wants=network.target syslog.target 5 | Documentation=https://github.com/unixsurfer/haproxystats 6 | 7 | [Service] 8 | Type=simple 9 | KillMode=process 10 | Environment="CONFIG=/etc/haproxystats.conf" 11 | User=haproxystats 12 | Group=haproxystats 13 | ExecStart=/opt/blue-python/3.4/bin/haproxystats-process -f $CONFIG 14 | TimeoutStartSec=3 15 | TimeoutStopSec=60 16 | Restart=always 17 | 18 | [Install] 19 | WantedBy=multi-user.target 20 | -------------------------------------------------------------------------------- /contrib/systemd/haproxystats-pull.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Pulls statistics from HAProxy daemon over UNIX sockets 3 | After=network.target syslog.target 4 | Wants=network.target syslog.target 5 | Documentation=https://github.com/unixsurfer/haproxystats 6 | 7 | [Service] 8 | Type=simple 9 | Environment="CONFIG=/etc/haproxystats.conf" 10 | User=haproxystats 11 | Group=haproxystats 12 | ExecStart=/opt/blue-python/3.4/bin/haproxystats-pull -f $CONFIG 13 | TimeoutStartSec=3 14 | TimeoutStopSec=6 15 | Restart=on-failure 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | -------------------------------------------------------------------------------- /contrib/tcp_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | """ 6 | A very simple TCP server for simulating a graphite relay, copied-paste from 7 | Python documentation. Few things were adjusted to make pylint happy and print 8 | incoming data. 
9 | """ 10 | import asyncio 11 | 12 | 13 | class EchoServerClientProtocol(asyncio.Protocol): 14 | """ 15 | A TCP server 16 | """ 17 | def __init__(self): 18 | self.peername = None 19 | self.transport = None 20 | 21 | def connection_made(self, transport): 22 | self.peername = transport.get_extra_info('peername') 23 | print('Connection from {}'.format(self.peername)) 24 | self.transport = transport 25 | 26 | def data_received(self, data): 27 | message = data.decode() 28 | print(message) 29 | 30 | def connection_lost(self, exc): 31 | print('client {} closed connection {}'.format(self.peername, exc)) 32 | 33 | 34 | def main(): 35 | """ 36 | main code 37 | """ 38 | loop = asyncio.get_event_loop() 39 | # Each client connection will create a new protocol instance 40 | coro = loop.create_server(EchoServerClientProtocol, '127.0.0.1', 39991) 41 | server = loop.run_until_complete(coro) 42 | 43 | # Serve requests until Ctrl+C is pressed 44 | print('Serving on {}'.format(server.sockets[0].getsockname())) 45 | try: 46 | loop.run_forever() 47 | except KeyboardInterrupt: 48 | pass 49 | 50 | # Close the server 51 | server.close() 52 | loop.run_until_complete(server.wait_closed()) 53 | loop.close() 54 | 55 | # This is the standard boilerplate that calls the main() function. 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /haproxystats-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unixsurfer/haproxystats/3ef4b3cacada9b6ed52dcc7726d8dad81a821ed1/haproxystats-architecture.png -------------------------------------------------------------------------------- /haproxystats.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | loglevel = info 3 | retries = 2 4 | timeout = 1 5 | interval = 2 6 | 7 | [paths] 8 | base-dir = /var/lib/haproxystats 9 | 10 | [pull] 11 | loglevel = info 12 | socket-dir = /run/haproxy 13 | retries = 1 14 | timeout = 1 15 | interval = 1 16 | pull-timeout = 0.5 17 | pull-interval = 10 18 | dst-dir = ${paths:base-dir}/incoming 19 | tmp-dst-dir = ${paths:base-dir}/incoming.tmp 20 | workers = 8 21 | 22 | [process] 23 | src-dir = ${paths:base-dir}/incoming 24 | workers = 4 25 | 26 | [graphite] 27 | server = 127.0.0.1 28 | port = 3002 29 | retries = 2 30 | interval = 0.8 31 | delay = 10 32 | backoff = 2 33 | namespace = loadbalancers 34 | prefix_hostname = true 35 | fqdn = true 36 | queue-size = 1000000 37 | 38 | #[local-store] 39 | #dir = ${paths:base-dir}/local-store 40 | -------------------------------------------------------------------------------- /haproxystats/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim:fenc=utf-8 3 | # 4 | """A collection of Python tools to process HAProxy statistics.""" 5 | __title__ = 'haproxystats' 6 | __author__ = 'Pavlos Parissis' 7 | __license__ = 'Apache 2.0' 8 | __version__ = '0.5.2' 9 | __copyright__ = 'Copyright 2016 Pavlos Parissis ] [-d ] [-p | -P] 8 | 9 | Options: 10 | -f, --file configuration file with settings 11 | [default: /etc/haproxystats.conf] 12 | -d, --dir directory with additional configuration files 13 | -p, --print show default settings 14 | -P, --print-conf show configuration 15 | -h, --help show this screen 16 | -v, --version show version 17 | """ 18 | import os 19 | import multiprocessing 20 | import signal 21 | import logging 22 | import glob 
23 | import copy 24 | import re 25 | import sys 26 | import time 27 | import shutil 28 | import socket 29 | import fileinput 30 | from collections import defaultdict 31 | from configparser import ConfigParser, ExtendedInterpolation, ParsingError 32 | from threading import Lock, Thread 33 | from docopt import docopt 34 | import pyinotify 35 | import pandas 36 | 37 | from haproxystats import __version__ as VERSION 38 | from haproxystats import DEFAULT_OPTIONS 39 | from haproxystats.utils import (dispatcher, GraphiteHandler, get_files, 40 | FileHandler, EventHandler, concat_csv, 41 | FILE_SUFFIX_INFO, FILE_SUFFIX_STAT, 42 | load_file_content, configuration_check, 43 | read_write_access, check_metrics, 44 | daemon_percentage_metrics, send_wlc, 45 | calculate_percentage_per_column, 46 | calculate_percentage_per_row) 47 | from haproxystats.metrics import (DAEMON_AVG_METRICS, DAEMON_METRICS, 48 | SERVER_AVG_METRICS, SERVER_AVG_TIME_METRICS, 49 | SERVER_METRICS, 50 | BACKEND_AVG_METRICS, BACKEND_AVG_TIME_METRICS, 51 | BACKEND_METRICS, 52 | FRONTEND_METRICS) 53 | 54 | LOG_FORMAT = ('%(asctime)s [%(process)d] [%(processName)-11s] ' 55 | '[%(funcName)-20s] %(levelname)-8s %(message)s') 56 | logging.basicConfig(format=LOG_FORMAT) 57 | log = logging.getLogger('root') # pylint: disable=I0011,C0103 58 | 59 | watcher = pyinotify.WatchManager() # pylint: disable=I0011,C0103 60 | # watched events 61 | MASK = pyinotify.IN_CREATE | pyinotify.IN_MOVED_TO # pylint: disable=no-member 62 | 63 | STOP_SIGNAL = 'STOP' 64 | 65 | 66 | class Checker(Thread): 67 | """Check the liveness of consumer""" 68 | def __init__(self, consumers, interval): 69 | """Initialization. 70 | 71 | Arguments: 72 | consumers (list): A list of consumers(multiprocessing.Process obj) 73 | interval (float): How often to run the check 74 | """ 75 | super(Checker, self).__init__() 76 | self.daemon = True 77 | self.consumers = consumers 78 | self.interval = interval 79 | 80 | def run(self): 81 | """Terminate main program if at least one consumer isn't alive""" 82 | while True: 83 | alive_consumers = 0 84 | for consumer in self.consumers: 85 | if not consumer.is_alive(): 86 | log.critical("consumer %s is dead", consumer.name) 87 | else: 88 | alive_consumers += 1 89 | log.debug("consumer %s is alive", consumer.name) 90 | if alive_consumers < len(self.consumers): 91 | log.critical("terminating myself as %s consumers are dead", 92 | len(self.consumers) - alive_consumers) 93 | os.kill(os.getpid(), signal.SIGTERM) 94 | 95 | time.sleep(self.interval) 96 | 97 | class Consumer(multiprocessing.Process): 98 | """Process statistics and dispatch them to handlers.""" 99 | 100 | # Cache results of the get_metric_paths() function call 101 | path_cache = { 102 | 'frontend': {}, 103 | 'backend': {}, 104 | 'server': {} 105 | } 106 | 107 | # Store compiled patterns declared in the 'frontend-groups' and 108 | # 'backend-groups' config sections 109 | metric_patterns = { 110 | 'frontend': [], 111 | 'backend': [], 112 | 'server': [], 113 | } 114 | 115 | def __init__(self, tasks, config): 116 | """Initialization. 117 | 118 | Arguments: 119 | tasks (queue): A queue from which we consume items. 120 | config (obj): A configParser object which holds configuration. 
121 | """ 122 | multiprocessing.Process.__init__(self) 123 | self.tasks = tasks 124 | self.config = config 125 | self.local_store = None 126 | self.file_handler = None 127 | self.timestamp = None # The time that statistics were retrieved 128 | 129 | # Build graphite path (..haproxy) 130 | graphite_tree = [] 131 | graphite_tree.append(self.config.get('graphite', 'namespace')) 132 | if self.config.getboolean('graphite', 'prefix-hostname'): 133 | if self.config.getboolean('graphite', 'fqdn'): 134 | graphite_tree.append(socket.gethostname().replace('.', '_')) 135 | else: 136 | graphite_tree.append(socket.gethostname().split('.')[0]) 137 | graphite_tree.append('haproxy') 138 | self.graphite_path = '.'.join(graphite_tree) 139 | 140 | # Compile regex patterns for metric groups 141 | if self.config.has_option('graphite', 'group-namespace'): 142 | self.build_metric_patterns() 143 | self.double_writes =\ 144 | self.config.getboolean('graphite', 145 | 'group-namespace-double-writes') 146 | else: 147 | self.double_writes = False 148 | 149 | 150 | def run(self): 151 | """Consume item from queue and process it. 152 | 153 | It is the target function of Process class. Consumes items from 154 | the queue, processes data which are pulled down by haproxystats-pull 155 | program and uses Pandas to perform all computations of statistics. 156 | 157 | It exits when it receives STOP_SIGNAL as item. 158 | 159 | To avoid orphan processes on the system, it must be robust against 160 | failures and try very hard recover from failures. 161 | """ 162 | if self.config.has_section('local-store'): 163 | self.local_store = self.config.get('local-store', 'dir') 164 | self.file_handler = FileHandler() 165 | dispatcher.register('open', self.file_handler.open) 166 | dispatcher.register('send', self.file_handler.send) 167 | dispatcher.register('flush', self.file_handler.flush) 168 | dispatcher.register('loop', self.file_handler.loop) 169 | 170 | timeout = self.config.getfloat('graphite', 'timeout') 171 | connect_timeout = self.config.getfloat('graphite', 172 | 'connect-timeout', 173 | fallback=timeout) 174 | write_timeout = self.config.getfloat('graphite', 175 | 'write-timeout', 176 | fallback=timeout) 177 | graphite = GraphiteHandler( 178 | server=self.config.get('graphite', 'server'), 179 | port=self.config.getint('graphite', 'port'), 180 | connect_timeout=connect_timeout, 181 | write_timeout=write_timeout, 182 | retries=self.config.getint('graphite', 'retries'), 183 | interval=self.config.getfloat('graphite', 'interval'), 184 | delay=self.config.getfloat('graphite', 'delay'), 185 | backoff=self.config.getfloat('graphite', 'backoff'), 186 | queue_size=self.config.getint('graphite', 'queue-size') 187 | ) 188 | dispatcher.register('open', graphite.open) 189 | dispatcher.register('send', graphite.send) 190 | 191 | dispatcher.signal('open') 192 | 193 | try: 194 | while True: 195 | log.info('waiting for item from the queue') 196 | incoming_dir = self.tasks.get() 197 | log.info('received item %s', incoming_dir) 198 | if incoming_dir == STOP_SIGNAL: 199 | break 200 | start_time = time.time() 201 | 202 | # incoming_dir => /var/lib/haproxystats/incoming/1454016646 203 | # timestamp => 1454016646 204 | self.timestamp = os.path.basename(incoming_dir) 205 | 206 | # update filename for file handler. 207 | # This *does not* error if a file handler is not registered. 
208 | dispatcher.signal('loop', 209 | local_store=self.local_store, 210 | timestamp=self.timestamp) 211 | 212 | self.process_stats(incoming_dir) 213 | 214 | # This flushes data to file 215 | dispatcher.signal('flush') 216 | 217 | # Remove directory as data have been successfully processed. 218 | log.debug('removing %s', incoming_dir) 219 | try: 220 | shutil.rmtree(incoming_dir) 221 | except (FileNotFoundError, PermissionError, OSError) as exc: 222 | log.critical('failed to remove directory %s with:%s. ' 223 | 'This should not have happened as it means ' 224 | 'another worker processed data from this ' 225 | 'directory or something/someone removed the ' 226 | 'directory!', incoming_dir, exc) 227 | elapsed_time = time.time() - start_time 228 | log.info('total wall clock time in seconds %.3f', elapsed_time) 229 | data = ("{p}.haproxystats.{m} {v} {t}\n" 230 | .format(p=self.graphite_path, 231 | m='TotalWallClockTime', 232 | v="{t:.3f}".format(t=elapsed_time), 233 | t=self.timestamp)) 234 | dispatcher.signal('send', data=data) 235 | log.info('finished with %s', incoming_dir) 236 | except KeyboardInterrupt: 237 | log.critical('Ctrl-C received') 238 | 239 | return 240 | 241 | @send_wlc(output=dispatcher, name='AllStats') 242 | def process_stats(self, pathname): 243 | """Delegate the processing of statistics to other functions. 244 | 245 | Arguments: 246 | pathname (str): Directory where statistics from HAProxy are saved. 247 | """ 248 | # statistics for HAProxy daemon and for frontend/backend/server have 249 | # different format and haproxystats-pull save them using a different 250 | # file suffix, so we can distinguish them easier. 251 | files = get_files(pathname, FILE_SUFFIX_INFO) 252 | if not files: 253 | log.warning("%s directory doesn't contain any files with HAProxy " 254 | "daemon statistics", pathname) 255 | else: 256 | self.haproxy_stats(files) 257 | files = get_files(pathname, FILE_SUFFIX_STAT) 258 | 259 | if not files: 260 | log.warning("%s directory doesn't contain any files with site " 261 | "statistics", pathname) 262 | else: 263 | self.sites_stats(files) 264 | 265 | @send_wlc(output=dispatcher, name='HAProxy') 266 | def haproxy_stats(self, files): 267 | """Process statistics for HAProxy daemon. 268 | 269 | Arguments: 270 | files (list): A list of files which contain the output of 'show 271 | info' command on the stats socket. 272 | """ 273 | cnt_metrics = 1 # a metric counter 274 | log.info('processing statistics for HAProxy daemon') 275 | log.debug('processing files %s', ' '.join(files)) 276 | raw_info_stats = defaultdict(list) 277 | # Parse raw data and build a data structure, input looks like: 278 | # Name: HAProxy 279 | # Version: 1.6.3-4d747c-52 280 | # Release_date: 2016/02/25 281 | # Nbproc: 4 282 | # Uptime_sec: 59277 283 | # SslFrontendSessionReuse_pct: 0 284 | # .... 285 | with fileinput.input(files=files) as file_input: 286 | for line in file_input: 287 | if ': ' in line: 288 | key, value = line.split(': ', 1) 289 | try: 290 | numeric_value = int(value) 291 | except ValueError: 292 | pass 293 | else: 294 | raw_info_stats[key].append(numeric_value) 295 | 296 | if not raw_info_stats: 297 | log.error('failed to parse daemon statistics') 298 | return 299 | else: 300 | # Here is where Pandas enters and starts its magic. 
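            # A note on output format: every string handed to the 'send'
            # handlers is a Graphite plaintext-protocol datapoint, i.e.
            # "<metric path> <value> <epoch timestamp>\n".
            # Sketch of the input layout (values are illustrative, based on
            # the example above; one list entry per HAProxy process):
            #   raw_info_stats == {'Nbproc': [4, 4, 4, 4],
            #                      'Uptime_sec': [59277, 59276, 59277, 59277],
            #                      ...}
            # The DataFrame built below therefore has one row per process and
            # one column per metric; DAEMON_METRICS columns are summed across
            # processes, DAEMON_AVG_METRICS columns are averaged, and
            # CpuUsagePct is derived further down as 100 - Idle_pct.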
301 | try: 302 | dataframe = pandas.DataFrame(raw_info_stats) 303 | except ValueError as exc: 304 | log.error('failed to create Pandas object for daemon ' 305 | 'statistics %s', exc) 306 | return 307 | 308 | sums = dataframe.loc[:, DAEMON_METRICS].sum() 309 | avgs = dataframe.loc[:, DAEMON_AVG_METRICS].mean() 310 | cnt_metrics += sums.size + avgs.size 311 | 312 | # Pandas did all the hard work, let's join above tables and extract 313 | # statistics 314 | for values in pandas.concat([sums, avgs], axis=0).items(): 315 | data = ("{p}.daemon.{m} {v} {t}\n" 316 | .format(p=self.graphite_path, 317 | m=values[0].replace('.', '_'), 318 | v=values[1], 319 | t=self.timestamp)) 320 | dispatcher.signal('send', data=data) 321 | 322 | dataframe['CpuUsagePct'] = (dataframe.loc[:, 'Idle_pct'] 323 | .map(lambda x: (x * -1) + 100)) 324 | if dataframe.loc[:, 'Idle_pct'].size > 1: 325 | log.info('calculating percentiles for CpuUsagePct') 326 | percentiles = (dataframe.loc[:, 'CpuUsagePct'] 327 | .quantile(q=[0.25, 0.50, 0.75, 0.95, 0.99], 328 | interpolation='nearest')) 329 | for per in percentiles.items(): 330 | # per[0] = index => [0.25, 0.50, 0.75, 0.95, 0.99] 331 | # per[1] = percentile value 332 | cnt_metrics += 1 333 | data = ("{p}.daemon.{m} {v} {t}\n" 334 | .format(p=self.graphite_path, 335 | m=("{:.2f}PercentileCpuUsagePct" 336 | .format(per[0]).split('.')[1]), 337 | v=per[1], 338 | t=self.timestamp)) 339 | dispatcher.signal('send', data=data) 340 | 341 | cnt_metrics += 1 342 | data = ("{p}.daemon.{m} {v} {t}\n" 343 | .format(p=self.graphite_path, 344 | m="StdCpuUsagePct", 345 | v=dataframe.loc[:, 'CpuUsagePct'].std(), 346 | t=self.timestamp)) 347 | dispatcher.signal('send', data=data) 348 | 349 | if self.config.getboolean('process', 'calculate-percentages'): 350 | for metric in daemon_percentage_metrics(): 351 | cnt_metrics += 1 352 | log.info('calculating percentage for %s', metric.name) 353 | try: 354 | value = calculate_percentage_per_column(dataframe, 355 | metric) 356 | except KeyError: 357 | log.warning("metric %s doesn't exist", metric.name) 358 | else: 359 | data = ("{p}.daemon.{m} {v} {t}\n" 360 | .format(p=self.graphite_path, 361 | m=metric.title, 362 | v=value, 363 | t=self.timestamp)) 364 | dispatcher.signal('send', data=data) 365 | 366 | if self.config.getboolean('process', 'per-process-metrics'): 367 | log.info("processing statistics per daemon") 368 | indexed_by_worker = dataframe.set_index('Process_num') 369 | metrics_per_worker = (indexed_by_worker 370 | .loc[:, DAEMON_METRICS 371 | + ['CpuUsagePct'] 372 | + DAEMON_AVG_METRICS]) 373 | cnt_metrics += metrics_per_worker.size 374 | 375 | for worker, row in metrics_per_worker.iterrows(): 376 | for values in row.iteritems(): 377 | data = ("{p}.daemon.process.{w}.{m} {v} {t}\n" 378 | .format(p=self.graphite_path, 379 | w=worker, 380 | m=values[0].replace('.', '_'), 381 | v=values[1], 382 | t=self.timestamp)) 383 | dispatcher.signal('send', data=data) 384 | 385 | if self.config.getboolean('process', 'calculate-percentages'): 386 | for metric in daemon_percentage_metrics(): 387 | log.info('calculating percentage for %s per daemon', 388 | metric.name) 389 | _percentages = (metrics_per_worker 390 | .loc[:, [metric.limit, metric.name]] 391 | .apply(calculate_percentage_per_row, 392 | axis=1, 393 | args=(metric,))) 394 | 395 | cnt_metrics += _percentages.size 396 | for worker, row in _percentages.iterrows(): 397 | for values in row.iteritems(): 398 | data = ("{p}.daemon.process.{w}.{m} {v} {t}\n" 399 | .format(p=self.graphite_path, 400 | 
w=worker, 401 | m=values[0].replace('.', '_'), 402 | v=values[1], 403 | t=self.timestamp)) 404 | dispatcher.signal('send', data=data) 405 | 406 | data = ("{p}.haproxystats.MetricsHAProxy {v} {t}\n" 407 | .format(p=self.graphite_path, 408 | v=cnt_metrics, 409 | t=self.timestamp)) 410 | dispatcher.signal('send', data=data) 411 | 412 | log.info('number of HAProxy metrics %s', cnt_metrics) 413 | log.info('finished processing statistics for HAProxy daemon') 414 | 415 | def sites_stats(self, files): 416 | """Process statistics for frontends/backends/servers. 417 | 418 | Arguments: 419 | files (list): A list of files which contain the output of 'show 420 | stat' command on the stats socket of HAProxy. 421 | """ 422 | log.info('processing statistics for sites') 423 | log.debug('processing files %s', ' '.join(files)) 424 | log.debug('merging multiple csv files to one Pandas data frame') 425 | data_frame = concat_csv(files) 426 | excluded_backends = [] 427 | 428 | if data_frame is not None: 429 | # Perform some sanitization on the raw data 430 | if '# pxname' in data_frame.columns: 431 | log.debug('replace "# pxname" column with "pxname"') 432 | data_frame.rename(columns={'# pxname': 'pxname'}, inplace=True) 433 | if 'Unnamed: 62' in data_frame.columns: 434 | log.debug('remove "Unnamed: 62" column') 435 | try: 436 | data_frame.drop(labels=['Unnamed: 62'], 437 | axis=1, 438 | inplace=True) 439 | except ValueError as error: 440 | log.warning("failed to drop 'Unnamed: 62' column with: %s", 441 | error) 442 | # Sanitize the values for pxname (frontend's/backend's names) and 443 | # svname (server's names) columns by replacing dots with 444 | # underscores because Graphite uses the dot in the namespace. 445 | data_frame['pxname_'] = (data_frame.pxname 446 | .apply(lambda value: 447 | value.replace('.', '_'))) 448 | data_frame['svname_'] = (data_frame.svname 449 | .apply(lambda value: 450 | value.replace('.', '_'))) 451 | 452 | data_frame.drop('pxname', axis=1, inplace=True) 453 | data_frame.drop('svname', axis=1, inplace=True) 454 | 455 | if not isinstance(data_frame, pandas.DataFrame): 456 | log.warning('Pandas data frame was not created') 457 | return 458 | if len(data_frame.index) == 0: 459 | log.error('Pandas data frame is empty') 460 | return 461 | 462 | # For some metrics HAProxy returns nothing, so we replace them 463 | # with zeros 464 | data_frame.fillna(0, inplace=True) 465 | 466 | self.process_frontends(data_frame) 467 | 468 | exclude_backends_file = self.config.get('process', 469 | 'exclude-backends', 470 | fallback=None) 471 | if exclude_backends_file is not None: 472 | excluded_backends = load_file_content(exclude_backends_file) 473 | log.info('excluding backends %s', excluded_backends) 474 | # replace dots in backend names 475 | excluded_backends[:] = [x.replace('.', '_') 476 | for x in excluded_backends] 477 | 478 | filter_backend = ~data_frame['pxname_'].isin(excluded_backends) 479 | 480 | self.process_backends(data_frame, filter_backend) 481 | self.process_servers(data_frame, filter_backend) 482 | log.info('finished processing statistics for sites') 483 | else: 484 | log.error('failed to process statistics for sites') 485 | 486 | @send_wlc(output=dispatcher, name='Frontends') 487 | def process_frontends(self, data_frame): 488 | """Process statistics for frontends. 489 | 490 | Arguments: 491 | data_frame (obj): A pandas data_frame ready for processing. 
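
        Illustrative example (the frontend name is hypothetical): with the
        default metric list, a frontend configured as "www.example.com" is
        exported as datapoints of the form
        <graphite path>.frontend.www_example_com.<metric> <value> <timestamp>
        and a space-separated 'frontend-metrics' option in the [process]
        section (e.g. "frontend-metrics = scur rate req_tot", assuming those
        names exist in FRONTEND_METRICS) restricts the export to just those
        columns.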
492 | """ 493 | # Filtering for Pandas 494 | cnt_metrics = 1 495 | log.debug('processing statistics for frontends') 496 | is_frontend = data_frame['svname_'] == 'FRONTEND' 497 | excluded_frontends = [] 498 | metrics = self.config.get('process', 'frontend-metrics', fallback=None) 499 | 500 | if metrics is not None: 501 | metrics = metrics.split(' ') 502 | else: 503 | metrics = FRONTEND_METRICS 504 | log.debug('metric names for frontends %s', metrics) 505 | 506 | exclude_frontends_file = self.config.get('process', 507 | 'exclude-frontends', 508 | fallback=None) 509 | if exclude_frontends_file is not None: 510 | excluded_frontends = load_file_content(exclude_frontends_file) 511 | log.info('excluding frontends %s', excluded_frontends) 512 | # replace dots in frontend names 513 | excluded_frontends[:] = [x.replace('.', '_') 514 | for x in excluded_frontends] 515 | filter_frontend = (~data_frame['pxname_'] 516 | .isin(excluded_frontends)) 517 | 518 | frontend_stats = (data_frame[is_frontend & filter_frontend] 519 | .loc[:, ['pxname_'] + metrics]) 520 | 521 | # Group by frontend name and sum values for each column 522 | frontend_aggr_stats = frontend_stats.groupby(['pxname_']).sum() 523 | cnt_metrics += frontend_aggr_stats.size 524 | for index, row in frontend_aggr_stats.iterrows(): 525 | paths = self.get_metric_paths('frontend', index) 526 | for i in row.iteritems(): 527 | datapoints = [ 528 | "{p}.frontend.{f}.{m} {v} {t}\n" 529 | .format(p=path, 530 | f=index, 531 | m=i[0], 532 | v=i[1], 533 | t=self.timestamp) for path in paths 534 | ] 535 | for datapoint in datapoints: 536 | dispatcher.signal('send', data=datapoint) 537 | 538 | data = ("{p}.haproxystats.MetricsFrontend {v} {t}\n" 539 | .format(p=self.graphite_path, 540 | v=cnt_metrics, 541 | t=self.timestamp)) 542 | dispatcher.signal('send', data=data) 543 | log.info('number of frontend metrics %s', cnt_metrics) 544 | 545 | log.debug('finished processing statistics for frontends') 546 | 547 | @send_wlc(output=dispatcher, name='Backends') 548 | def process_backends(self, data_frame, filter_backend): 549 | """Process statistics for backends. 550 | 551 | Arguments: 552 | data_frame (obj): A pandas data_frame ready for processing. 553 | filter_backend: A filter to apply on data_frame. 554 | """ 555 | cnt_metrics = 1 556 | log.debug('processing statistics for backends') 557 | # Filtering for Pandas 558 | is_backend = data_frame['svname_'] == 'BACKEND' 559 | # For averages only consider entries with actual connections made 560 | got_traffic = data_frame['lbtot'] > 0 561 | 562 | metrics = self.config.get('process', 'backend-metrics', fallback=None) 563 | if metrics is not None: 564 | metrics = metrics.split(' ') 565 | else: 566 | metrics = BACKEND_METRICS 567 | log.debug('metric names for backends %s', metrics) 568 | # Get rows only for backends. For some metrics we need the sum and 569 | # for others the average, thus we split them. 
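        # Metrics in BACKEND_METRICS are summed across processes, those in
        # BACKEND_AVG_METRICS are averaged, and BACKEND_AVG_TIME_METRICS
        # (queue/response times) are averaged only over rows with lbtot > 0,
        # so idle processes reporting 0 don't drag the averages down.
        # Hypothetical example: a time metric of [0, 0, 120, 80] over 4
        # processes, of which only 2 served traffic, is reported as 100
        # rather than 50.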
570 | stats_sum = (data_frame[is_backend & filter_backend] 571 | .loc[:, ['pxname_'] + metrics]) 572 | stats_avg = (data_frame[is_backend & filter_backend] 573 | .loc[:, ['pxname_'] + BACKEND_AVG_METRICS]) 574 | stats_avg_time = (data_frame[is_backend & filter_backend & got_traffic] 575 | .loc[:, ['pxname_'] + BACKEND_AVG_TIME_METRICS]) 576 | 577 | aggr_sum = stats_sum.groupby(['pxname_'], as_index=False).sum() 578 | aggr_avg = stats_avg.groupby(['pxname_'], as_index=False).mean() 579 | aggr_avg_time = stats_avg_time.groupby(['pxname_'], as_index=False) \ 580 | .mean() 581 | merged_stats = aggr_sum.merge(aggr_avg, on='pxname_', how='outer') \ 582 | .merge(aggr_avg_time, on='pxname_', how='outer') 583 | 584 | rows, columns = merged_stats.shape 585 | cnt_metrics += rows * (columns - 1) # minus the index 586 | 587 | for _, row in merged_stats.iterrows(): 588 | backend = row[0] 589 | paths = self.get_metric_paths('backend', backend) 590 | for i in row[1:].iteritems(): 591 | datapoints = [ 592 | "{p}.backend.{b}.{m} {v} {t}\n" 593 | .format(p=path, 594 | b=backend, 595 | m=i[0], 596 | v=i[1], 597 | t=self.timestamp) for path in paths 598 | ] 599 | for datapoint in datapoints: 600 | dispatcher.signal('send', data=datapoint) 601 | 602 | data = ("{p}.haproxystats.MetricsBackend {v} {t}\n" 603 | .format(p=self.graphite_path, 604 | v=cnt_metrics, 605 | t=self.timestamp)) 606 | dispatcher.signal('send', data=data) 607 | 608 | log.info('number of backend metrics %s', cnt_metrics) 609 | log.debug('finished processing statistics for backends') 610 | 611 | @send_wlc(output=dispatcher, name='Servers') 612 | def process_servers(self, data_frame, filter_backend): 613 | """Process statistics for servers. 614 | 615 | Arguments: 616 | data_frame (obj): A pandas data_frame ready for processing. 617 | filter_backend: A filter to apply on data_frame. 618 | """ 619 | cnt_metrics = 1 620 | # A filter for rows with stats for servers 621 | is_server = data_frame['type'] == 2 622 | # For averages only consider entries with actual connections made 623 | got_traffic = data_frame['lbtot'] > 0 624 | 625 | log.debug('processing statistics for servers') 626 | 627 | server_metrics = self.config.get('process', 628 | 'server-metrics', 629 | fallback=None) 630 | if server_metrics is not None: 631 | server_metrics = server_metrics.split(' ') 632 | else: 633 | server_metrics = SERVER_METRICS 634 | log.debug('metric names for servers %s', server_metrics) 635 | # Get rows only for servers. For some metrics we need the sum and 636 | # for others the average, thus we split them. 
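        # The datapoints produced below take the form
        #   <graphite path>.backend.<backend>.server.<server>.<metric>
        # plus one TotalServers value per backend (the number of unique
        # servers), and, when 'aggr-server-metrics' is enabled, an extra
        #   <graphite path>.server.<server>.<metric>
        # series aggregated across all backends.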
637 | stats_sum = (data_frame[is_server & filter_backend] 638 | .loc[:, ['pxname_', 'svname_'] + server_metrics]) 639 | stats_avg = (data_frame[is_server & filter_backend] 640 | .loc[:, ['pxname_', 'svname_'] + SERVER_AVG_METRICS]) 641 | stats_avg_time = (data_frame[is_server & filter_backend & got_traffic] 642 | .loc[:, ['pxname_', 'svname_'] + SERVER_AVG_TIME_METRICS]) 643 | servers = (data_frame[is_server & filter_backend] 644 | .loc[:, ['pxname_', 'svname_']]) 645 | 646 | # Calculate the number of configured servers in a backend 647 | tot_servers = (servers 648 | .groupby(['pxname_']) 649 | .agg({'svname_': pandas.Series.nunique})) 650 | aggr_sum = (stats_sum 651 | .groupby(['pxname_', 'svname_'], as_index=False) 652 | .sum()) 653 | aggr_avg = (stats_avg 654 | .groupby(['pxname_', 'svname_'], as_index=False) 655 | .mean()) 656 | aggr_avg_time = (stats_avg_time 657 | .groupby(['pxname_', 'svname_'], as_index=False) 658 | .mean()) 659 | merged_stats = aggr_sum.merge(aggr_avg, 660 | on=['svname_', 'pxname_'], 661 | how='outer') \ 662 | .merge(aggr_avg_time, 663 | on=['svname_', 'pxname_'], 664 | how='outer') 665 | rows, columns = merged_stats.shape 666 | cnt_metrics += rows * (columns - 2) 667 | for backend, row in tot_servers.iterrows(): 668 | cnt_metrics += 1 669 | paths = self.get_metric_paths('backend', backend) 670 | datapoints = [ 671 | "{p}.backend.{b}.{m} {v} {t}\n" 672 | .format(p=path, 673 | b=backend, 674 | m='TotalServers', 675 | v=row[0], 676 | t=self.timestamp) for path in paths 677 | ] 678 | for datapoint in datapoints: 679 | dispatcher.signal('send', data=datapoint) 680 | 681 | for _, row in merged_stats.iterrows(): 682 | backend = row[0] 683 | server = row[1] 684 | paths = self.get_metric_paths('backend', backend) 685 | for i in row[2:].iteritems(): 686 | datapoints = [ 687 | "{p}.backend.{b}.server.{s}.{m} {v} {t}\n" 688 | .format(p=path, 689 | b=backend, 690 | s=server, 691 | m=i[0], 692 | v=i[1], 693 | t=self.timestamp) for path in paths 694 | ] 695 | for datapoint in datapoints: 696 | dispatcher.signal('send', data=datapoint) 697 | 698 | if self.config.getboolean('process', 'aggr-server-metrics'): 699 | log.info('aggregate stats for servers across all backends') 700 | # Produce statistics for servers across all backends 701 | stats_sum = (data_frame[is_server] 702 | .loc[:, ['svname_'] + SERVER_METRICS]) 703 | stats_avg = (data_frame[is_server] 704 | .loc[:, ['svname_'] + SERVER_AVG_METRICS]) 705 | stats_avg_time = (data_frame[is_server & got_traffic] 706 | .loc[:, ['svname_'] + SERVER_AVG_TIME_METRICS]) 707 | aggr_sum = (stats_sum 708 | .groupby(['svname_'], as_index=False) 709 | .sum()) 710 | aggr_avg = (stats_avg 711 | .groupby(['svname_'], as_index=False) 712 | .mean()) 713 | aggr_avg_time = (stats_avg_time 714 | .groupby(['svname_'], as_index=False) 715 | .mean()) 716 | merged_stats = aggr_sum.merge(aggr_avg, 717 | on=['svname_'], 718 | how='outer') \ 719 | .merge(aggr_avg_time, 720 | on=['svname_'], 721 | how='outer') 722 | rows, columns = merged_stats.shape 723 | cnt_metrics += rows * (columns - 1) # minus the index 724 | 725 | for _, row in merged_stats.iterrows(): 726 | server = row[0] 727 | paths = self.get_metric_paths('server', server) 728 | for i in row[1:].iteritems(): 729 | datapoints = [ 730 | "{p}.server.{s}.{m} {v} {t}\n" 731 | .format(p=path, 732 | s=server, 733 | m=i[0], 734 | v=i[1], 735 | t=self.timestamp) for path in paths 736 | ] 737 | for datapoint in datapoints: 738 | dispatcher.signal('send', data=datapoint) 739 | 740 | data = 
("{p}.haproxystats.MetricsServer {v} {t}\n" 741 | .format(p=self.graphite_path, 742 | v=cnt_metrics, 743 | t=self.timestamp)) 744 | dispatcher.signal('send', data=data) 745 | 746 | log.info('number of server metrics %s', cnt_metrics) 747 | log.debug('finished processing statistics for servers') 748 | 749 | 750 | def build_metric_patterns(self): 751 | """Compile regexes from frontend- backend- and server-groups config. 752 | 753 | Builds a list of pairs (pattern_name, regex) to be used when sending 754 | metrics. When a frontend, backend or server matches a given pattern, the 755 | string in pattern_name can be inserted into the metric. 756 | 757 | This list is stored in the class variable 'metric_patterns'. 758 | """ 759 | # Don't let Consumer instances run this at the same time 760 | lock = Lock() 761 | with lock: 762 | for (section, patterns) in Consumer.metric_patterns.items(): 763 | # Run only once 764 | if patterns: 765 | return 766 | config_section = "{}-groups".format(section) 767 | if config_section not in self.config.sections(): 768 | continue 769 | for (name, pattern) in self.config.items(config_section): 770 | # Skip items inherited from the [DEFAULTS] section 771 | if name in self.config.defaults(): 772 | continue 773 | try: 774 | regex = re.compile(pattern) 775 | except re.error as error: 776 | log.error('faied to compile %s pattern %s. Error: %s', 777 | config_section, name, error) 778 | else: 779 | Consumer.metric_patterns[section].append((name, regex)) 780 | log.debug('built metric patterns %s', Consumer.metric_patterns) 781 | 782 | 783 | def get_metric_paths(self, section, section_name): 784 | """Return the graphite path(s) of a metric. 785 | 786 | When the name of a frontend or backend matches a given pattern, the 787 | returned graphite path will include the name of the pattern, prefixed by 788 | a string defined in the 'group-namespace' config setting. The list of 789 | patterns and their names are defined in the 'frontend-groups', 790 | 'backend-groups' and 'server-groups' config sections. 791 | 792 | Additionally, if the config option 'group-namespace-double-writes' is 793 | true, this function will return the default graphite path as well, 794 | so every datapoint may be sent to graphite on both paths. 795 | 796 | If no groups are defined, or if there is no match for the given 797 | frontend/backend name, it returns only the default graphite path. 798 | 799 | If two or more patterns match a frontend/backend name, only one will be 800 | used: the first one declared in the config file. 801 | 802 | Arguments: 803 | section (str): Either 'frontend', 'backend' or 'server'. 804 | section_name (str): The name of said frontend/backend/server. 
805 | """ 806 | group = None 807 | for (pattern_name, pattern) in Consumer.metric_patterns[section]: 808 | if pattern.search(section_name): 809 | group = pattern_name 810 | break 811 | if group is None: 812 | return [self.graphite_path] 813 | try: 814 | path = Consumer.path_cache[section][section_name] 815 | except KeyError: 816 | # cache miss 817 | group_namespace = self.config.get('graphite', 'group-namespace') 818 | path = "{}.{}.{}".format(self.graphite_path, group_namespace, group) 819 | Consumer.path_cache[section][section_name] = path 820 | if self.double_writes: 821 | return [path, self.graphite_path] 822 | else: 823 | return [path] 824 | 825 | 826 | def main(): 827 | """Parse CLI arguments and launches main program.""" 828 | args = docopt(__doc__, version=VERSION) 829 | 830 | config = ConfigParser(interpolation=ExtendedInterpolation()) 831 | # Set defaults for all sections 832 | config.read_dict(copy.copy(DEFAULT_OPTIONS)) 833 | try: 834 | config.read(args['--file']) 835 | except ParsingError as exc: 836 | sys.exit(str(exc)) 837 | 838 | config_dir = args['--dir'] 839 | if config_dir is not None: 840 | if not os.path.isdir(config_dir): 841 | raise ValueError("{d} directory with .conf files doesn't exist" 842 | .format(d=config_dir)) 843 | else: 844 | config_files = glob.glob(os.path.join(config_dir, '*.conf')) 845 | try: 846 | config.read(config_files) 847 | except ParsingError as exc: 848 | sys.exit(str(exc)) 849 | 850 | incoming_dir = config.get('process', 'src-dir') 851 | 852 | if args['--print']: 853 | for section in sorted(DEFAULT_OPTIONS): 854 | if section == 'pull': 855 | continue 856 | print("[{}]".format(section)) 857 | for key, value in sorted(DEFAULT_OPTIONS[section].items()): 858 | print("{k} = {v}".format(k=key, v=value)) 859 | print() 860 | sys.exit(0) 861 | if args['--print-conf']: 862 | for section in sorted(config): 863 | if section == 'pull': 864 | continue 865 | print("[{}]".format(section)) 866 | for key, value in sorted(config[section].items()): 867 | print("{k} = {v}".format(k=key, v=value)) 868 | print() 869 | sys.exit(0) 870 | 871 | try: 872 | configuration_check(config, 'paths') 873 | configuration_check(config, 'process') 874 | configuration_check(config, 'graphite') 875 | read_write_access(config.get('process', 'src-dir')) 876 | check_metrics(config) 877 | except ValueError as exc: 878 | sys.exit(str(exc)) 879 | 880 | tasks = multiprocessing.Queue() 881 | handler = EventHandler(tasks=tasks) 882 | notifier = pyinotify.Notifier(watcher, handler) 883 | num_consumers = config.getint('process', 'workers') 884 | incoming_dir = config.get('process', 'src-dir') 885 | 886 | loglevel =\ 887 | config.get('process', 'loglevel').upper() # pylint: disable=no-member 888 | log.setLevel(getattr(logging, loglevel, None)) 889 | 890 | log.info('haproxystats-processs %s version started', VERSION) 891 | # process incoming data which were retrieved while processing was stopped 892 | for pathname in glob.iglob(incoming_dir + '/*'): 893 | if os.path.isdir(pathname): 894 | log.info('putting %s in queue', pathname) 895 | tasks.put(pathname) 896 | 897 | def shutdown(signalnb=None, frame=None): 898 | """Signal processes to exit. 899 | 900 | It adds STOP_SIGNAL to the queue, which causes processes to exit in a 901 | clean way. 
902 | 903 | Arguments: 904 | signalnb (int): The ID of signal 905 | frame (obj): Frame object at the time of receiving the signal 906 | """ 907 | log.info('received %s at %s', signalnb, frame) 908 | notifier.stop() 909 | for _ in range(num_consumers): 910 | log.info('sending stop signal to worker') 911 | tasks.put(STOP_SIGNAL) 912 | log.info('waiting for workers to finish their work') 913 | for consumer in consumers: 914 | consumer.join() 915 | log.info('exiting') 916 | sys.exit(0) 917 | 918 | # Register our graceful shutdown process to termination signals 919 | signal.signal(signal.SIGHUP, shutdown) 920 | signal.signal(signal.SIGTERM, shutdown) 921 | 922 | # Add our watcher 923 | while True: 924 | try: 925 | log.info('adding a watch for %s', incoming_dir) 926 | watcher.add_watch(incoming_dir, MASK, quiet=False, rec=False) 927 | except pyinotify.WatchManagerError as error: 928 | log.error('received error (%s), going to retry in few seconds', 929 | error) 930 | time.sleep(3) 931 | else: 932 | break 933 | 934 | log.info('creating %d consumers', num_consumers) 935 | consumers = [Consumer(tasks, config) for i in range(num_consumers)] 936 | for consumer in consumers: 937 | consumer.start() 938 | 939 | _thread = Checker( 940 | consumers, config.getfloat('process', 'liveness-check-interval') 941 | ) 942 | _thread.start() 943 | log.info('watching %s directory for incoming data', incoming_dir) 944 | notifier.loop(daemonize=False) 945 | 946 | 947 | if __name__ == '__main__': 948 | main() 949 | -------------------------------------------------------------------------------- /haproxystats/pull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim:fenc=utf-8 3 | # pylint: disable=too-many-statements 4 | # pylint: disable=too-many-arguments 5 | # pylint: disable=too-many-branches 6 | # pylint: disable=too-many-locals 7 | # 8 | """Pulls statistics from HAProxy daemon over UNIX/TCP socket(s). 9 | 10 | Usage: 11 | haproxystats-pull [-f ] [-p | -P] 12 | 13 | Options: 14 | -f, --file configuration file with settings 15 | [default: /etc/haproxystats.conf] 16 | -p, --print show default settings 17 | -P, --print-conf show configuration 18 | -h, --help show this screen 19 | -v, --version show version 20 | """ 21 | import os 22 | import asyncio 23 | from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED 24 | import sys 25 | import time 26 | import signal 27 | import shutil 28 | import logging 29 | from functools import partial 30 | from configparser import ConfigParser, ExtendedInterpolation, ParsingError 31 | import copy 32 | import glob 33 | from urllib.parse import urlparse 34 | from docopt import docopt 35 | 36 | from haproxystats import __version__ as VERSION 37 | from haproxystats import DEFAULT_OPTIONS 38 | from haproxystats.utils import (is_unix_socket, CMD_SUFFIX_MAP, 39 | configuration_check) 40 | 41 | LOG_FORMAT = ('%(asctime)s [%(process)d] [%(threadName)-10s:%(funcName)s] ' 42 | '%(levelname)-8s %(message)s') 43 | logging.basicConfig(format=LOG_FORMAT) 44 | log = logging.getLogger('root') # pylint: disable=I0011,C0103 45 | CMDS = ['show info', 'show stat'] 46 | 47 | 48 | @asyncio.coroutine 49 | def get(socket_name, cmd, storage_dir, loop, executor, config): 50 | """Fetch data from a UNIX and TCP socket. 51 | 52 | Sends a command to HAProxy over UNIX/TCP socket, reads the response and 53 | then offloads the writing of the received data to a thread, so we don't 54 | block this coroutine. 
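
    For illustration (the socket path is hypothetical): sending 'show stat'
    to the UNIX socket /run/haproxy/admin1.sock stores the response as
    <storage_dir>/admin1.sock_stat and 'show info' as
    <storage_dir>/admin1.sock_info; for TCP sockets the file name starts
    with "<host>:<port>" instead.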
55 | 56 | Arguments: 57 | socket_name (str or tuple): Either the full path of the UNIX socket 58 | or a tuple with two elements, where 1st element is the host and the 59 | second is the port. 60 | cmd (str): The command to send. 61 | storage_dir (str): The full path of the directory to save the response. 62 | loop (obj): A base event loop from asyncio module. 63 | executor (obj): A Threader executor to execute calls asynchronously. 64 | config (obj): A configParser object which holds configuration. 65 | 66 | Returns: 67 | True if statistics from a UNIX/TCP sockets are saved False otherwise. 68 | 69 | """ 70 | retries = config.getint('pull', 'retries') 71 | timeout = config.getfloat('pull', 'timeout') 72 | interval = config.getfloat('pull', 'interval') 73 | limit = config.getint('pull', 'buffer-limit') 74 | attempt = 0 # times to attempt a connect after a failure 75 | raised = None 76 | 77 | if isinstance(socket_name, str): 78 | socket_type = 'UNIX' 79 | address = socket_name 80 | elif isinstance(socket_name, tuple): 81 | host, port = socket_name 82 | address = "{h}:{p}".format(h=host, p=port) 83 | socket_type = 'TCP' 84 | 85 | log.debug('connecting to %s socket %s', socket_type, address) 86 | if retries == -1: 87 | attempt = -1 # -1 means retry indefinitely 88 | elif retries == 0: 89 | attempt = 1 # Zero means don't retry 90 | else: 91 | attempt = retries + 1 # any other value means retry N times 92 | while attempt != 0: 93 | if raised: # an exception was raised sleep before the next retry 94 | log.error('caught "%s" when connecting to %s socket %s, ' 95 | 'remaining tries %s, sleeping for %.2f seconds', 96 | raised, socket_type, address, attempt, interval) 97 | yield from asyncio.sleep(interval) 98 | try: 99 | if socket_type == 'UNIX': 100 | connect = asyncio.open_unix_connection(address, limit=limit) 101 | else: 102 | connect = asyncio.open_connection(host=host, 103 | port=port, 104 | limit=limit) 105 | reader, writer = yield from asyncio.wait_for(connect, timeout) 106 | except (ConnectionRefusedError, PermissionError, asyncio.TimeoutError, 107 | OSError) as exc: 108 | raised = exc 109 | else: 110 | log.debug('connection established to %s socket %s', 111 | socket_type, 112 | address) 113 | raised = None 114 | break 115 | 116 | attempt -= 1 117 | 118 | if raised is not None: 119 | log.error('failed to connect to %s socket %s after %s retries', 120 | socket_type, address, retries) 121 | return False 122 | else: 123 | log.debug('connection established to %s socket %s', 124 | socket_type, address) 125 | 126 | log.debug('sending command "%s" to %s socket %s', 127 | cmd, 128 | socket_type, 129 | address) 130 | writer.write('{c}\n'.format(c=cmd).encode()) 131 | data = yield from reader.read() 132 | writer.close() 133 | 134 | data_size = len(data) 135 | if data_size == 0: 136 | log.critical('received zero data') 137 | return False 138 | 139 | log.debug('received %s bytes from %s socket %s', 140 | data_size, socket_type, address) 141 | 142 | suffix = CMD_SUFFIX_MAP.get(cmd.split()[1]) 143 | if socket_type == 'UNIX': 144 | filename = os.path.basename(address) + suffix 145 | elif socket_type == 'TCP': 146 | filename = address + suffix 147 | 148 | filename = os.path.join(storage_dir, filename) 149 | log.debug('going to save data to %s', filename) 150 | # Offload the writing to a thread so we don't block ourselves. 151 | 152 | def write_file(): 153 | """Write data to a file. 154 | 155 | Returns: 156 | True if succeeds False otherwise. 
157 | 158 | """ 159 | try: 160 | with open(filename, 'w') as file_handle: 161 | file_handle.write(data.decode()) 162 | except OSError as exc: 163 | log.critical('failed to write data %s', exc) 164 | return False 165 | else: 166 | log.debug('data saved in %s', filename) 167 | return True 168 | 169 | result = yield from loop.run_in_executor(executor, write_file) 170 | 171 | return result 172 | 173 | 174 | @asyncio.coroutine 175 | def pull_stats(config, storage_dir, loop, executor): 176 | """Launch coroutines for pulling statistics from UNIX/TCP sockets. 177 | 178 | This a delegating routine. 179 | 180 | Arguments: 181 | config (obj): A configParser object which holds configuration. 182 | storage_dir (str): The absolute directory path to save the statistics. 183 | loop (obj): A base event loop. 184 | executor(obj): A ThreadPoolExecutor object. 185 | 186 | Returns: 187 | True if statistics from all sockets are fetched False otherwise. 188 | 189 | """ 190 | results = [] # stores the result of finished tasks 191 | sockets = [] 192 | pull_timeout = config.getfloat('pull', 'pull-timeout') 193 | if int(pull_timeout) == 0: 194 | pull_timeout = None 195 | 196 | if config.has_option('pull', 'socket-dir'): 197 | socket_dir = config.get('pull', 'socket-dir') 198 | socket_files = [f for f in glob.glob(socket_dir + '/*') 199 | if is_unix_socket(f)] 200 | if not socket_files: 201 | log.error("found zero UNIX sockets under %s to connect to", 202 | socket_dir) 203 | else: 204 | sockets.extend(socket_files) 205 | 206 | if config.has_option('pull', 'servers'): 207 | servers = config.get('pull', 'servers').strip(',').split(',') 208 | for server in servers: 209 | url = urlparse(server.strip()) 210 | if url.scheme == 'unix': 211 | sockets.append(url.path) 212 | elif url.scheme == 'tcp': 213 | sockets.append((url.hostname, url.port)) 214 | 215 | if not sockets: 216 | log.error("found zero UNIX and TCP sockets") 217 | return False 218 | 219 | log.debug('pull statistics') 220 | coroutines = [get(socket_name, cmd, storage_dir, loop, executor, config) 221 | for socket_name in sockets 222 | for cmd in CMDS] 223 | # Launch all connections. 224 | done, pending = yield from asyncio.wait(coroutines, 225 | timeout=pull_timeout, 226 | return_when=ALL_COMPLETED) 227 | for task in done: 228 | log.debug('task status: %s', task) 229 | results.append(task.result()) 230 | 231 | log.info('task report, done:%s pending:%s succeed:%s failed:%s', 232 | len(done), 233 | len(pending), 234 | results.count(True), 235 | results.count(False)) 236 | 237 | for task in pending: 238 | log.warning('cancelling task %s as it reached its timeout threshold of' 239 | ' %.2f seconds', task, pull_timeout) 240 | task.cancel() 241 | 242 | # only when all tasks are finished successfully we claim success 243 | return not pending and len(set(results)) == 1 and True in set(results) 244 | 245 | 246 | def supervisor(loop, config, executor): 247 | """Coordinate the pulling of HAProxy statistics from UNIX/TCP sockets. 248 | 249 | This is the client routine which launches requests to all HAProxy 250 | UNIX/TCP sockets for retrieving statistics and save them to file-system. 251 | It runs indefinitely until main program is terminated. 252 | 253 | Arguments: 254 | loop (obj): A base event loop from asyncio module. 255 | config (obj): A configParser object which holds configuration. 256 | executor(obj): A ThreadPoolExecutor object. 
257 | """ 258 | dst_dir = config.get('pull', 'dst-dir') 259 | tmp_dst_dir = config.get('pull', 'tmp-dst-dir') 260 | exit_code = 1 261 | 262 | interval = config.getint('pull', 'pull-interval') 263 | start_offset = time.time() % interval 264 | 265 | while True: 266 | timestamp = time.time() 267 | log.debug('entering while loop') 268 | try: 269 | queue = [x for x in os.listdir(dst_dir) 270 | if os.path.isdir(os.path.join(dst_dir, x))] 271 | except FileNotFoundError as exc: 272 | log.warning('%s disappeared: %s. Going to create it', dst_dir, exc) 273 | try: 274 | os.makedirs(dst_dir) 275 | except OSError as exc: 276 | # errno 17 => file exists 277 | if exc.errno != 17: 278 | sys.exit("failed to make directory {d}:{e}" 279 | .format(d=dst_dir, e=exc)) 280 | else: 281 | if len(queue) >= config.getint('pull', 'queue-size'): 282 | log.warning("queue reached max size of %s, pulling statistics " 283 | "is suspended", len(queue)) 284 | # calculate sleep time 285 | sleep = start_offset - time.time() % interval 286 | if sleep < 0: 287 | sleep += interval 288 | log.info('sleeping for %.3fs secs', sleep) 289 | time.sleep(sleep) 290 | continue 291 | # HAProxy statistics are stored in a directory and we use retrieval 292 | # time(seconds since the Epoch) as a name of the directory. 293 | # We first store them in a temporary place until we receive statistics 294 | # from all UNIX/TCP sockets. 295 | storage_dir = os.path.join(tmp_dst_dir, str(int(timestamp))) 296 | 297 | # Exit if our storage directory can't be created 298 | try: 299 | os.makedirs(storage_dir) 300 | except OSError as exc: 301 | # errno 17 => file exists 302 | if exc.errno == 17: 303 | old_data_files = glob.glob(storage_dir + '/*') 304 | for old_file in old_data_files: 305 | log.info('removing old data file %s', old_file) 306 | os.remove(old_file) 307 | else: 308 | msg = ("failed to make directory {d}:{e}" 309 | .format(d=storage_dir, e=exc)) 310 | log.critical(msg) 311 | log.critical('a fatal error has occurred, exiting..') 312 | break 313 | 314 | try: 315 | log.debug('launching delegating coroutine') 316 | result = loop.run_until_complete(pull_stats(config, storage_dir, 317 | loop, executor)) 318 | log.debug('delegating coroutine finished') 319 | except asyncio.CancelledError: 320 | log.info('Received CancelledError exception') 321 | exit_code = 0 322 | break 323 | 324 | # if and only if we received statistics from all sockets then move 325 | # statistics to the permanent directory. 326 | # NOTE: when temporary and permanent storage directory are on the same 327 | # file-system the move is actual a rename, which is an atomic 328 | # operation. 
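        # Illustrative flow (paths are hypothetical and assume dst-dir is the
        # src-dir watched by haproxystats-process):
        #   /var/lib/haproxystats/tmp/1454016646  -->  moved to
        #   /var/lib/haproxystats/incoming/1454016646
        # at which point haproxystats-process is notified via inotify and
        # picks the directory up.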
329 | if result: 330 | log.debug('move %s to %s', storage_dir, dst_dir) 331 | try: 332 | shutil.move(storage_dir, dst_dir) 333 | except OSError as exc: 334 | log.critical("failed to move %s to %s: %s", 335 | storage_dir, 336 | dst_dir, 337 | exc) 338 | log.critical('a fatal error has occurred, exiting..') 339 | break 340 | else: 341 | log.info('statistics are stored in %s', 342 | os.path.join(dst_dir, os.path.basename(storage_dir))) 343 | else: 344 | log.critical('failed to pull stats') 345 | log.debug('removing temporary directory %s', storage_dir) 346 | try: 347 | shutil.rmtree(storage_dir) 348 | except (FileNotFoundError, PermissionError, OSError) as exc: 349 | log.error('failed to remove temporary directory %s with:%s', 350 | storage_dir, 351 | exc) 352 | 353 | log.info('wall clock time in seconds: %.3f', time.time() - timestamp) 354 | # calculate sleep time 355 | sleep = start_offset - time.time() % interval 356 | if sleep < 0: 357 | sleep += interval 358 | log.info('sleeping for %.3fs secs', sleep) 359 | time.sleep(sleep) 360 | 361 | # It is very unlikely that threads haven't finished their job by now, but 362 | # they perform disk IO operations which can take some time in certain 363 | # situations, thus we want to wait for them in order to perform a clean 364 | # shutdown. 365 | log.info('waiting for threads to finish any pending IO tasks') 366 | executor.shutdown(wait=True) 367 | log.info('closing asyncio event loop') 368 | loop.close() 369 | log.info('exiting with status %s', exit_code) 370 | sys.exit(exit_code) 371 | 372 | 373 | def main(): 374 | """Parse CLI arguments and launch main program.""" 375 | args = docopt(__doc__, version=VERSION) 376 | 377 | config = ConfigParser(interpolation=ExtendedInterpolation()) 378 | # Set defaults for all sections 379 | config.read_dict(copy.copy(DEFAULT_OPTIONS)) 380 | # Load configuration from a file. NOTE: ConfigParser doesn't warn if user 381 | # sets a filename which doesn't exist, in this case defaults will be used. 382 | try: 383 | config.read(args['--file']) 384 | except ParsingError as exc: 385 | sys.exit(str(exc)) 386 | 387 | if args['--print']: 388 | for section in sorted(DEFAULT_OPTIONS): 389 | if section == 'pull' or section == 'DEFAULT': 390 | print("[{}]".format(section)) 391 | for key, value in sorted(DEFAULT_OPTIONS[section].items()): 392 | print("{k} = {v}".format(k=key, v=value)) 393 | print() 394 | sys.exit(0) 395 | if args['--print-conf']: 396 | for section in sorted(config): 397 | if section == 'pull' or section == 'DEFAULT': 398 | print("[{}]".format(section)) 399 | for key, value in sorted(config[section].items()): 400 | print("{k} = {v}".format(k=key, v=value)) 401 | print() 402 | sys.exit(0) 403 | 404 | try: 405 | configuration_check(config, 'pull') 406 | except ValueError as exc: 407 | sys.exit(str(exc)) 408 | 409 | loglevel = (config.get('pull', 'loglevel') # pylint: disable=no-member 410 | .upper()) 411 | log.setLevel(getattr(logging, loglevel, None)) 412 | 413 | log.info('haproxystats-pull %s version started', VERSION) 414 | # Setup our event loop 415 | loop = asyncio.get_event_loop() 416 | executor = ThreadPoolExecutor(max_workers=config.getint('pull', 417 | 'workers')) 418 | # Register shutdown to signals 419 | 420 | def shutdown(signalname): 421 | """Perform a clean shutdown. 
422 | 423 | Arguments: 424 | signalname (str): Signal name 425 | """ 426 | tasks_running = False 427 | log.info('received %s', signalname) 428 | 429 | for task in asyncio.Task.all_tasks(): 430 | if not task.done(): 431 | tasks_running = True 432 | log.info('cancelling %s task', task) 433 | task.cancel() 434 | 435 | if not tasks_running: 436 | log.info('no tasks were running when %s signal received', signal) 437 | log.info('waiting for threads to finish any pending IO tasks') 438 | executor.shutdown(wait=True) 439 | sys.exit(0) 440 | 441 | loop.add_signal_handler(signal.SIGHUP, partial(shutdown, 'SIGHUP')) 442 | loop.add_signal_handler(signal.SIGTERM, partial(shutdown, 'SIGTERM')) 443 | 444 | # a temporary directory to store fetched data 445 | tmp_dst_dir = config['pull']['tmp-dst-dir'] 446 | # a permanent directory to move data from the temporary directory. Data are 447 | # picked up by the process daemon from that directory. 448 | dst_dir = config['pull']['dst-dir'] 449 | for directory in dst_dir, tmp_dst_dir: 450 | try: 451 | os.makedirs(directory) 452 | except OSError as exc: 453 | # errno 17 => file exists 454 | if exc.errno != 17: 455 | sys.exit("failed to make directory {d}:{e}" 456 | .format(d=directory, e=exc)) 457 | supervisor(loop, config, executor) 458 | 459 | 460 | # This is the standard boilerplate that calls the main() function. 461 | if __name__ == '__main__': 462 | main() 463 | -------------------------------------------------------------------------------- /haproxystats/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # vim:fenc=utf-8 3 | # pylint: disable=too-many-instance-attributes 4 | # pylint: disable=too-many-arguments 5 | # pylint: disable=too-many-branches 6 | """Provide functions, constants and classes that are used by haproxystats.""" 7 | import os 8 | import stat 9 | from collections import defaultdict, deque 10 | from functools import wraps 11 | import io 12 | import socket 13 | import shutil 14 | import logging 15 | import time 16 | import configparser 17 | import glob 18 | import re 19 | from urllib.parse import urlparse 20 | import pyinotify 21 | import pandas 22 | 23 | from haproxystats.metrics import (MetricNamesPercentage, FRONTEND_METRICS, 24 | BACKEND_METRICS, BACKEND_AVG_METRICS, 25 | BACKEND_AVG_TIME_METRICS, 26 | SERVER_METRICS, SERVER_AVG_METRICS, 27 | SERVER_AVG_TIME_METRICS) 28 | 29 | 30 | log = logging.getLogger('root') # pylint: disable=I0011,C0103 31 | 32 | FILE_SUFFIX_INFO = '_info' 33 | FILE_SUFFIX_STAT = '_stat' 34 | CMD_SUFFIX_MAP = {'info': FILE_SUFFIX_INFO, 'stat': FILE_SUFFIX_STAT} 35 | 36 | OPTIONS_TYPE = { 37 | 'paths': { 38 | 'base-dir': 'get', 39 | }, 40 | 'pull': { 41 | 'loglevel': 'get', 42 | 'retries': 'getint', 43 | 'timeout': 'getfloat', 44 | 'interval': 'getfloat', 45 | 'pull-timeout': 'getfloat', 46 | 'pull-interval': 'getint', 47 | 'buffer-limit': 'getint', 48 | 'dst-dir': 'get', 49 | 'tmp-dst-dir': 'get', 50 | 'workers': 'getint', 51 | 'queue-size': 'getint', 52 | }, 53 | 'process': { 54 | 'workers': 'getint', 55 | 'src-dir': 'get', 56 | 'aggr-server-metrics': 'getboolean', 57 | 'per-process-metrics': 'getboolean', 58 | 'calculate-percentages': 'getboolean', 59 | 'liveness-check-interval': 'getfloat', 60 | }, 61 | 'graphite': { 62 | 'server': 'get', 63 | 'port': 'getint', 64 | 'retries': 'getint', 65 | 'interval': 'getfloat', 66 | 'connect-timeout': 'getfloat', 67 | 'write-timeout': 'getfloat', 68 | 'delay': 'getfloat', 69 | 'backoff': 'getfloat', 70 | 
'namespace': 'get', 71 | 'prefix-hostname': 'getboolean', 72 | 'fqdn': 'getboolean', 73 | 'queue-size': 'getint', 74 | }, 75 | 'local-store': { 76 | 'dir': 'get', 77 | }, 78 | } 79 | VALID_TCP_SOCKETS = [ 80 | 'tcp', 81 | 'unix', 82 | ] 83 | 84 | 85 | class BrokenConnection(Exception): 86 | """A wrapper of all possible exception during a TCP connect.""" 87 | 88 | def __init__(self, raised): 89 | """Initilaztion.""" 90 | self.raised = raised 91 | 92 | super().__init__() 93 | 94 | 95 | def load_file_content(filename): 96 | """Build list from the content of a file. 97 | 98 | Arguments: 99 | filename (str): A absolute path of a filename 100 | 101 | Returns: 102 | A list 103 | 104 | """ 105 | commented = re.compile(r'\s*?#') 106 | try: 107 | with open(filename, 'r') as _file: 108 | _content = [line.strip() for line in _file.read().splitlines() 109 | if not commented.match(line)] 110 | except OSError as exc: 111 | log.error('failed to read %s:%s', filename, exc) 112 | return [] 113 | else: 114 | return _content 115 | 116 | 117 | def is_unix_socket(path): 118 | """Check if path is a valid UNIX socket. 119 | 120 | Arguments: 121 | path (str): A file name path 122 | 123 | Returns: 124 | True if path is a valid UNIX socket otherwise False. 125 | 126 | """ 127 | mode = os.stat(path).st_mode 128 | 129 | return stat.S_ISSOCK(mode) 130 | 131 | 132 | def concat_csv(csv_files): 133 | """Perform a concatenation along several csv files. 134 | 135 | Arguments: 136 | csv_files (lst): A list of csv files. 137 | 138 | Returns: 139 | A pandas data frame object or None if fails to parse csv_files 140 | 141 | """ 142 | data_frames = [] 143 | for csv_file in csv_files: 144 | try: 145 | data_frame = pandas.read_csv(csv_file, low_memory=False) 146 | except (ValueError, OSError) as exc: 147 | log.error('Pandas failed to parse %s file with: %s', csv_file, exc) 148 | else: 149 | if not data_frame.empty: 150 | data_frames.append(data_frame) 151 | if data_frames: 152 | return pandas.concat(data_frames) 153 | 154 | return None 155 | 156 | 157 | def get_files(path, suffix): 158 | """Return the filenames from a directory which match a suffix. 159 | 160 | Arguments: 161 | path (str): Pathname 162 | suffix (str): Suffix to match against 163 | 164 | Returns: 165 | A list of filenames 166 | 167 | """ 168 | files = [filename 169 | for filename in glob.glob(path + '/*{s}'.format(s=suffix))] 170 | 171 | return files 172 | 173 | 174 | def retry_on_failures(retries=3, 175 | interval=0.9, 176 | backoff=3, 177 | exceptions=(ConnectionResetError, ConnectionRefusedError, 178 | ConnectionAbortedError, BrokenPipeError, 179 | OSError), 180 | exception_to_raise=BrokenConnection): 181 | """Perform a retry logic when an exception is raised by the decorated func. 182 | 183 | Arguments: 184 | retries (int): Maximum times to retry 185 | interval (float): Sleep this many seconds between retries 186 | backoff (int): Multiply interval by this factor after each failure 187 | exceptions (tuple): A list of exceptions to catch 188 | exception_to_raise (obj): An exception to raise when maximum tries 189 | have been reached. 190 | 191 | The decorator calls the function up to retries times if it raises an 192 | exception from the tuple. The decorated function will only be retried if 193 | it raises one of the specified exceptions. 194 | """ 195 | def dec(func): 196 | """Decorator. 
197 | 198 | Arguments: 199 | func (obj): A function to decorate 200 | """ 201 | def decorated_func(*args, **kwargs): 202 | """Retry decorated functions.""" 203 | backoff_interval = interval 204 | raised = None 205 | attempt = 0 # times to attempt a connect after a failure 206 | if retries == -1: # -1 means retry indefinitely 207 | attempt = -1 208 | elif retries == 0: # Zero means don't retry 209 | attempt = 1 210 | else: # any other value means retry N times 211 | attempt = retries + 1 212 | while attempt != 0: 213 | if raised: 214 | log.error('caught "%s" at "%s", remaining tries %s, ' 215 | 'sleeping for %.2f seconds', raised, 216 | func.__name__, attempt, backoff_interval) 217 | time.sleep(backoff_interval) 218 | backoff_interval = backoff_interval * backoff 219 | try: 220 | return func(*args, **kwargs) 221 | except exceptions as error: 222 | raised = error 223 | else: 224 | raised = None 225 | break 226 | 227 | attempt -= 1 228 | 229 | if raised: 230 | raise exception_to_raise(raised=raised) 231 | 232 | return decorated_func 233 | 234 | return dec 235 | 236 | 237 | class Dispatcher(object): 238 | """Dispatch data to different handlers.""" 239 | 240 | def __init__(self): 241 | """Initilaztion.""" 242 | self.handlers = defaultdict(list) 243 | 244 | def register(self, signal, callback): 245 | """Register a callback to a signal. 246 | 247 | Multiple callbacks can be assigned to the same signal. 248 | 249 | Arguments: 250 | signal (str): The name of the signal 251 | callbacl (obj): A callable object to call for the given signal. 252 | """ 253 | self.handlers[signal].append(callback) 254 | 255 | def unregister(self, signal, callback): 256 | """Unregister a callback to a signal. 257 | 258 | Arguments: 259 | signal (str): The name of the signal 260 | callbacl (obj): A callable object to call for the given signal. 261 | """ 262 | try: 263 | self.handlers[signal].remove(callback) 264 | except ValueError: 265 | log.debug('tried to unregister %s from unknown %s signal', 266 | callback, signal) 267 | 268 | def signal(self, signal, **kwargs): 269 | """Run registered handlers. 270 | 271 | Arguments: 272 | signal (str): A registered signal 273 | """ 274 | if signal in self.handlers: 275 | for handler in self.handlers.get(signal): 276 | handler(**kwargs) 277 | 278 | 279 | class GraphiteHandler(): 280 | """A handler to send data to graphite. 281 | 282 | Arguments: 283 | server (str): Server name or IP address. 
284 | port (int): Port to connect to 285 | retries (int): Numbers to retry on connection failure 286 | interval (float): Time to sleep between retries 287 | connect_timeout (float): Timeout on connection 288 | write_timeout (float): Timeout on sending data 289 | delay (float): Time to delay a connection attempt after last failure 290 | backoff (float): Multiply interval by this factor after each failure 291 | queue_size (int): Maximum size of the queue 292 | """ 293 | 294 | def __init__(self, 295 | server, 296 | port=3002, 297 | retries=1, 298 | interval=2, 299 | connect_timeout=1, 300 | write_timeout=1, 301 | delay=4, 302 | backoff=2, 303 | queue_size=1000000): 304 | """Initilaztion.""" 305 | self.server = server 306 | self.port = port 307 | self.retries = retries 308 | self.interval = interval 309 | self.connect_timeout = connect_timeout 310 | self.write_timeout = write_timeout 311 | self.delay = delay 312 | self.backoff = backoff 313 | self.queue_size = queue_size 314 | self.dqueue = deque([], maxlen=self.queue_size) 315 | self.connection = None 316 | self.timer = None 317 | self.exceptions = (ConnectionResetError, ConnectionRefusedError, 318 | ConnectionAbortedError, BrokenPipeError, OSError, 319 | socket.timeout) 320 | 321 | log.debug('connect timeout %.2fsecs write timeout %.2fsecs', 322 | self.connect_timeout, 323 | self.write_timeout) 324 | 325 | def open(self): 326 | """Open a connection to graphite relay.""" 327 | try: 328 | self.connect() 329 | except BrokenConnection as error: 330 | self.connection = None 331 | log.error('failed to connect to %s on port %s: %s', 332 | self.server, 333 | self.port, 334 | error.raised) 335 | else: 336 | self.connection.settimeout(self.write_timeout) 337 | log.info('successfully connected to %s on port %s, TCP info %s', 338 | self.server, 339 | self.port, 340 | self.connection) 341 | 342 | @property 343 | def connect(self): 344 | """Wrap connection so we can pass arguments to decorator.""" 345 | @retry_on_failures(retries=self.retries, 346 | interval=self.interval, 347 | backoff=self.backoff, 348 | exceptions=self.exceptions, 349 | exception_to_raise=BrokenConnection) 350 | def _create_connection(): 351 | """Try to open a connection. 352 | 353 | Exceptions are caught by the decorator which implements the retry 354 | logic. 
355 | """ 356 | log.info('connecting to %s on port %s', self.server, self.port) 357 | self.connection = socket.create_connection( 358 | (self.server, self.port), 359 | timeout=self.connect_timeout) 360 | 361 | return _create_connection 362 | 363 | def send(self, **kwargs): 364 | """Send data to graphite relay.""" 365 | self.dqueue.appendleft(kwargs.get('data')) 366 | 367 | while len(self.dqueue) != 0: 368 | item = self.dqueue.popleft() 369 | try: 370 | self.connection.sendall(bytes(item, 'utf-8')) 371 | # AttributeError means that open() method failed, all other 372 | # exceptions indicate connection problems 373 | except (AttributeError, BrokenPipeError, ConnectionResetError, 374 | ConnectionAbortedError, socket.timeout) as exc: 375 | self.dqueue.appendleft(item) 376 | # Only try to connect again if some time has passed 377 | if self.timer is None: 378 | self.timer = time.time() 379 | log.warning('graphite connection problem is detected') 380 | log.debug('timer is set to:%s', self.timer) 381 | elif time.time() - self.timer > self.delay: 382 | log.error('caught %s while sending data to graphite', exc) 383 | log.warning('%s secs since last failure', self.delay) 384 | log.info('TCP info: %s', self.connection) 385 | self.timer = None 386 | 387 | if len(self.dqueue) == self.dqueue.maxlen: 388 | log.critical("graphite dispatcher queue is full, old " 389 | "metrics will be dropped") 390 | 391 | if not isinstance(exc, AttributeError): 392 | self.close() 393 | else: 394 | log.warning('connection is not available') 395 | self.open() 396 | return 397 | except OSError as exc: 398 | self.dqueue.appendleft(item) 399 | # Unclear under which conditions we may get OSError 400 | log.warning('caught %s while sending data to graphite', exc) 401 | log.info('TCP info: %s', self.connection) 402 | self.close() 403 | self.open() 404 | return 405 | else: 406 | # Consume all items from the local deque before return to 407 | # the caller. This causes a small delay to the caller at the 408 | # benefit of flushing data as soon as possible which avoids 409 | # gaps in graphs. 410 | continue 411 | 412 | def close(self, **kwargs): # pylint: disable=unused-argument 413 | """Close TCP connection to graphite relay.""" 414 | log.info('closing connection to %s on port %s', self.server, self.port) 415 | log.info('TCP info: %s', self.connection) 416 | try: 417 | self.connection.close() 418 | except (ConnectionRefusedError, ConnectionResetError, socket.timeout, 419 | ConnectionAbortedError) as exc: 420 | log.warning('closing connection failed: %s', exc) 421 | except (AttributeError, OSError) as exc: 422 | log.critical('closing connection failed: %s. We should not receive' 423 | ' this exception, it is a BUG', 424 | exc) 425 | else: 426 | log.info('successfully closed connection to %s on port %s', 427 | self.server, 428 | self.port) 429 | 430 | 431 | dispatcher = Dispatcher() # pylint: disable=I0011,C0103 432 | 433 | 434 | class FileHandler(): 435 | """A handler to write data to a file.""" 436 | 437 | def __init__(self): 438 | """Initilaztion.""" 439 | self._input = None 440 | self._output = None 441 | 442 | def open(self): 443 | """Build a stringIO object in memory ready to be used.""" 444 | self._input = io.StringIO() 445 | 446 | def send(self, **kwargs): 447 | """Write data to a file-like object.""" 448 | self._input.write(kwargs.get('data')) 449 | 450 | def set_path(self, filepath): 451 | """Set the filepath to send data. 
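
        For illustration: loop() below calls this with a path of the form
        <local-store dir>/<epoch timestamp>/stats, so each processing round
        ends up in its own timestamped file.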
452 | 453 | Arguments: 454 | filepath (str): The pathname of the file 455 | """ 456 | log.debug('filepath for local-store set to %s', filepath) 457 | try: 458 | self._output = open(filepath, 'w') 459 | except (OSError, PermissionError) as error: 460 | log.error('failed to create %s: %s', filepath, error) 461 | 462 | def loop(self, **kwargs): 463 | """Rotate the file.""" 464 | base_dir = os.path.join(kwargs.get('local_store'), 465 | kwargs.get('timestamp')) 466 | try: 467 | os.makedirs(base_dir) 468 | except (OSError, PermissionError) as error: 469 | # errno 17 => file exists 470 | if error.errno != 17: 471 | log.error('failed to make directory %s: %s', base_dir, error) 472 | self.set_path(filepath=os.path.join(base_dir, 'stats')) 473 | 474 | def flush(self, **kwargs): # pylint: disable=unused-argument 475 | """Flush data to disk.""" 476 | self._input.seek(0) 477 | try: 478 | shutil.copyfileobj(self._input, self._output) 479 | self._output.flush() 480 | self._output.close() 481 | except (OSError, PermissionError, AttributeError) as error: 482 | log.error('failed to flush data to file: %s', error) 483 | self._input.close() 484 | 485 | self.open() 486 | 487 | 488 | class EventHandler(pyinotify.ProcessEvent): 489 | """An event handler for inotify to push items to a queue. 490 | 491 | If the event isn't for a directory no action is taken. 492 | 493 | Arguments: 494 | tasks (queue obj): A queue to put items. 495 | """ 496 | 497 | def my_init(self, tasks): # pylint: disable=arguments-differ 498 | """Initilaztion.""" 499 | self.tasks = tasks 500 | 501 | def _put_item_to_queue(self, pathname): 502 | """Add item to queue if and only if the pathname is a directory.""" 503 | if os.path.isdir(pathname): 504 | log.info('putting %s in queue', pathname) 505 | self.tasks.put(pathname) 506 | else: 507 | log.info("ignore %s as it isn't directory", pathname) 508 | 509 | def process_IN_CREATE(self, event): # pylint: disable=C0103 510 | """Add an item to the queue when a directory is created.""" 511 | log.debug('received an event for CREATE') 512 | self._put_item_to_queue(event.pathname) 513 | 514 | def process_IN_MOVED_TO(self, event): # pylint: disable=C0103 515 | """Add an item to the queue when a directory/file is moved.""" 516 | log.debug('received an event for MOVE') 517 | self._put_item_to_queue(event.pathname) 518 | 519 | 520 | def configuration_check(config, section): 521 | """Perform a sanity check on configuration. 522 | 523 | Arguments: 524 | config (obg): A configparser object which holds our configuration. 525 | section (str): Section name 526 | 527 | Raises: 528 | ValueError on the first occureance of invalid configuration 529 | 530 | Returns: 531 | None if all checks are successful. 532 | 533 | """ 534 | loglevel = config[section]['loglevel'] 535 | num_level = getattr(logging, loglevel.upper(), None) 536 | if not isinstance(num_level, int): 537 | raise ValueError("invalid configuration, section:'{s}' option:'{o}' " 538 | "error: invalid loglevel '{l}'" 539 | .format(s=section, 540 | o='loglevel', 541 | l=loglevel)) 542 | 543 | for option, getter in OPTIONS_TYPE[section].items(): 544 | try: 545 | getattr(config, getter)(section, option) 546 | except (configparser.Error, ValueError) as exc: 547 | # For some errors ConfigParser mentions section/option names and 548 | # for others not. We want for all possible errors to mention 549 | # section and option names in order to make the life of our user 550 | # easier. 
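            # Hypothetical example: getint('graphite', 'port') on a value of
            # 'abc' raises a bare "invalid literal for int() ..." ValueError,
            # which the first branch below wraps as
            # "invalid configuration, section:'graphite' option:'port' error:..."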
551 | if 'section' not in str(exc): 552 | raise ValueError("invalid configuration, section:'{s}' " 553 | "option:'{p}' error:{e}" 554 | .format(s=section, 555 | p=option, 556 | e=str(exc))) 557 | else: 558 | raise ValueError("invalid configuration, error:{e}" 559 | .format(e=str(exc))) 560 | 561 | # asyncio.StreamReader does not accept a value less than 1. 562 | if config.getint('pull', 'buffer-limit') < 1: 563 | raise ValueError("invalid configuration, you cannot set a value " 564 | "less than 1 for 'buffer-limit' option of 'pull' section") 565 | 566 | if config.has_option('pull', 'socket-dir'): 567 | try: 568 | socket_dir = config.get('pull', 'socket-dir') 569 | except (configparser.Error, ValueError) as exc: 570 | raise ValueError("invalid configuration, error:{e}" 571 | .format(e=str(exc))) 572 | else: 573 | if not socket_dir: 574 | raise ValueError("invalid configuration, no value for option " 575 | "'socket-dir'") 576 | 577 | if config.has_option('pull', 'servers'): 578 | try: 579 | servers = config.get('pull', 'servers').split(',') 580 | except (configparser.Error, ValueError) as exc: 581 | raise ValueError("invalid configuration, error:{e}" 582 | .format(e=str(exc))) 583 | else: 584 | if len(servers) == 1 and not servers[0]: 585 | raise ValueError("invalid configuration, no value for option " 586 | "'servers' in the section 'pull'") 587 | configuration_check_for_servers(servers) 588 | 589 | if section == 'process': 590 | groups = {'frontend-groups', 'backend-groups', 'server-groups'} 591 | configured_groups = groups.intersection(config.sections()) 592 | if config.has_option('graphite', 'group-namespace'): 593 | try: 594 | config.getboolean('graphite', 'group-namespace-double-writes') 595 | except (configparser.Error, ValueError) as exc: 596 | raise ValueError("invalid configuration, section:'graphite' " 597 | "option:'group-namespace-double-writes' " 598 | "error:{e}".format(e=exc)) 599 | if not configured_groups: 600 | raise ValueError("invalid configuration, at least one of these " 601 | "sections should exist: {}".format(groups)) 602 | else: 603 | if configured_groups: 604 | raise ValueError("invalid configuration, no value for option " 605 | "'group-namespace' in the section 'graphite'") 606 | 607 | 608 | def configuration_check_for_servers(servers, option='servers', section='pull'): 609 | """Perform a sanity check against the values for servers. 610 | 611 | Arguments: 612 | servers (list): A list of servers. 613 | option (str): The name of the option they belong to. 614 | section (str): The name of the section the option is part of. 615 | 616 | Raises: 617 | ValueError on the first occurrence of invalid configuration 618 | 619 | Returns: 620 | None if all checks are successful. 
621 | 622 | """ 623 | for server in servers: 624 | server = server.strip() 625 | if not server: 626 | raise ValueError("invalid configuration, invalid value for '{o}' " 627 | "option of '{s}' section" 628 | .format(o=option, s=section)) 629 | try: 630 | url = urlparse(server) 631 | except ValueError as exc: 632 | raise ValueError("invalid configuration, failed to parse '{o}' " 633 | "option of '{s}' section, error:{e}" 634 | .format(e=str(exc), o=option, s=section)) 635 | else: 636 | if url.scheme not in VALID_TCP_SOCKETS: 637 | raise ValueError("invalid configuration, only unix and tcp " 638 | "types of servers are supported in '{o}' " 639 | "option of '{s}' section" 640 | .format(o=option, s=section)) 641 | 642 | if url.scheme == 'tcp' and not url.port: 643 | raise ValueError("invalid configuration, port is not set in " 644 | "'{o}' option of '{s}' section" 645 | .format(o=option, s=section)) 646 | 647 | if url.scheme == 'unix' and not url.path: 648 | raise ValueError("invalid configuration, path is not set in " 649 | "'{o}' option of '{s}' section" 650 | .format(o=option, s=section)) 651 | 652 | 653 | def check_metrics(config): 654 | """Check if metrics set by the user are valid. 655 | 656 | Arguments: 657 | config (obj): A configparser object which holds our configuration. 658 | 659 | Raises: 660 | ValueError when metrics are not valid 661 | 662 | Returns: 663 | None if all checks are successful. 664 | 665 | """ 666 | valid_metrics_per_option = { 667 | 'frontend-metrics': FRONTEND_METRICS, 668 | 'backend-metrics': BACKEND_METRICS + BACKEND_AVG_METRICS + BACKEND_AVG_TIME_METRICS, 669 | 'server-metrics': SERVER_METRICS + SERVER_AVG_METRICS + SERVER_AVG_TIME_METRICS, 670 | } 671 | for option, valid_metrics in valid_metrics_per_option.items(): 672 | user_metrics = config.get('process', option, fallback=None) 673 | if user_metrics is not None: 674 | metrics = set(user_metrics.split(' ')) 675 | if not metrics: 676 | break 677 | if not set(valid_metrics).issuperset(metrics): 678 | raise ValueError("invalid configuration, section:'{s}' " 679 | "option:'{p}' error:'{e}'" 680 | .format(s='process', 681 | p=option, 682 | e='invalid list of metrics')) 683 | 684 | 685 | def read_write_access(directory): 686 | """Check if read/write access is granted on a directory. 687 | 688 | Arguments: 689 | directory (str): Directory name 690 | 691 | Raises: 692 | ValueError if either read or write access isn't granted 693 | 694 | Returns: 695 | None if read/write access is granted 696 | 697 | """ 698 | check_file = os.path.join(directory, '.read_write_check') 699 | try: 700 | with open(check_file, 'w') as _file: 701 | _file.write('') 702 | except OSError as exc: 703 | raise ValueError("invalid configuration, read and write access is not " 704 | "granted for '{d}' directory, error:{e}" 705 | .format(d=directory, 706 | e=str(exc))) 707 | else: 708 | os.remove(check_file) 709 | 710 | 711 | def daemon_percentage_metrics(): 712 | """Build a list of namedtuples. 713 | 714 | Those namedtuples hold metric names for the HAProxy daemon for which we 715 | calculate a percentage. 
716 | """ 717 | _list = [] 718 | _list.append(MetricNamesPercentage(name='CurrConns', 719 | limit='Maxconn', 720 | title='ConnPercentage')) 721 | _list.append(MetricNamesPercentage(name='ConnRate', 722 | limit='ConnRateLimit', 723 | title='ConnRatePercentage')) 724 | _list.append(MetricNamesPercentage(name='CurrSslConns', 725 | limit='MaxSslConns', 726 | title='SslConnPercentage')) 727 | _list.append(MetricNamesPercentage(name='SslRate', 728 | limit='SslRateLimit', 729 | title='SslRatePercentage')) 730 | 731 | return _list 732 | 733 | 734 | def calculate_percentage_per_row(row, metric): 735 | """Calculate the percentage per row for 2 columns. 736 | 737 | It selects per row 2 columns, metric.name and metric.limit, out of the 738 | dataframe and then calculate the percentage. 739 | 740 | Example where metric.name is 'CurrConns' and metric.limit is 'MaxConn'. 741 | +-------------+---------+-----------+ 742 | | | MaxConn | CurrConns | 743 | +-------------+---------+-----------+ 744 | | Process_num | | | 745 | | 0 | 300 | 13 | 746 | | 1 | 300 | 15 | 747 | | 2 | 300 | 11 | 748 | +-------------+---------+-----------+ 749 | 750 | It returns a Pandas Series with a column name set to metric.title 751 | +-------------+----------------+ 752 | | | ConnPercentage | 753 | +-------------+----------------+ 754 | | Process_num | | 755 | | 0 | 13 | 756 | | 1 | 15 | 757 | | 2 | 11 | 758 | +-------------+----------------+ 759 | 760 | Arguments: 761 | 762 | dataframe (obj): Pandas dataframe with statistics for HAProxy workers 763 | metric (tuple): A namedtuple of MetricNamesPercentage 764 | 765 | Returns: 766 | A Pandas Series with percentage as integer 767 | 768 | """ 769 | if row[metric.limit] == 0: 770 | return pandas.Series({metric.title: 0}) 771 | 772 | return pandas.Series( 773 | { 774 | metric.title: (100 * row[metric.name] 775 | / row[metric.limit]).astype('int') 776 | } 777 | ) 778 | 779 | 780 | def calculate_percentage_per_column(dataframe, metric): 781 | """Calculate the percentage against 2 Pandas Series. 782 | 783 | It selects 2 columns, metric.name and metric.limit, out of the dataframe, 784 | sums the values per column and then calculate the percentage. 785 | 786 | Example where metric.name is 'CurrConns' and metric.limit is 'MaxConn'. 787 | It calculates the sum per column and the retuns the percentage of CurrConns 788 | as part of Maxconn. 789 | +---+---------+-----------+ 790 | | | MaxConn | CurrConns | 791 | +---+---------+-----------+ 792 | | 0 | 300 | 13 | 793 | | 1 | 300 | 15 | 794 | | 2 | 300 | 11 | 795 | +---+---------+-----------+ 796 | 797 | Arguments: 798 | 799 | dataframe (obj): Pandas dataframe with statistics for HAProxy workers 800 | metric (tuple): A namedtuple of MetricNamesPercentage 801 | 802 | 803 | Returns: 804 | A percentage as integer 805 | 806 | """ 807 | _sum = dataframe.loc[:, [metric.name]].sum()[0] 808 | _sum_limit = dataframe.loc[:, [metric.limit]].sum()[0] 809 | if _sum_limit == 0: 810 | return 0 811 | 812 | return int(100 * _sum / _sum_limit) 813 | 814 | 815 | def send_wlc(output, name): 816 | """Send to graphite the wall clock time of the decorated method. 817 | 818 | The decorated method must have the following attributes: 819 | graphite_path (str): The graphite path to use for storing the metric 820 | timestamp (int): Time to credit the wallclock time 821 | 822 | Arguments: 823 | output (obj): A dispatcher object which has send method registered 824 | name (str): A name to append to the metric. 825 | """ 826 | def decorated(func): 827 | """Decorator. 
828 | 829 | Arguments: 830 | func (obj): A function to decorate 831 | """ 832 | @wraps(func) 833 | def wrapper(self, *args, **kwargs): 834 | """Time the execution of decorated function.""" 835 | start_time = time.time() 836 | result = func(self, *args, **kwargs) 837 | elapsed_time = '{t:.3f}'.format(t=time.time() - start_time) 838 | data = ("{p}.haproxystats.{m} {v} {t}\n" 839 | .format(p=getattr(self, 'graphite_path'), 840 | m='WallClockTime' + name, 841 | v=elapsed_time, 842 | t=getattr(self, 'timestamp'))) 843 | log.info("wall clock time in seconds for %s %s", 844 | func.__name__, 845 | elapsed_time) 846 | output.signal('send', data=data) 847 | 848 | return result 849 | 850 | return wrapper 851 | 852 | return decorated 853 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docopt>=0.6.1 2 | pandas>=0.17.1 3 | pyinotify>=0.9.6 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = haproxystats 3 | author = Pavlos Parissis 4 | author-email = pavlos.parissis@gmail.com 5 | maintainer = Pavlos Parissis 6 | maintainer-email = pavlos.parissis@gmail.com 7 | summary = A HAProxy statistics collection program 8 | home-page: https://github.com/unixsurfer/haproxystats 9 | license = Apache 2.0 10 | description-file = README.rst 11 | classifier = 12 | Development Status :: 5 - Production/Stable 13 | Environment :: Console 14 | Intended Audience :: Information Technology 15 | Intended Audience :: System Administrators 16 | Natural Language :: English 17 | Operating System :: POSIX 18 | Programming Language :: Python :: 3.4 19 | Topic :: Utilities 20 | keywords = haproxystats haproxy stats collector statistics 21 | 22 | [files] 23 | packages = 24 | haproxystats 25 | 26 | [entry_points] 27 | console_scripts = 28 | haproxystats-pull = haproxystats.pull:main 29 | haproxystats-process = haproxystats.process:main 30 | 31 | [pycodestyle] 32 | ignore = W503 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import setuptools 4 | 5 | setuptools.setup( 6 | setup_requires=['pbr'], 7 | pbr=True) 8 | --------------------------------------------------------------------------------