├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md └── workflows │ ├── ci.yml │ └── repolint.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── doc ├── app_top.png ├── proc_history.png └── proc_top.png ├── docker ├── db │ ├── 10-create_users.sql │ ├── 20-create_db.sql │ ├── 30-db_schema.sh │ └── Dockerfile ├── docker-compose.yml └── grafana │ ├── Dockerfile │ ├── dashboards │ ├── app_top.json │ ├── file.yml │ ├── proc_history.json │ └── proc_top.json │ └── datasources │ └── postgres.yml ├── include └── system_monitor.hrl ├── rebar.config ├── rebar.lock ├── src ├── system_monitor.app.src ├── system_monitor.erl ├── system_monitor_app.erl ├── system_monitor_callback.erl ├── system_monitor_events.erl ├── system_monitor_pg.erl ├── system_monitor_sup.erl └── system_monitor_top.erl └── test └── system_monitor_tests.erl /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | - Demonstrating empathy and kindness toward other people 21 | - Being respectful of differing opinions, viewpoints, and experiences 22 | - Giving and gracefully accepting constructive feedback 23 | - Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | - Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | - The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | - Trolling, insulting or derogatory comments, and personal or political attacks 33 | - Public or private harassment 34 | - Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | - Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | developers@klarna.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this Klarna project 2 | 3 | Are you here to help with this Klarna project? Welcome! Please read the following to better understand how to ask questions or work on something. 4 | 5 | All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md). Please make sure you are welcoming and friendly in all of our spaces. 
6 | 7 | ## Get in touch 8 | 9 | - Report bugs, suggest features or view the source code on GitHub. 10 | - If you have any questions concerning this product, please contact developers@klarna.com. 11 | 12 | ## Contributing to development 13 | 14 | At Klarna, we strive toward achieving the highest possible quality for our 15 | products. Therefore, we require you to follow these guidelines if you wish 16 | to contribute. 17 | 18 | Your contribution has to meet the following criteria: 19 | 20 | - It is accompanied by a description regarding what has been changed and why. 21 | - Pull requests should implement a boxed change, meaning they should optimally not try to address many things at once. 22 | - All code and documentation must follow the style specified by 23 | the included configuration. 24 | - New features and bug fixes must have accompanying unit tests. 25 | - All unit tests should pass. 26 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: 5 | - '*' 6 | pull_request: 7 | branches: 8 | - master 9 | jobs: 10 | build: 11 | strategy: 12 | matrix: 13 | platform: [ubuntu-latest] 14 | otp-version: [23, 24, 25, 26] 15 | runs-on: ${{ matrix.platform }} 16 | container: 17 | image: erlang:${{ matrix.otp-version }} 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | - name: Cache Dialyzer PLTs 23 | uses: actions/cache@v3 24 | with: 25 | path: | 26 | ~/.cache/rebar3/rebar3_*_plt 27 | _build/**/*_plt 28 | key: ${{ runner.os }}-otp${{ matrix.otp-version }}-dialyzer-${{ hashFiles('rebar.config') }} 29 | restore-keys: | 30 | ${{ runner.os }}-otp${{ matrix.otp-version }}-dialyzer- 31 | 32 | - name: Compile 33 | run: rebar3 do compile 34 | 35 | - name: Analyze 36 | run: rebar3 do xref, dialyzer 37 | 38 | - name: Test 39 | run: rebar3 do eunit, ct 40 | 
-------------------------------------------------------------------------------- /.github/workflows/repolint.yml: -------------------------------------------------------------------------------- 1 | name: Klarna repolint 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | # Checks-out the repository under $GITHUB_WORKSPACE 15 | - uses: actions/checkout@v2 16 | 17 | - name: Install dependencies 18 | run: npm install repolinter log-symbols 19 | 20 | # @TODO Remove when fixed 21 | - name: Fix missing dependency in repolint 22 | run: npm install is-windows 23 | 24 | - name: Use custom rules 25 | run: wget https://raw.githubusercontent.com/klarna-incubator/meta/master/repolint.json 26 | 27 | - name: Run repolint 28 | run: ./node_modules/.bin/repolinter $GITHUB_WORKSPACE 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.beam 3 | ebin/ 4 | .idea 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] - yyyy-mm-dd 9 | 10 | Batch insert operations in postgres backend. 11 | 12 | ## [2.2.2] - 2022-11-29 13 | 14 | Do not query memory usage of processes that have huge message queues. 15 | 16 | ## [2.2.1] - 2022-09-12 17 | 18 | Fixed a bug which could cause badrecord errors in system\_monitor\_top. 
19 | 20 | ## [2.2.0] - 2021-11-05 21 | 22 | Added support for configuring a module to use to send system_monitor events to 23 | an external destination. 24 | 25 | ## [2.1.0] - 2021-10-20 26 | 27 | Data format of system\_monitor\_top is changed to keep static data between 28 | ticks. Since this gen server is started by a supervisor that allows for some 29 | restarts, you can either let the server crash or stop+start this application. 30 | 31 | ## [2.0.0] - 2021-04-07 32 | 33 | Replace Kafka backend with a configurable one that defaults into Postgres 34 | 35 | ## [1.0.0] - 2020-09-02 36 | 37 | Initial version 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PICS=$(patsubst %.uml,%.png,$(wildcard doc/*.uml)) 2 | 3 | .PHONY: test 4 | test: 5 | rebar3 do compile, xref, dialyzer, eunit 6 | 7 | doc/%.png: doc/%.uml 8 | plantuml -tsvg $< 9 | 10 | .PHONY: doc 11 | doc: $(PICS) 12 | rebar3 edoc 13 | 14 | .PHONY: dev-start 15 | dev-start: 16 | docker-compose -f docker/docker-compose.yml up -d 17 | 18 | .PHONY: dev-stop 19 | dev-stop: 20 | docker-compose -f docker/docker-compose.yml down --rmi all 21 | 22 | .PHONY: clean 23 | clean: 24 | rm -rf _build 25 | 26 | .PHONY: hex-publish 27 | hex-publish: clean 28 | rebar3 as dev hex publish 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # system_monitor 2 | > Erlang telemetry collector 3 | 4 | [![Build 
Status][ci-image]][ci-url] 5 | [![License][license-image]][license-url] 6 | [![Developed at Klarna][klarna-image]][klarna-url] 7 | 8 | `system_monitor` is a BEAM VM monitoring and introspection application 9 | that helps in troubleshooting live systems. It collects various 10 | information about Erlang processes and applications. 11 | Unlike `observer`, `system_monitor` does not require 12 | connecting to the monitored system via the Erlang distribution protocol, 13 | and can be used to monitor systems with very tight access 14 | restrictions. 15 | 16 | ## Features 17 | 18 | ### Process top 19 | 20 | Information about the top N Erlang processes consuming the most resources 21 | (such as reductions or memory), or having the longest message queues, is 22 | presented on the process top dashboard: 23 | 24 | ![Process top](doc/proc_top.png) 25 | 26 | Historical data can be accessed via the standard Grafana time 27 | picker. The `status` panel can display important information about the 28 | node state. Pids of the processes on that dashboard are clickable 29 | links that lead to the process history dashboard. 30 | 31 | ### Process history 32 | ![Process history](doc/proc_history.png) 33 | 34 | Process history dashboard displays time series data about a certain 35 | Erlang process. Note that some data points can be missing if the 36 | process didn't consume enough resources to appear in the process top. 37 | 38 | ### Application top 39 | ![Application top](doc/app_top.png) 40 | 41 | Application top dashboard contains various information aggregated per 42 | OTP application. 43 | 44 | ## Usage example 45 | 46 | In order to integrate `system_monitor` into your system, simply add it 47 | to the release apps. Add the following lines to `rebar.config`: 48 | 49 | ```erlang 50 | {deps, [..., system_monitor]}. 51 | 52 | {relx, 53 | [ {release, {my_release, "1.0.0"}, 54 | [kernel, sasl, ..., system_monitor]} 55 | ]}. 
56 | ``` 57 | 58 | To enable export to Postgres: 59 | 60 | ```erlang 61 | application:load(system_monitor), 62 | application:set_env(system_monitor, callback_mod, system_monitor_pg) 63 | ``` 64 | 65 | ### Custom node status 66 | 67 | `system_monitor` can export arbitrary node status information that is 68 | deemed important for the operator. This is done by defining a callback 69 | function that returns an HTML-formatted string (or iolist): 70 | 71 | ```erlang 72 | -module(foo). 73 | 74 | -export([node_status/0]). 75 | 76 | node_status() -> 77 | ["my node type
", 78 | case healthy() of 79 | true -> "UP
"; 80 | false -> "DEGRADED
" 81 | end, 82 | io_lib:format("very important value=~p", [very_important_value()]) 83 | ]. 84 | ``` 85 | 86 | This callback then needs to be added to the system_monitor application 87 | environment: 88 | 89 | ```erlang 90 | {system_monitor, 91 | [ {node_status_fun, {foo, node_status}} 92 | ... 93 | ]} 94 | ``` 95 | 96 | More information about configurable options is found [here](src/system_monitor.app.src). 97 | 98 | ## How it all works out 99 | 100 | System_monitor will spawn several processes that handle different states: 101 | 102 | * `system_monitor_top` 103 | Collects a certain amount of data from the BEAM for a preconfigured number of processes 104 | * `system_monitor_events` 105 | Subscribes to certain types of preconfigured BEAM events such as: busy_port, long_gc, long_schedule etc. 106 | * `system_monitor` 107 | Runs a set of preconfigured `monitors` periodically 108 | 109 | ### What are the preconfigured monitors 110 | 111 | * `check_process_count` 112 | Logs if the process_count passes a certain threshold 113 | * `suspect_procs` 114 | Logs if it detects processes with suspiciously high memory 115 | * `report_full_status` 116 | Gets the state from `system_monitor_top` and produces to a backend module 117 | that implements the `system_monitor_callback` behavior, selected by binding 118 | `callback_mod` in the `system_monitor` application environment to that module. 119 | If `callback_mod` is unbound, this monitor is disabled. 120 | The preconfigured backend is Postgres and is implemented via `system_monitor_pg`. 121 | 122 | `system_monitor_pg` allows for Postgres being temporarily down by storing the stats in its own internal buffer. 123 | This buffer is built with a sliding window that will stop the state from growing too big whenever 124 | Postgres is down for too long. On top of this `system_monitor_pg` has a built-in load 125 | shedding mechanism that protects itself once the message queue length grows bigger than a certain level. 
126 | 127 | ## Local development 128 | A Postgres and Grafana cluster can be spun up using `make dev-start` and stopped using `make dev-stop`. 129 | Start `system_monitor` by calling `rebar3 shell` and start the application with `application:ensure_all_started(system_monitor)`. 130 | 131 | At this point a Grafana instance will be available on localhost:3000 with default login "admin" and password 132 | "admin" including some predefined dashboards. 133 | 134 | ## Production setup 135 | For production, a similar Postgres has to be set up as is done in the Dockerfile for Postgres in case one chooses to go with a system_monitor -> Postgres setup. 136 | 137 | ## How to contribute 138 | 139 | See our guide on [contributing](.github/CONTRIBUTING.md). 140 | 141 | ## Release History 142 | 143 | See our [changelog](CHANGELOG.md). 144 | 145 | ## License 146 | 147 | Copyright © 2020-2023 Klarna Bank AB 148 | 149 | For license details, see the [LICENSE](LICENSE) file in the root of this project. 150 | 151 | 152 | 153 | [ci-image]: https://img.shields.io/badge/build-passing-brightgreen?style=flat-square 154 | [ci-url]: https://github.com/klarna-incubator/TODO 155 | [license-image]: https://img.shields.io/badge/license-Apache%202-blue?style=flat-square 156 | [license-url]: http://www.apache.org/licenses/LICENSE-2.0 157 | [klarna-image]: 
https://img.shields.io/badge/%20-Developed%20at%20Klarna-black?labelColor=ffb3c7&style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAOCAYAAAAmL5yKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEbAAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAALQAAAAAQAAAtAAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAABCgAwAEAAAAAQAAAA4AAAAA0LMKiwAAAAlwSFlzAABuugAAbroB1t6xFwAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDUuNC4wIj4KICAgPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4KICAgICAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgICAgICAgICAgeG1sbnM6dGlmZj0iaHR0cDovL25zLmFkb2JlLmNvbS90aWZmLzEuMC8iPgogICAgICAgICA8dGlmZjpPcmllbnRhdGlvbj4xPC90aWZmOk9yaWVudGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KTMInWQAAAVBJREFUKBVtkz0vREEUhsdXgo5qJXohkUgQ0fgFNFpR2V5ClP6CQu9PiB6lEL1I7B9A4/treZ47c252s97k2ffMmZkz5869m1JKL/AFbzAHaiRbmsIf4BdaMAZqMFsOXNxXkroKbxCPV5l8yHOJLVipn9/vEreLa7FguSN3S2ynA/ATeQuI8tTY6OOY34DQaQnq9mPCDtxoBwuRxPfAvPMWnARlB12KAi6eLTPruOOP4gcl33O6+Sjgc83DJkRH+h2MgorLzaPy68W48BG2S+xYnmAa1L+nOxEduMH3fgjGFvZeVkANZau68B6CrgJxWosFFpF7iG+h5wKZqwt42qIJtARu/ix+gqsosEq8D35o6R3c7OL4lAnTDljEe9B3Qa2BYzmHemDCt6Diwo6JY7E+A82OnN9HuoBruAQvUQ1nSxP4GVzBDRyBfygf6RW2/gD3NmEv+K/DZgAAAABJRU5ErkJggg== 158 | [klarna-url]: https://github.com/klarna-incubator 159 | -------------------------------------------------------------------------------- /doc/app_top.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klarna/system_monitor/5221666bef6dd18013fb4d9cf9a775e77619b47c/doc/app_top.png -------------------------------------------------------------------------------- /doc/proc_history.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/klarna/system_monitor/5221666bef6dd18013fb4d9cf9a775e77619b47c/doc/proc_history.png -------------------------------------------------------------------------------- /doc/proc_top.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klarna/system_monitor/5221666bef6dd18013fb4d9cf9a775e77619b47c/doc/proc_top.png -------------------------------------------------------------------------------- /docker/db/10-create_users.sql: -------------------------------------------------------------------------------- 1 | CREATE USER system_monitor WITH PASSWORD 'system_monitor_password'; 2 | CREATE USER grafana WITH PASSWORD 'system_monitor_password'; 3 | -------------------------------------------------------------------------------- /docker/db/20-create_db.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE system_monitor; 2 | -------------------------------------------------------------------------------- /docker/db/30-db_schema.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | psql -v ON_ERROR_STOP=1 --username "system_monitor" --dbname "system_monitor" <<-EOSQL 5 | 6 | ----------------------------------------------------------------------------------- 7 | -- prc table 8 | ----------------------------------------------------------------------------------- 9 | 10 | create table if not exists prc ( 11 | node text not null, 12 | ts timestamp without time zone not null, 13 | pid text not null, 14 | dreductions double precision not null, 15 | dmemory double precision not null, 16 | reductions bigint not null, 17 | memory bigint not null, 18 | message_queue_len bigint not null, 19 | current_function text, 20 | initial_call text, 21 | registered_name text, 22 | stack_size bigint, 23 | heap_size bigint, 24 | total_heap_size bigint, 25 | current_stacktrace text, 
26 | group_leader text 27 | ) partition by range(ts); 28 | 29 | alter table prc owner to system_monitor; 30 | grant insert on table prc to system_monitor; 31 | grant select on table prc to grafana; 32 | 33 | ----------------------------------------------------------------------------------- 34 | -- app_top table 35 | ----------------------------------------------------------------------------------- 36 | DO \$\$ 37 | BEGIN 38 | IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'app_top_unit') THEN 39 | CREATE TYPE app_top_unit AS 40 | enum ('reductions', 'memory', 'processes'); 41 | END IF; 42 | END\$\$; 43 | 44 | create table if not exists app_top ( 45 | node text, 46 | ts timestamp without time zone not null, 47 | application text, 48 | unit app_top_unit, 49 | value numeric 50 | ) partition by range(ts); 51 | 52 | alter table app_top owner to system_monitor; 53 | grant insert on table app_top to system_monitor; 54 | grant select on table app_top to grafana; 55 | 56 | ----------------------------------------------------------------------------------- 57 | -- fun_top table 58 | ----------------------------------------------------------------------------------- 59 | DO \$\$ 60 | BEGIN 61 | IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'fun_type') THEN 62 | CREATE TYPE fun_type AS 63 | enum ('initial_call', 'current_function'); 64 | END IF; 65 | END\$\$; 66 | 67 | create table if not exists fun_top ( 68 | node text, 69 | ts timestamp without time zone not null, 70 | fun text, 71 | fun_type fun_type, 72 | num_processes numeric 73 | ) partition by range(ts); 74 | 75 | alter table fun_top owner to system_monitor; 76 | grant insert on table fun_top to system_monitor; 77 | grant select on table fun_top to grafana; 78 | 79 | ----------------------------------------------------------------------------------- 80 | -- node_role table 81 | ----------------------------------------------------------------------------------- 82 | 83 | create table if not exists 
node_role ( 84 | node text not null, 85 | ts timestamp without time zone not null, 86 | data text 87 | ) partition by range(ts); 88 | 89 | alter table node_role owner to system_monitor; 90 | grant delete on table node_role to system_monitor; 91 | grant select on table node_role to system_monitor; 92 | grant insert on table node_role to system_monitor; 93 | grant select on table node_role to grafana; 94 | 95 | create index if not exists node_role_ts_idx on node_role(ts); 96 | 97 | ----------------------------------------------------------------------------------- 98 | -- node table 99 | ----------------------------------------------------------------------------------- 100 | 101 | create table if not exists node ( 102 | node text not null primary key 103 | ); 104 | 105 | alter table node owner to system_monitor; 106 | grant select on table node to system_monitor; 107 | grant insert on table node to system_monitor; 108 | grant select on table node to grafana; 109 | 110 | create or replace function update_nodes() 111 | returns trigger 112 | language plpgsql as 113 | \$\$ 114 | begin 115 | insert into node(node) values (NEW.node) on conflict do nothing; 116 | return null; 117 | end; 118 | \$\$; 119 | 120 | drop trigger if exists update_nodes_trigger on node_role; 121 | create trigger update_nodes_trigger 122 | after insert on node_role 123 | for each row 124 | execute procedure update_nodes(); 125 | 126 | EOSQL 127 | -------------------------------------------------------------------------------- /docker/db/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:12.4 2 | 3 | COPY *.sql /docker-entrypoint-initdb.d/ 4 | COPY *.sh /docker-entrypoint-initdb.d/ 5 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.6" 2 | 3 | services: 4 | db: 5 | build: db 6 | ports: 
7 | - 5432:5432 8 | environment: 9 | - POSTGRES_PASSWORD=system_monitor_password 10 | 11 | grafana: 12 | build: grafana 13 | depends_on: [db] 14 | ports: 15 | - 3000:3000 16 | -------------------------------------------------------------------------------- /docker/grafana/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM grafana/grafana:latest 2 | 3 | COPY datasources/*.yml /etc/grafana/provisioning/datasources/ 4 | COPY dashboards/*.json /var/lib/grafana/dashboards/ 5 | COPY dashboards/*.yml /etc/grafana/provisioning/dashboards/ 6 | -------------------------------------------------------------------------------- /docker/grafana/dashboards/app_top.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 1, 19 | "iteration": 1596192927850, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "aliasColors": {}, 24 | "bars": true, 25 | "dashLength": 10, 26 | "dashes": false, 27 | "datasource": "Postgres", 28 | "fill": 1, 29 | "fillGradient": 0, 30 | "gridPos": { 31 | "h": 9, 32 | "w": 12, 33 | "x": 0, 34 | "y": 0 35 | }, 36 | "hiddenSeries": false, 37 | "id": 6, 38 | "legend": { 39 | "avg": false, 40 | "current": false, 41 | "max": false, 42 | "min": false, 43 | "show": true, 44 | "total": false, 45 | "values": false 46 | }, 47 | "lines": false, 48 | "linewidth": 1, 49 | "links": [], 50 | "nullPointMode": "null", 51 | "options": { 52 | "dataLinks": [] 53 | }, 54 | "percentage": false, 55 | "pointradius": 5, 56 | "points": false, 57 | "renderer": "flot", 58 | "repeat": null, 59 | "repeatDirection": "h", 60 | "seriesOverrides": [], 61 | "spaceLength": 10, 62 | 
"stack": true, 63 | "steppedLine": false, 64 | "targets": [ 65 | { 66 | "alias": "", 67 | "format": "time_series", 68 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n application AS \"metric\",\n log(avg(value))\nFROM\n app_top, tp\nWHERE\n unit = 'reductions' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY application,time\nORDER BY time", 69 | "refId": "A" 70 | } 71 | ], 72 | "thresholds": [], 73 | "timeFrom": null, 74 | "timeRegions": [], 75 | "timeShift": null, 76 | "title": "reductions per application (log scale)", 77 | "tooltip": { 78 | "shared": false, 79 | "sort": 2, 80 | "value_type": "individual" 81 | }, 82 | "type": "graph", 83 | "xaxis": { 84 | "buckets": null, 85 | "mode": "time", 86 | "name": null, 87 | "show": true, 88 | "values": [] 89 | }, 90 | "yaxes": [ 91 | { 92 | "format": "short", 93 | "label": null, 94 | "logBase": 1, 95 | "max": null, 96 | "min": null, 97 | "show": true 98 | }, 99 | { 100 | "format": "short", 101 | "label": null, 102 | "logBase": 1, 103 | "max": null, 104 | "min": null, 105 | "show": true 106 | } 107 | ], 108 | "yaxis": { 109 | "align": false, 110 | "alignLevel": null 111 | } 112 | }, 113 | { 114 | "aliasColors": {}, 115 | "bars": true, 116 | "dashLength": 10, 117 | "dashes": false, 118 | "datasource": "Postgres", 119 | "fill": 1, 120 | "fillGradient": 0, 121 | "gridPos": { 122 | "h": 9, 123 | "w": 12, 124 | "x": 12, 125 | "y": 0 126 | }, 127 | "hiddenSeries": false, 128 | "id": 8, 129 | "legend": { 130 | "avg": false, 131 | "current": false, 132 | "max": false, 133 | "min": false, 134 | "show": true, 135 | "total": false, 136 | "values": false 137 | }, 138 | "lines": false, 139 | "linewidth": 1, 140 | "links": [], 141 | "nullPointMode": "null", 142 | "options": { 143 | "dataLinks": [] 144 | }, 145 | "percentage": false, 146 | "pointradius": 5, 147 | "points": false, 148 | "renderer": "flot", 149 | 
"seriesOverrides": [], 150 | "spaceLength": 10, 151 | "stack": true, 152 | "steppedLine": false, 153 | "targets": [ 154 | { 155 | "alias": "", 156 | "format": "time_series", 157 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n application AS \"metric\",\n avg(value)\nFROM\n app_top, tp\nWHERE\n unit = 'memory' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY application,time\nORDER BY time", 158 | "refId": "A" 159 | } 160 | ], 161 | "thresholds": [], 162 | "timeFrom": null, 163 | "timeRegions": [], 164 | "timeShift": null, 165 | "title": "memory per application", 166 | "tooltip": { 167 | "shared": false, 168 | "sort": 2, 169 | "value_type": "individual" 170 | }, 171 | "type": "graph", 172 | "xaxis": { 173 | "buckets": null, 174 | "mode": "time", 175 | "name": null, 176 | "show": true, 177 | "values": [] 178 | }, 179 | "yaxes": [ 180 | { 181 | "format": "decbytes", 182 | "label": null, 183 | "logBase": 1, 184 | "max": null, 185 | "min": null, 186 | "show": true 187 | }, 188 | { 189 | "format": "short", 190 | "label": null, 191 | "logBase": 1, 192 | "max": null, 193 | "min": null, 194 | "show": true 195 | } 196 | ], 197 | "yaxis": { 198 | "align": false, 199 | "alignLevel": null 200 | } 201 | }, 202 | { 203 | "aliasColors": {}, 204 | "bars": true, 205 | "dashLength": 10, 206 | "dashes": false, 207 | "datasource": "Postgres", 208 | "fill": 1, 209 | "fillGradient": 0, 210 | "gridPos": { 211 | "h": 9, 212 | "w": 8, 213 | "x": 0, 214 | "y": 9 215 | }, 216 | "hiddenSeries": false, 217 | "id": 9, 218 | "legend": { 219 | "avg": false, 220 | "current": false, 221 | "max": false, 222 | "min": false, 223 | "show": true, 224 | "total": false, 225 | "values": false 226 | }, 227 | "lines": false, 228 | "linewidth": 1, 229 | "links": [], 230 | "nullPointMode": "null", 231 | "options": { 232 | "dataLinks": [] 233 | }, 234 | "percentage": false, 235 | 
"pointradius": 5, 236 | "points": false, 237 | "renderer": "flot", 238 | "seriesOverrides": [], 239 | "spaceLength": 10, 240 | "stack": true, 241 | "steppedLine": false, 242 | "targets": [ 243 | { 244 | "alias": "", 245 | "format": "time_series", 246 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n application AS \"metric\",\n avg(value)\nFROM\n app_top, tp\nWHERE\n unit = 'processes' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY application,time\nORDER BY time", 247 | "refId": "A" 248 | } 249 | ], 250 | "thresholds": [], 251 | "timeFrom": null, 252 | "timeRegions": [], 253 | "timeShift": null, 254 | "title": "processes per application", 255 | "tooltip": { 256 | "shared": false, 257 | "sort": 0, 258 | "value_type": "individual" 259 | }, 260 | "type": "graph", 261 | "xaxis": { 262 | "buckets": null, 263 | "mode": "time", 264 | "name": null, 265 | "show": true, 266 | "values": [] 267 | }, 268 | "yaxes": [ 269 | { 270 | "format": "short", 271 | "label": null, 272 | "logBase": 1, 273 | "max": null, 274 | "min": null, 275 | "show": true 276 | }, 277 | { 278 | "format": "short", 279 | "label": null, 280 | "logBase": 1, 281 | "max": null, 282 | "min": null, 283 | "show": true 284 | } 285 | ], 286 | "yaxis": { 287 | "align": false, 288 | "alignLevel": null 289 | } 290 | }, 291 | { 292 | "aliasColors": { 293 | "application_master:main_loop/2": "#eab839", 294 | "dets:open_file_loop2/2": "#0a50a1", 295 | "dist_util:con_loop/2": "#eab839", 296 | "mochiweb_http:request/3": "#e5ac0e", 297 | "prim_inet:accept0/2": "#890f02" 298 | }, 299 | "bars": true, 300 | "dashLength": 10, 301 | "dashes": false, 302 | "datasource": "Postgres", 303 | "fill": 1, 304 | "fillGradient": 0, 305 | "gridPos": { 306 | "h": 9, 307 | "w": 7, 308 | "x": 8, 309 | "y": 9 310 | }, 311 | "hiddenSeries": false, 312 | "id": 2, 313 | "legend": { 314 | "avg": false, 315 | "current": false, 
316 | "max": false, 317 | "min": false, 318 | "show": true, 319 | "total": false, 320 | "values": false 321 | }, 322 | "lines": false, 323 | "linewidth": 1, 324 | "links": [], 325 | "nullPointMode": "null", 326 | "options": { 327 | "dataLinks": [] 328 | }, 329 | "percentage": true, 330 | "pointradius": 5, 331 | "points": false, 332 | "renderer": "flot", 333 | "seriesOverrides": [], 334 | "spaceLength": 10, 335 | "stack": true, 336 | "steppedLine": false, 337 | "targets": [ 338 | { 339 | "alias": "", 340 | "format": "time_series", 341 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n fun AS \"metric\",\n avg(num_processes)\nFROM\n fun_top, tp\nWHERE\n fun_type = 'current_function' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY fun,time\nORDER BY time", 342 | "refId": "A" 343 | } 344 | ], 345 | "thresholds": [], 346 | "timeFrom": null, 347 | "timeRegions": [], 348 | "timeShift": null, 349 | "title": "% of processes executing function", 350 | "tooltip": { 351 | "shared": false, 352 | "sort": 1, 353 | "value_type": "individual" 354 | }, 355 | "type": "graph", 356 | "xaxis": { 357 | "buckets": null, 358 | "mode": "time", 359 | "name": null, 360 | "show": true, 361 | "values": [] 362 | }, 363 | "yaxes": [ 364 | { 365 | "format": "short", 366 | "label": null, 367 | "logBase": 1, 368 | "max": null, 369 | "min": null, 370 | "show": true 371 | }, 372 | { 373 | "format": "short", 374 | "label": null, 375 | "logBase": 1, 376 | "max": null, 377 | "min": null, 378 | "show": true 379 | } 380 | ], 381 | "yaxis": { 382 | "align": false, 383 | "alignLevel": null 384 | } 385 | }, 386 | { 387 | "aliasColors": {}, 388 | "bars": true, 389 | "dashLength": 10, 390 | "dashes": false, 391 | "datasource": "Postgres", 392 | "fill": 1, 393 | "fillGradient": 0, 394 | "gridPos": { 395 | "h": 9, 396 | "w": 9, 397 | "x": 15, 398 | "y": 9 399 | }, 400 | "hiddenSeries": false, 
401 | "id": 4, 402 | "legend": { 403 | "avg": false, 404 | "current": false, 405 | "max": false, 406 | "min": false, 407 | "show": true, 408 | "total": false, 409 | "values": false 410 | }, 411 | "lines": false, 412 | "linewidth": 1, 413 | "links": [], 414 | "nullPointMode": "null", 415 | "options": { 416 | "dataLinks": [] 417 | }, 418 | "percentage": true, 419 | "pointradius": 5, 420 | "points": false, 421 | "renderer": "flot", 422 | "seriesOverrides": [], 423 | "spaceLength": 10, 424 | "stack": true, 425 | "steppedLine": false, 426 | "targets": [ 427 | { 428 | "alias": "", 429 | "format": "time_series", 430 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n fun AS \"metric\",\n avg(num_processes)\nFROM\n fun_top, tp\nWHERE\n fun_type = 'initial_call' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY fun,time\nORDER BY time", 431 | "refId": "A" 432 | } 433 | ], 434 | "thresholds": [], 435 | "timeFrom": null, 436 | "timeRegions": [], 437 | "timeShift": null, 438 | "title": "% of processes started from this function", 439 | "tooltip": { 440 | "shared": true, 441 | "sort": 0, 442 | "value_type": "individual" 443 | }, 444 | "type": "graph", 445 | "xaxis": { 446 | "buckets": null, 447 | "mode": "time", 448 | "name": null, 449 | "show": true, 450 | "values": [] 451 | }, 452 | "yaxes": [ 453 | { 454 | "format": "short", 455 | "label": null, 456 | "logBase": 1, 457 | "max": null, 458 | "min": null, 459 | "show": true 460 | }, 461 | { 462 | "format": "short", 463 | "label": null, 464 | "logBase": 1, 465 | "max": null, 466 | "min": null, 467 | "show": true 468 | } 469 | ], 470 | "yaxis": { 471 | "align": false, 472 | "alignLevel": null 473 | } 474 | } 475 | ], 476 | "refresh": false, 477 | "schemaVersion": 22, 478 | "style": "dark", 479 | "tags": [ 480 | "erlang", 481 | "top" 482 | ], 483 | "templating": { 484 | "list": [ 485 | { 486 | "allValue": null, 
487 | "current": { 488 | "tags": [], 489 | "text": "", 490 | "value": "" 491 | }, 492 | "datasource": "Postgres", 493 | "definition": "SELECT DISTINCT node FROM node ORDER BY node ASC;", 494 | "hide": 0, 495 | "includeAll": false, 496 | "label": null, 497 | "multi": false, 498 | "name": "node", 499 | "options": [ 500 | { 501 | "selected": true, 502 | "text": "", 503 | "value": "" 504 | } 505 | ], 506 | "query": "SELECT DISTINCT node FROM node ORDER BY node ASC;", 507 | "refresh": 1, 508 | "regex": "", 509 | "skipUrlSync": false, 510 | "sort": 0, 511 | "tagValuesQuery": "", 512 | "tags": [], 513 | "tagsQuery": "", 514 | "type": "query", 515 | "useTags": false 516 | } 517 | ] 518 | }, 519 | "time": { 520 | "from": "now-30m", 521 | "to": "now" 522 | }, 523 | "timepicker": { 524 | "refresh_intervals": [ 525 | "5s", 526 | "10s", 527 | "30s", 528 | "1m", 529 | "5m", 530 | "15m", 531 | "30m", 532 | "1h", 533 | "2h", 534 | "1d" 535 | ], 536 | "time_options": [ 537 | "5m", 538 | "15m", 539 | "1h", 540 | "6h", 541 | "12h", 542 | "24h", 543 | "2d", 544 | "7d", 545 | "30d" 546 | ] 547 | }, 548 | "timezone": "", 549 | "title": "Erlang applications top", 550 | "uid": "tw4QVxniz", 551 | "version": 5 552 | } 553 | -------------------------------------------------------------------------------- /docker/grafana/dashboards/file.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'file' 5 | type: file 6 | disableDeletion: false 7 | editable: true 8 | updateIntervalSeconds: 10 9 | allowUiUpdates: true 10 | options: 11 | path: /var/lib/grafana/dashboards 12 | foldersFromFilesStructure: true 13 | -------------------------------------------------------------------------------- /docker/grafana/dashboards/proc_history.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | 
"enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 15, 19 | "iteration": 1596189996375, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "aliasColors": {}, 24 | "bars": false, 25 | "dashLength": 10, 26 | "dashes": false, 27 | "datasource": "Postgres", 28 | "fill": 1, 29 | "fillGradient": 0, 30 | "gridPos": { 31 | "h": 9, 32 | "w": 24, 33 | "x": 0, 34 | "y": 0 35 | }, 36 | "hiddenSeries": false, 37 | "id": 4, 38 | "legend": { 39 | "avg": false, 40 | "current": false, 41 | "max": false, 42 | "min": false, 43 | "show": true, 44 | "total": false, 45 | "values": false 46 | }, 47 | "lines": true, 48 | "linewidth": 0, 49 | "links": [], 50 | "nullPointMode": "null as zero", 51 | "options": { 52 | "dataLinks": [] 53 | }, 54 | "percentage": false, 55 | "pointradius": 0.5, 56 | "points": true, 57 | "renderer": "flot", 58 | "seriesOverrides": [], 59 | "spaceLength": 10, 60 | "stack": false, 61 | "steppedLine": false, 62 | "targets": [ 63 | { 64 | "alias": "", 65 | "format": "time_series", 66 | "rawSql": "SELECT\n $__time(ts),\n dreductions\nFROM\n prc\nWHERE\n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n )\nORDER BY ts\n", 67 | "refId": "A" 68 | } 69 | ], 70 | "thresholds": [], 71 | "timeFrom": null, 72 | "timeRegions": [], 73 | "timeShift": null, 74 | "title": "Reductions", 75 | "tooltip": { 76 | "shared": true, 77 | "sort": 0, 78 | "value_type": "individual" 79 | }, 80 | "type": "graph", 81 | "xaxis": { 82 | "buckets": null, 83 | "mode": "time", 84 | "name": null, 85 | "show": true, 86 | "values": [] 87 | }, 88 | "yaxes": [ 89 | { 90 | "format": 
"short", 91 | "label": null, 92 | "logBase": 1, 93 | "max": null, 94 | "min": null, 95 | "show": true 96 | }, 97 | { 98 | "format": "short", 99 | "label": null, 100 | "logBase": 1, 101 | "max": null, 102 | "min": null, 103 | "show": true 104 | } 105 | ], 106 | "yaxis": { 107 | "align": false, 108 | "alignLevel": null 109 | } 110 | }, 111 | { 112 | "aliasColors": {}, 113 | "bars": false, 114 | "dashLength": 10, 115 | "dashes": false, 116 | "datasource": "Postgres", 117 | "fill": 1, 118 | "fillGradient": 0, 119 | "gridPos": { 120 | "h": 9, 121 | "w": 24, 122 | "x": 0, 123 | "y": 9 124 | }, 125 | "hiddenSeries": false, 126 | "id": 2, 127 | "legend": { 128 | "avg": false, 129 | "current": false, 130 | "max": false, 131 | "min": false, 132 | "show": true, 133 | "total": false, 134 | "values": false 135 | }, 136 | "lines": true, 137 | "linewidth": 0, 138 | "links": [], 139 | "nullPointMode": "null", 140 | "options": { 141 | "dataLinks": [] 142 | }, 143 | "percentage": false, 144 | "pointradius": 0.5, 145 | "points": true, 146 | "renderer": "flot", 147 | "seriesOverrides": [], 148 | "spaceLength": 10, 149 | "stack": true, 150 | "steppedLine": false, 151 | "targets": [ 152 | { 153 | "alias": "", 154 | "format": "time_series", 155 | "rawSql": "SELECT\n $__time(ts),\n stack_size, heap_size, total_heap_size, memory\nFROM\n prc\nWHERE\n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n ) \nORDER BY ts\n", 156 | "refId": "A" 157 | } 158 | ], 159 | "thresholds": [], 160 | "timeFrom": null, 161 | "timeRegions": [], 162 | "timeShift": null, 163 | "title": "Memory", 164 | "tooltip": { 165 | "shared": true, 166 | "sort": 0, 167 | "value_type": "individual" 168 | }, 169 | "type": "graph", 170 | "xaxis": { 171 | "buckets": null, 172 | "mode": 
"time", 173 | "name": null, 174 | "show": true, 175 | "values": [] 176 | }, 177 | "yaxes": [ 178 | { 179 | "format": "decbytes", 180 | "label": null, 181 | "logBase": 1, 182 | "max": null, 183 | "min": null, 184 | "show": true 185 | }, 186 | { 187 | "format": "short", 188 | "label": null, 189 | "logBase": 1, 190 | "max": null, 191 | "min": null, 192 | "show": true 193 | } 194 | ], 195 | "yaxis": { 196 | "align": false, 197 | "alignLevel": null 198 | } 199 | }, 200 | { 201 | "aliasColors": {}, 202 | "bars": false, 203 | "dashLength": 10, 204 | "dashes": false, 205 | "datasource": "Postgres", 206 | "fill": 1, 207 | "fillGradient": 0, 208 | "gridPos": { 209 | "h": 9, 210 | "w": 24, 211 | "x": 0, 212 | "y": 18 213 | }, 214 | "hiddenSeries": false, 215 | "id": 6, 216 | "legend": { 217 | "avg": false, 218 | "current": false, 219 | "max": false, 220 | "min": false, 221 | "show": true, 222 | "total": false, 223 | "values": false 224 | }, 225 | "lines": true, 226 | "linewidth": 0, 227 | "links": [], 228 | "nullPointMode": "null", 229 | "options": { 230 | "dataLinks": [] 231 | }, 232 | "percentage": false, 233 | "pointradius": 0.5, 234 | "points": true, 235 | "renderer": "flot", 236 | "seriesOverrides": [], 237 | "spaceLength": 10, 238 | "stack": false, 239 | "steppedLine": false, 240 | "targets": [ 241 | { 242 | "alias": "", 243 | "format": "time_series", 244 | "rawSql": "SELECT\n $__time(ts), message_queue_len\nFROM prc\nWHERE\n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n )\nORDER BY ts", 245 | "refId": "A" 246 | } 247 | ], 248 | "thresholds": [], 249 | "timeFrom": null, 250 | "timeRegions": [], 251 | "timeShift": null, 252 | "title": "Message queue", 253 | "tooltip": { 254 | "shared": true, 255 | "sort": 0, 256 | "value_type": 
"individual" 257 | }, 258 | "type": "graph", 259 | "xaxis": { 260 | "buckets": null, 261 | "mode": "time", 262 | "name": null, 263 | "show": true, 264 | "values": [] 265 | }, 266 | "yaxes": [ 267 | { 268 | "format": "short", 269 | "label": null, 270 | "logBase": 1, 271 | "max": null, 272 | "min": null, 273 | "show": true 274 | }, 275 | { 276 | "format": "short", 277 | "label": null, 278 | "logBase": 1, 279 | "max": null, 280 | "min": null, 281 | "show": true 282 | } 283 | ], 284 | "yaxis": { 285 | "align": false, 286 | "alignLevel": null 287 | } 288 | }, 289 | { 290 | "columns": [], 291 | "datasource": "Postgres", 292 | "fontSize": "100%", 293 | "gridPos": { 294 | "h": 8, 295 | "w": 24, 296 | "x": 0, 297 | "y": 27 298 | }, 299 | "id": 8, 300 | "links": [], 301 | "options": {}, 302 | "pageSize": null, 303 | "scroll": true, 304 | "showHeader": true, 305 | "sort": { 306 | "col": 0, 307 | "desc": true 308 | }, 309 | "styles": [ 310 | { 311 | "alias": "Time", 312 | "align": "auto", 313 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 314 | "pattern": "Time", 315 | "type": "date" 316 | }, 317 | { 318 | "alias": "Time", 319 | "align": "auto", 320 | "colorMode": null, 321 | "colors": [ 322 | "rgba(245, 54, 54, 0.9)", 323 | "rgba(237, 129, 40, 0.89)", 324 | "rgba(50, 172, 45, 0.97)" 325 | ], 326 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 327 | "decimals": 2, 328 | "pattern": "ts", 329 | "thresholds": [], 330 | "type": "date", 331 | "unit": "short" 332 | }, 333 | { 334 | "alias": "Current function", 335 | "align": "auto", 336 | "colorMode": null, 337 | "colors": [ 338 | "rgba(245, 54, 54, 0.9)", 339 | "rgba(237, 129, 40, 0.89)", 340 | "rgba(50, 172, 45, 0.97)" 341 | ], 342 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 343 | "decimals": 2, 344 | "pattern": "current_function", 345 | "preserveFormat": false, 346 | "thresholds": [], 347 | "type": "string", 348 | "unit": "short" 349 | }, 350 | { 351 | "alias": "", 352 | "align": "auto", 353 | "colorMode": null, 354 | "colors": [ 355 | "rgba(245, 54, 
54, 0.9)", 356 | "rgba(237, 129, 40, 0.89)", 357 | "rgba(50, 172, 45, 0.97)" 358 | ], 359 | "decimals": 2, 360 | "pattern": "/.*/", 361 | "thresholds": [], 362 | "type": "number", 363 | "unit": "short" 364 | } 365 | ], 366 | "targets": [ 367 | { 368 | "alias": "", 369 | "format": "table", 370 | "rawSql": "SELECT ts, registered_name, pid, current_function FROM prc \nWHERE \n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n )\n", 371 | "refId": "A" 372 | } 373 | ], 374 | "title": "Current function", 375 | "transform": "table", 376 | "type": "table" 377 | } 378 | ], 379 | "refresh": false, 380 | "schemaVersion": 22, 381 | "style": "dark", 382 | "tags": [], 383 | "templating": { 384 | "list": [ 385 | { 386 | "allValue": null, 387 | "datasource": "Postgres", 388 | "definition": "", 389 | "hide": 0, 390 | "includeAll": false, 391 | "label": null, 392 | "multi": false, 393 | "name": "node", 394 | "options": [], 395 | "query": "SELECT DISTINCT node FROM node ORDER BY node ASC;", 396 | "refresh": 1, 397 | "regex": "", 398 | "skipUrlSync": false, 399 | "sort": 0, 400 | "tagValuesQuery": "", 401 | "tags": [], 402 | "tagsQuery": "", 403 | "type": "query", 404 | "useTags": false 405 | }, 406 | { 407 | "current": { 408 | "text": "undefined", 409 | "value": "undefined" 410 | }, 411 | "hide": 0, 412 | "label": "Registered name", 413 | "name": "regname", 414 | "options": [ 415 | { 416 | "selected": false, 417 | "text": "", 418 | "value": "" 419 | } 420 | ], 421 | "query": "", 422 | "skipUrlSync": false, 423 | "type": "constant" 424 | }, 425 | { 426 | "current": { 427 | "text": "<17096.28649.6884> ", 428 | "value": "<17096.28649.6884> " 429 | }, 430 | "hide": 0, 431 | "label": null, 432 | "name": "pid", 433 | "options": [ 434 | { 435 | 
"selected": false, 436 | "text": "", 437 | "value": "" 438 | } 439 | ], 440 | "query": "", 441 | "skipUrlSync": false, 442 | "type": "constant" 443 | } 444 | ] 445 | }, 446 | "time": { 447 | "from": "2020-07-31T09:51:33.904Z", 448 | "to": "2020-07-31T10:06:31.905Z" 449 | }, 450 | "timepicker": { 451 | "refresh_intervals": [ 452 | "5s", 453 | "10s", 454 | "30s", 455 | "1m", 456 | "5m", 457 | "15m", 458 | "30m", 459 | "1h", 460 | "2h", 461 | "1d" 462 | ], 463 | "time_options": [ 464 | "5m", 465 | "15m", 466 | "1h", 467 | "6h", 468 | "12h", 469 | "24h", 470 | "2d", 471 | "7d", 472 | "30d" 473 | ] 474 | }, 475 | "timezone": "", 476 | "title": "Process history", 477 | "uid": "P2OSAsRmz", 478 | "version": 5 479 | } 480 | -------------------------------------------------------------------------------- /docker/grafana/dashboards/proc_top.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 3, 19 | "iteration": 1617800341462, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "aliasColors": {}, 24 | "bars": false, 25 | "dashLength": 10, 26 | "dashes": false, 27 | "datasource": "Postgres", 28 | "fieldConfig": { 29 | "defaults": {}, 30 | "overrides": [] 31 | }, 32 | "fill": 1, 33 | "fillGradient": 0, 34 | "gridPos": { 35 | "h": 7, 36 | "w": 20, 37 | "x": 0, 38 | "y": 0 39 | }, 40 | "hiddenSeries": false, 41 | "id": 8, 42 | "legend": { 43 | "avg": false, 44 | "current": false, 45 | "max": false, 46 | "min": false, 47 | "show": true, 48 | "total": false, 49 | "values": false 50 | }, 51 | "lines": true, 52 | "linewidth": 1, 53 | "nullPointMode": "null", 54 | "options": { 55 | "alertThreshold": true 56 | }, 57 | 
"percentage": false, 58 | "pluginVersion": "7.5.3", 59 | "pointradius": 2, 60 | "points": false, 61 | "renderer": "flot", 62 | "seriesOverrides": [], 63 | "spaceLength": 10, 64 | "stack": false, 65 | "steppedLine": false, 66 | "targets": [ 67 | { 68 | "format": "time_series", 69 | "group": [], 70 | "metricColumn": "none", 71 | "queryType": "randomWalk", 72 | "rawQuery": false, 73 | "rawSql": "SELECT\n ts AS \"time\",\n dreductions\nFROM prc\nWHERE\n $__timeFilter(ts)\nORDER BY 1", 74 | "refId": "A", 75 | "select": [ 76 | [ 77 | { 78 | "params": [ 79 | "dreductions" 80 | ], 81 | "type": "column" 82 | } 83 | ] 84 | ], 85 | "table": "prc", 86 | "timeColumn": "ts", 87 | "timeColumnType": "timestamp", 88 | "where": [ 89 | { 90 | "name": "$__timeFilter", 91 | "params": [], 92 | "type": "macro" 93 | } 94 | ] 95 | } 96 | ], 97 | "thresholds": [], 98 | "timeFrom": null, 99 | "timeRegions": [], 100 | "timeShift": null, 101 | "title": "Time series", 102 | "tooltip": { 103 | "shared": true, 104 | "sort": 0, 105 | "value_type": "individual" 106 | }, 107 | "type": "graph", 108 | "xaxis": { 109 | "buckets": null, 110 | "mode": "time", 111 | "name": null, 112 | "show": true, 113 | "values": [] 114 | }, 115 | "yaxes": [ 116 | { 117 | "format": "short", 118 | "label": null, 119 | "logBase": 1, 120 | "max": null, 121 | "min": null, 122 | "show": true 123 | }, 124 | { 125 | "format": "short", 126 | "label": null, 127 | "logBase": 1, 128 | "max": null, 129 | "min": null, 130 | "show": true 131 | } 132 | ], 133 | "yaxis": { 134 | "align": false, 135 | "alignLevel": null 136 | } 137 | }, 138 | { 139 | "columns": [], 140 | "datasource": "Postgres", 141 | "fieldConfig": { 142 | "defaults": {}, 143 | "overrides": [] 144 | }, 145 | "fontSize": "180%", 146 | "gridPos": { 147 | "h": 7, 148 | "w": 4, 149 | "x": 20, 150 | "y": 0 151 | }, 152 | "id": 6, 153 | "links": [], 154 | "pageSize": 3, 155 | "scroll": false, 156 | "showHeader": true, 157 | "sort": { 158 | "col": null, 159 | "desc": false 
160 | }, 161 | "styles": [ 162 | { 163 | "alias": "", 164 | "align": "auto", 165 | "colorMode": null, 166 | "colors": [ 167 | "rgba(245, 54, 54, 0.9)", 168 | "rgba(237, 129, 40, 0.89)", 169 | "rgba(50, 172, 45, 0.97)" 170 | ], 171 | "decimals": 2, 172 | "pattern": "/.*/", 173 | "preserveFormat": true, 174 | "sanitize": true, 175 | "thresholds": [], 176 | "type": "string", 177 | "unit": "short" 178 | } 179 | ], 180 | "targets": [ 181 | { 182 | "alias": "", 183 | "format": "table", 184 | "group": [], 185 | "metricColumn": "none", 186 | "rawQuery": true, 187 | "rawSql": "SELECT data as status \n FROM node_role WHERE node='[[node]]'\n AND ts=(SELECT max(ts) FROM node_role WHERE node='[[node]]' AND ts > $__timeFrom());", 188 | "refId": "A", 189 | "select": [ 190 | [ 191 | { 192 | "params": [ 193 | "value" 194 | ], 195 | "type": "column" 196 | } 197 | ] 198 | ], 199 | "timeColumn": "time", 200 | "where": [ 201 | { 202 | "name": "$__timeFilter", 203 | "params": [], 204 | "type": "macro" 205 | } 206 | ] 207 | } 208 | ], 209 | "transform": "table", 210 | "type": "table-old" 211 | }, 212 | { 213 | "columns": [], 214 | "datasource": "Postgres", 215 | "fieldConfig": { 216 | "defaults": {}, 217 | "overrides": [] 218 | }, 219 | "fontSize": "100%", 220 | "gridPos": { 221 | "h": 24, 222 | "w": 24, 223 | "x": 0, 224 | "y": 7 225 | }, 226 | "hideTimeOverride": true, 227 | "id": 2, 228 | "links": [], 229 | "maxPerRow": 2, 230 | "pageSize": null, 231 | "repeat": "node", 232 | "repeatDirection": "v", 233 | "scopedVars": { 234 | "node": { 235 | "selected": true, 236 | "text": "nonode@nohost", 237 | "value": "nonode@nohost" 238 | } 239 | }, 240 | "scroll": true, 241 | "showHeader": true, 242 | "sort": { 243 | "col": 4, 244 | "desc": true 245 | }, 246 | "styles": [ 247 | { 248 | "$$hashKey": "object:167", 249 | "alias": "", 250 | "align": "auto", 251 | "colorMode": "cell", 252 | "colors": [ 253 | "rgba(50, 172, 45, 0)", 254 | "rgba(237, 129, 40, 0.89)", 255 | "rgba(245, 54, 54, 0.9)" 256 
| ], 257 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 258 | "decimals": 2, 259 | "link": true, 260 | "linkTooltip": "Process history", 261 | "linkUrl": "", 262 | "pattern": "dreductions", 263 | "thresholds": [ 264 | "1000000", 265 | "100000000" 266 | ], 267 | "type": "number", 268 | "unit": "short" 269 | }, 270 | { 271 | "$$hashKey": "object:168", 272 | "alias": "", 273 | "align": "auto", 274 | "colorMode": "cell", 275 | "colors": [ 276 | "rgba(5, 5, 5, 0)", 277 | "rgba(237, 129, 40, 0.89)", 278 | "rgba(245, 54, 54, 0.9)" 279 | ], 280 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 281 | "decimals": 0, 282 | "pattern": "message_queue_len", 283 | "thresholds": [ 284 | "10", 285 | "100" 286 | ], 287 | "type": "number", 288 | "unit": "short" 289 | }, 290 | { 291 | "$$hashKey": "object:169", 292 | "alias": "", 293 | "align": "auto", 294 | "colorMode": "cell", 295 | "colors": [ 296 | "rgba(50, 172, 45, 0.97)", 297 | "rgba(237, 129, 40, 0)", 298 | "rgba(245, 54, 54, 0.9)" 299 | ], 300 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 301 | "decimals": 2, 302 | "pattern": "dmemory", 303 | "thresholds": [ 304 | "-1000000", 305 | "100000000" 306 | ], 307 | "type": "number", 308 | "unit": "Bps" 309 | }, 310 | { 311 | "$$hashKey": "object:170", 312 | "alias": "", 313 | "align": "auto", 314 | "colorMode": null, 315 | "colors": [ 316 | "rgba(245, 54, 54, 0.9)", 317 | "rgba(237, 129, 40, 0.89)", 318 | "rgba(50, 172, 45, 0.97)" 319 | ], 320 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 321 | "decimals": 2, 322 | "link": true, 323 | "linkTargetBlank": true, 324 | "linkTooltip": "history", 325 | "linkUrl": "/d/P2OSAsRmz/process-history?orgId=1&var-node=[[node]]&var-pid=${__cell:raw}&var-regname=undefined&from=$__from&to=$__to", 326 | "pattern": "pid", 327 | "preserveFormat": false, 328 | "sanitize": false, 329 | "thresholds": [], 330 | "type": "string", 331 | "unit": "short" 332 | }, 333 | { 334 | "$$hashKey": "object:171", 335 | "alias": "Current function ", 336 | "align": "auto", 337 | "colorMode": null, 338 | 
"colors": [ 339 | "rgba(245, 54, 54, 0.9)", 340 | "rgba(237, 129, 40, 0.89)", 341 | "rgba(50, 172, 45, 0.97)" 342 | ], 343 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 344 | "decimals": 2, 345 | "pattern": "current_function", 346 | "preserveFormat": true, 347 | "thresholds": [], 348 | "type": "string", 349 | "unit": "short" 350 | }, 351 | { 352 | "$$hashKey": "object:172", 353 | "alias": "Initial call ", 354 | "align": "auto", 355 | "colorMode": null, 356 | "colors": [ 357 | "rgba(245, 54, 54, 0.9)", 358 | "rgba(237, 129, 40, 0.89)", 359 | "rgba(50, 172, 45, 0.97)" 360 | ], 361 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 362 | "decimals": 2, 363 | "pattern": "initial_call", 364 | "preserveFormat": true, 365 | "thresholds": [], 366 | "type": "string", 367 | "unit": "short" 368 | }, 369 | { 370 | "$$hashKey": "object:173", 371 | "alias": "", 372 | "align": "auto", 373 | "colorMode": "cell", 374 | "colors": [ 375 | "rgba(50, 172, 45, 0)", 376 | "rgba(237, 129, 40, 0.89)", 377 | "rgba(245, 54, 54, 0.9)" 378 | ], 379 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 380 | "decimals": 2, 381 | "pattern": "memory", 382 | "thresholds": [ 383 | "100000000", 384 | "10000000000" 385 | ], 386 | "type": "number", 387 | "unit": "decbytes" 388 | }, 389 | { 390 | "$$hashKey": "object:174", 391 | "alias": "Registered name ", 392 | "align": "auto", 393 | "colorMode": null, 394 | "colors": [ 395 | "rgba(245, 54, 54, 0.9)", 396 | "rgba(237, 129, 40, 0.89)", 397 | "rgba(50, 172, 45, 0.97)" 398 | ], 399 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 400 | "decimals": 2, 401 | "pattern": "registered_name", 402 | "thresholds": [], 403 | "type": "string", 404 | "unit": "short" 405 | }, 406 | { 407 | "$$hashKey": "object:175", 408 | "alias": "", 409 | "align": "auto", 410 | "colorMode": null, 411 | "colors": [ 412 | "rgba(245, 54, 54, 0.9)", 413 | "rgba(237, 129, 40, 0.89)", 414 | "rgba(50, 172, 45, 0.97)" 415 | ], 416 | "decimals": 2, 417 | "pattern": "/.*/", 418 | "thresholds": [], 419 | "type": "number", 420 | 
"unit": "short" 421 | } 422 | ], 423 | "targets": [ 424 | { 425 | "alias": "", 426 | "format": "table", 427 | "group": [], 428 | "metricColumn": "none", 429 | "rawQuery": true, 430 | "rawSql": "SELECT pid, registered_name, reductions, dreductions, message_queue_len, memory, dmemory, stack_size, heap_size, total_heap_size, initial_call, current_function\nFROM prc\nWHERE node='[[node]]'\nAND ts=(SELECT max(ts) FROM node_role WHERE node='[[node]]' AND ts > $__timeFrom() AND ts < $__timeTo());", 431 | "refId": "A", 432 | "select": [ 433 | [ 434 | { 435 | "params": [ 436 | "value" 437 | ], 438 | "type": "column" 439 | } 440 | ] 441 | ], 442 | "timeColumn": "time", 443 | "where": [ 444 | { 445 | "name": "$__timeFilter", 446 | "params": [], 447 | "type": "macro" 448 | } 449 | ] 450 | } 451 | ], 452 | "timeFrom": "10s", 453 | "timeShift": "1s", 454 | "title": "Top [[node]]", 455 | "transform": "table", 456 | "transparent": true, 457 | "type": "table-old" 458 | } 459 | ], 460 | "refresh": "5s", 461 | "schemaVersion": 27, 462 | "style": "dark", 463 | "tags": [ 464 | "erlang", 465 | "top" 466 | ], 467 | "templating": { 468 | "list": [ 469 | { 470 | "allValue": null, 471 | "current": { 472 | "selected": false, 473 | "text": "nonode@nohost", 474 | "value": "nonode@nohost" 475 | }, 476 | "datasource": "Postgres", 477 | "definition": "SELECT DISTINCT node FROM node ORDER BY node ASC;", 478 | "description": null, 479 | "error": null, 480 | "hide": 0, 481 | "includeAll": false, 482 | "label": null, 483 | "multi": false, 484 | "name": "node", 485 | "options": [], 486 | "query": "SELECT DISTINCT node FROM node ORDER BY node ASC;", 487 | "refresh": 1, 488 | "regex": "", 489 | "skipUrlSync": false, 490 | "sort": 0, 491 | "tagValuesQuery": "", 492 | "tags": [], 493 | "tagsQuery": "", 494 | "type": "query", 495 | "useTags": false 496 | } 497 | ] 498 | }, 499 | "time": { 500 | "from": "now-15m", 501 | "to": "now-2s" 502 | }, 503 | "timepicker": { 504 | "nowDelay": "2s", 505 | 
"refresh_intervals": [ 506 | "1s", 507 | "2s", 508 | "5s", 509 | "10s", 510 | "" 511 | ], 512 | "time_options": [ 513 | "5m", 514 | "15m", 515 | "1h", 516 | "6h", 517 | "12h", 518 | "24h", 519 | "2d", 520 | "7d", 521 | "30d" 522 | ] 523 | }, 524 | "timezone": "", 525 | "title": "Erlang top", 526 | "uid": "V5HktsRik", 527 | "version": 2 528 | } 529 | -------------------------------------------------------------------------------- /docker/grafana/datasources/postgres.yml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | datasources: 5 | - name: Postgres 6 | type: postgres 7 | url: db:5432 8 | database: system_monitor 9 | user: grafana 10 | secureJsonData: 11 | password: "system_monitor_password" 12 | jsonData: 13 | sslmode: "disable" 14 | timescaledb: false 15 | -------------------------------------------------------------------------------- /include/system_monitor.hrl: -------------------------------------------------------------------------------- 1 | %%-------------------------------------------------------------------------------- 2 | %% Copyright 2020 Klarna Bank AB 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %%-------------------------------------------------------------------------------- 16 | -ifndef(SYSTEM_MONITOR_HRL). 17 | -define(SYSTEM_MONITOR_HRL, true). 18 | 19 | -define(APP, system_monitor). 
20 | 21 | -type function_top() :: 22 | #{ initial_call => [{mfa(), number()}] 23 | , current_function => [{mfa(), number()}] 24 | }. 25 | 26 | -record(pid_info, 27 | { pid :: pid() 28 | , initial_call :: mfa() | undefined 29 | , registered_name :: atom() | [] 30 | , current_function :: mfa() | undefined 31 | , reductions :: integer() 32 | , dreductions :: number() | undefined 33 | , memory :: integer() 34 | , dmemory :: number() | undefined 35 | , message_queue_len :: integer() 36 | , group_leader :: pid() 37 | }). 38 | 39 | -record(erl_top, 40 | { node :: node() 41 | , ts :: integer() 42 | , pid :: string() 43 | , dreductions :: integer() 44 | , dmemory :: integer() 45 | , reductions :: integer() 46 | , memory :: integer() %% bytes 47 | , message_queue_len :: integer() 48 | , current_function :: mfa() 49 | , initial_call :: mfa() 50 | , registered_name :: atom() | [] 51 | , stack_size :: integer() 52 | , heap_size :: integer() %% words 53 | , total_heap_size :: integer() %% words 54 | , current_stacktrace :: list() 55 | , group_leader :: list() 56 | }). 57 | 58 | -record(app_top, 59 | { app :: atom() 60 | , red_abs :: integer() 61 | , red_rel :: float() 62 | , memory :: integer() 63 | , processes :: integer() 64 | }). 65 | 66 | -endif. 67 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | %% -*- mode:erlang -*- 2 | {erl_opts, 3 | [debug_info, warnings_as_errors]}. 4 | 5 | {deps, 6 | [ {supervisor3, "1.1.9"} 7 | , {epgsql, "4.7.0"} 8 | ]}. 9 | 10 | {dialyzer, [{warnings, [unknown]}]}. 11 | 12 | {profiles, 13 | [ {test, [ {deps, [{proper, "1.2.0"}]} 14 | , {cover_enabled, true} 15 | ]} 16 | , {dev, 17 | [{plugins, [rebar3_hex]}]} 18 | ]}. 19 | 20 | {xref_checks, 21 | [ undefined_function_calls 22 | , deprecated_function_calls 23 | ] 24 | }. 
25 | -------------------------------------------------------------------------------- /rebar.lock: -------------------------------------------------------------------------------- 1 | {"1.2.0", 2 | [{<<"epgsql">>,{pkg,<<"epgsql">>,<<"4.7.0">>},0}, 3 | {<<"supervisor3">>,{pkg,<<"supervisor3">>,<<"1.1.9">>},0}]}. 4 | [ 5 | {pkg_hash,[ 6 | {<<"epgsql">>, <<"98361A63E49AE14DF57CBDA8495058D42ABD3A316F822D1F990A40259026FE5E">>}, 7 | {<<"supervisor3">>, <<"F1A3CC12FB6197526F548E79C9FE2B4AF0C74EFB8A687917B3B1EBE5E9C9368D">>}]}, 8 | {pkg_hash_ext,[ 9 | {<<"epgsql">>, <<"90B0145D302AB133D957EA46A884E6E37E847E6E47DEAF93104314D2AD8CB5BB">>}, 10 | {<<"supervisor3">>, <<"71B177C08F8CAB9EC8ECB81C1AA28A23BBC24AAC4B468C2DB69840229D78D5C4">>}]} 11 | ]. 12 | -------------------------------------------------------------------------------- /src/system_monitor.app.src: -------------------------------------------------------------------------------- 1 | %% -*- mode: erlang -*- 2 | %%-------------------------------------------------------------------------------- 3 | %% Copyright 2020 Klarna Bank AB 4 | %% 5 | %% Licensed under the Apache License, Version 2.0 (the "License"); 6 | %% you may not use this file except in compliance with the License. 7 | %% You may obtain a copy of the License at 8 | %% 9 | %% http://www.apache.org/licenses/LICENSE-2.0 10 | %% 11 | %% Unless required by applicable law or agreed to in writing, software 12 | %% distributed under the License is distributed on an "AS IS" BASIS, 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | %% See the License for the specific language governing permissions and 15 | %% limitations under the License. 16 | %%-------------------------------------------------------------------------------- 17 | {application, system_monitor, 18 | [ {description, "Monitoring app that exports Erlang VM introspection data to any backend. 
" 19 | "Defaults to Postgres"} 20 | , {licenses, ["Apache 2.0"]} 21 | , {vsn, "git"} 22 | , {registered, []} 23 | , {modules, []} 24 | , {mod, {system_monitor_app, []}} 25 | , {applications, [kernel, stdlib, supervisor3, epgsql]} 26 | , {env, 27 | [ %% Specifies how many topmost processes should be reported per 28 | %% category (such as `top_memory', `top_reductions', etc.) 29 | {top_num_items, 10} 30 | %% Specifies how often process top should be collected (in ms): 31 | , {top_sample_interval, 2000} 32 | %% Specifies sample size for the approximate metrics, such as 33 | %% 'percentage of processes started by an app', and 'percentage 34 | %% of processes running a function': 35 | , {top_sample_size, 3000} 36 | %% Stop reporting exact process data when the number of 37 | %% processes is above this threshold, in order to avoid 38 | %% hammering the VM with introspection BIFs (this doesn't affect 39 | %% approximate monitors that rely on sampling): 40 | , {top_max_procs, 15000} 41 | %% By default system_monitor tries to collect process dictionary 42 | %% of the topmost processes in order to determine start function 43 | %% more precisely. In theory this can cause problems if process 44 | %% dictionary contains very large amounts of data. This option 45 | %% allows to fallback to safety: 46 | , {collect_process_dictionary, true} 47 | %% Don't report values to `app_top' and `fun_top' below the 48 | %% threshold as insignificant: 49 | , {top_significance_threshold, 50 | #{ current_function => 0.01 % 1 percent 51 | , initial_call => 0.01 % 1 percent 52 | , processes => 100 % number of processes 53 | , reductions => 0.01 % 1 percent 54 | , memory => 10000 % words 55 | }} 56 | 57 | %% Specify node-specific healthcheck function as `{module(), 58 | %% function()}', for example: `{my_app, node_status}'. 
This 59 | %% function should return an HTML-formatted status report: 60 | , {node_status_fun, undefined} 61 | %% List of additional status check functions: 62 | , {status_checks, []} 63 | 64 | %% BEAM event settings: 65 | , {beam_events, 66 | [ busy_port 67 | , busy_dist_port 68 | , {long_gc, 500} 69 | , {long_schedule, 500} 70 | ]} 71 | , {suspect_procs_max_memory, 524288000} %% 500 MB 72 | , {suspect_procs_max_message_queue_len, 5000} 73 | , {suspect_procs_max_total_heap_size, 524288000} %% 500 MB 74 | 75 | %% Don't query memory if message_queue_len is longer than this: 76 | , {mql_limit_for_memory, 100000} 77 | ]} 78 | ]}. 79 | -------------------------------------------------------------------------------- /src/system_monitor.erl: -------------------------------------------------------------------------------- 1 | %% -*- mode: erlang -*- 2 | %%-------------------------------------------------------------------------------- 3 | %% Copyright 2021 Klarna Bank AB 4 | %% 5 | %% Licensed under the Apache License, Version 2.0 (the "License"); 6 | %% you may not use this file except in compliance with the License. 7 | %% You may obtain a copy of the License at 8 | %% 9 | %% http://www.apache.org/licenses/LICENSE-2.0 10 | %% 11 | %% Unless required by applicable law or agreed to in writing, software 12 | %% distributed under the License is distributed on an "AS IS" BASIS, 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | %% See the License for the specific language governing permissions and 15 | %% limitations under the License. 16 | %%-------------------------------------------------------------------------------- 17 | %% @private 18 | -module(system_monitor). 19 | 20 | -behaviour(gen_server). 
21 | 22 | %%-------------------------------------------------------------------- 23 | %% Include files 24 | %%-------------------------------------------------------------------- 25 | 26 | -include_lib("system_monitor/include/system_monitor.hrl"). 27 | -include_lib("kernel/include/logger.hrl"). 28 | 29 | %% API 30 | -export([start_link/0]). 31 | 32 | -export([reset/0]). 33 | 34 | -export([ report_full_status/0 35 | , check_process_count/0 36 | , suspect_procs/0 37 | , erl_top_to_str/1 38 | , start_top/0 39 | , stop_top/0 40 | , fmt_mfa/1 41 | , fmt_stack/1 42 | , node_name/0 43 | ]). 44 | 45 | %% gen_server callbacks 46 | -export([ init/1 47 | , handle_call/3 48 | , handle_cast/2 49 | , handle_info/2 50 | , terminate/2 51 | ]). 52 | 53 | -define(SERVER, ?MODULE). 54 | -define(TICK_INTERVAL, 1000). 55 | 56 | -record(state, { monitors = [] 57 | , timer_ref 58 | }). 59 | 60 | %%==================================================================== 61 | %% API 62 | %%==================================================================== 63 | %%-------------------------------------------------------------------- 64 | %% @doc Starts the server 65 | %%-------------------------------------------------------------------- 66 | -spec start_link() -> {ok, pid()} | ignore | {error, term()}. 67 | start_link() -> gen_server:start_link({local, ?SERVER}, ?MODULE, [], []). 68 | 69 | %%-------------------------------------------------------------------- 70 | %% @doc Start printing erlang top to console 71 | %%-------------------------------------------------------------------- 72 | -spec start_top() -> ok. 73 | start_top() -> 74 | application:set_env(?APP, top_printing, group_leader()). 75 | 76 | %%-------------------------------------------------------------------- 77 | %% @doc Stop printing erlang top to console 78 | %%-------------------------------------------------------------------- 79 | -spec stop_top() -> ok. 80 | stop_top() -> 81 | application:set_env(?APP, top_printing, false). 
%%--------------------------------------------------------------------
%% @doc Reset monitors: asynchronously ask the server to rebuild its
%% monitor list, which also restarts every tick countdown.
%%--------------------------------------------------------------------
-spec reset() -> ok.
reset() ->
  gen_server:cast(?SERVER, reset).

%%====================================================================
%% gen_server callbacks
%%====================================================================

%% Start the periodic tick and load the initial monitor list. The tick
%% message carries our own pid so that handle_info/2 can tell it apart
%% from look-alike messages.
init([]) ->
  {ok, Timer} = timer:send_interval(?TICK_INTERVAL, {self(), tick}),
  {ok, #state{monitors = init_monitors(), timer_ref = Timer}}.

%% No synchronous API is supported.
handle_call(_Request, _From, State) ->
  {reply, {error, unknown_call}, State}.

%% `reset' re-reads the monitor configuration; anything else is ignored.
handle_cast(reset, State) ->
  {noreply, State#state{monitors = init_monitors()}};
handle_cast(_Msg, State) ->
  {noreply, State}.

%% On every tick each monitor's countdown is decremented. A monitor
%% whose countdown reaches zero is executed and its countdown is
%% restored to the configured interval.
handle_info({Self, tick}, State) when Self =:= self() ->
  Monitors = [case Ticks - 1 of
                0 ->
                  run_monitor(Module, Function),
                  {Module, Function, RunOnTerminate, TicksReset, TicksReset};
                TicksDecremented ->
                  {Module, Function, RunOnTerminate, TicksReset, TicksDecremented}
              end || {Module, Function,
                      RunOnTerminate, TicksReset, Ticks} <- State#state.monitors],
  {noreply, State#state{monitors = Monitors}};
handle_info(_Info, State) ->
  {noreply, State}.

%% Run, one last time, every monitor flagged with RunOnTerminate.
%%
%% The monitor entries are 5-tuples
%% {Module, Function, RunOnTerminate, TicksReset, Ticks}; the generator
%% pattern must have that exact shape, because a list comprehension
%% silently skips elements that do not match.
%%
%% NOTE(review): init/1 does not enable trap_exit, so this callback is
%% presumably not reached on a plain supervisor shutdown -- confirm
%% whether that is intended.
-spec terminate(term(), #state{}) -> any().
terminate(_Reason, State) ->
  %% Possibly, one last check.
  [run_monitor(Module, Function) ||
    {Module, Function, true, _TicksReset, _Ticks} <- State#state.monitors].

%% Execute a single monitor function. A crashing monitor is logged and
%% otherwise ignored, so that one faulty check can neither take the
%% server down nor prevent the remaining monitors from running.
-spec run_monitor(module(), atom()) -> any().
run_monitor(Module, Function) ->
  try
    apply(Module, Function, [])
  catch
    EC:Error:Stack ->
      error_logger:warning_msg(
        "system_monitor ~p crashed:~n~p:~p~nStacktrace: ~p~n",
        [{Module, Function}, EC, Error, Stack])
  end.
%%==============================================================================
%% Internal functions
%%==============================================================================

%%------------------------------------------------------------------------------
%% @doc Returns the list of initiated monitors: every configured monitor
%%      extended with a countdown that starts at its tick interval, i.e.
%%      {Module, Function, RunOnTerminate, TicksReset, TicksLeft}.
%%------------------------------------------------------------------------------
-spec init_monitors() -> [{module(), atom(), boolean(), pos_integer(), pos_integer()}].
init_monitors() ->
  [{Module, Function, RunOnTerminate, Ticks, Ticks}
   || {Module, Function, RunOnTerminate, Ticks} <- monitors()].

%%------------------------------------------------------------------------------
%% @doc Returns the list of monitors. The format is
%%      {Module, Function, RunMonitorAtTerminate, NumberOfTicks}.
%%      Module:Function/0 is the check to execute.
%%      RunMonitorAtTerminate determines whether the monitor is to be run in
%%      the terminate gen_server callback.
%%      NumberOfTicks is the number of ticks between invocations of
%%      the monitor in question. So, if NumberOfTicks is 3600, the monitor is
%%      to be run once every hour, as there is a tick every second.
%%
%%      The built-in checks are extended with `report_full_status' when a
%%      callback module is configured, and with the user-supplied
%%      `status_checks' from the application environment.
%%------------------------------------------------------------------------------
-spec monitors() -> [{module(), atom(), boolean(), pos_integer()}].
monitors() ->
  {ok, AdditionalMonitors} = application:get_env(?APP, status_checks),
  MaybeReportFullStatusMonitor =
    case system_monitor_callback:is_configured() of
      true ->
        {ok, TopInterval} = application:get_env(?APP, top_sample_interval),
        %% The interval is configured in milliseconds while ticks are one
        %% second apart. Clamp to at least one tick: a countdown that
        %% starts at 0 is only ever decremented further and never hits 0
        %% again, so the monitor would silently never run.
        [{?MODULE, report_full_status, false, max(1, TopInterval div 1000)}];
      false ->
        []
    end,
  [{?MODULE, check_process_count, true, 2},
   {?MODULE, suspect_procs, true, 5}]
  ++ MaybeReportFullStatusMonitor
  ++ AdditionalMonitors.
%%------------------------------------------------------------------------------
%% Monitor for number of processes
%%------------------------------------------------------------------------------

%%------------------------------------------------------------------------------
%% @doc Compare the current number of processes with the `top_max_procs'
%%      setting and log a warning carrying the count when it is exceeded.
%%------------------------------------------------------------------------------
-spec check_process_count() -> ok.
check_process_count() ->
  {ok, Limit} = application:get_env(?APP, top_max_procs),
  ProcessCount = erlang:system_info(process_count),
  if
    ProcessCount > Limit ->
      ?LOG_WARNING( "Abnormal process count (~p).~n"
                  , [ProcessCount]
                  , #{domain => [system_monitor]}
                  );
    true ->
      ok
  end.


%%------------------------------------------------------------------------------
%% Monitor for processes with suspect stats
%%------------------------------------------------------------------------------

%% Fetch the current process top, keep the entries that reach one of the
%% configured `suspect_procs_max_*' limits and log each of them. A limit
%% that is not configured is passed on as `undefined'.
suspect_procs() ->
  {_TS, ProcTop} = system_monitor_top:get_proc_top(),
  Limits =
    { application:get_env(?APP, suspect_procs_max_memory, undefined)
    , application:get_env(?APP, suspect_procs_max_message_queue_len, undefined)
    , application:get_env(?APP, suspect_procs_max_total_heap_size, undefined)
    },
  Suspects = [Proc || Proc <- ProcTop, is_suspect_proc(Proc, Limits)],
  lists:foreach(fun log_suspect_proc/1, Suspects).
%% True when at least one of memory, message queue length or total heap
%% size of the process is greater than or equal to its limit. A limit of
%% `undefined' disables the corresponding comparison.
is_suspect_proc(Proc, {MaxMemory, MaxMqLen, MaxTotalHeapSize}) ->
  #erl_top{ memory            = Memory
          , message_queue_len = MqLen
          , total_heap_size   = TotalHeapSize
          } = Proc,
  ReachesLimit =
    fun({Limit, Value}) ->
        Limit =/= undefined andalso Value >= Limit
    end,
  lists:any(ReachesLimit,
            [ {MaxMemory, Memory}
            , {MaxMqLen, MqLen}
            , {MaxTotalHeapSize, TotalHeapSize}
            ]).

%% Log the textual summary of one suspect process as a warning. The
%% summary is rendered before the logging macro is entered.
log_suspect_proc(Proc) ->
  Summary = erl_top_to_str(Proc),
  ?LOG_WARNING("Suspect Proc~n~s", [Summary], #{domain => [system_monitor]}).

%%------------------------------------------------------------------------------
%% @doc Report top processes, followed by the application/function tops
%%      and finally the node status.
%%------------------------------------------------------------------------------
-spec report_full_status() -> ok.
report_full_status() ->
  %% The same `TS' is handed to every report of this time interval, so
  %% that it can serve as the key for looking up the related events.
  {TS, ProcTop} = system_monitor_top:get_proc_top(),
  system_monitor_callback:produce(proc_top, ProcTop),
  report_app_top(TS),
  %% The node status report is produced last: it "seals" the report for
  %% this time interval.
  NodeReport = node_status_report(),
  system_monitor_callback:produce(
    node_role, [{node_role, node_name(), TS, iolist_to_binary(NodeReport)}]).

%% Obtain the node status from the configured `node_status_fun'. An
%% empty binary is used when no `{Module, Function}' pair is configured
%% or when the configured function raises.
node_status_report() ->
  case application:get_env(?APP, node_status_fun) of
    {ok, {Module, Function}} ->
      try
        Module:Function()
      catch
        _:_ -> <<>>
      end;
    _ ->
      <<>>
  end.

%%------------------------------------------------------------------------------
%% @doc Calculate reductions per application.
%%------------------------------------------------------------------------------
-spec report_app_top(integer()) -> ok.
report_app_top(TS) ->
  %% Per-application figures; each one is collected right before it is
  %% handed over.
  present_results(app_top, reductions, system_monitor_top:get_abs_app_top(), TS),
  present_results(app_top, memory, system_monitor_top:get_app_memory(), TS),
  present_results(app_top, processes, system_monitor_top:get_app_processes(), TS),
  %% Per-function figures; both keys must be present in the result.
  FunctionTop = system_monitor_top:get_function_top(),
  #{ current_function := ByCurrentFunction
   , initial_call     := ByInitialCall
   } = FunctionTop,
  present_results(fun_top, current_function, ByCurrentFunction, TS),
  present_results(fun_top, initial_call, ByInitialCall, TS),
  ok.

%%--------------------------------------------------------------------
%% @doc Push app_top or fun_top information to the configured
%% destination. Only `{Key, Value}' entries whose value is above the
%% significance threshold configured for `Tag' (0 when none is
%% configured) are forwarded.
%%--------------------------------------------------------------------
present_results(Record, Tag, Values, TS) ->
  {ok, Thresholds} = application:get_env(?APP, top_significance_threshold),
  Threshold = maps:get(Tag, Thresholds, 0),
  Significant = [{Record, node_name(), TS, Key, Tag, Val}
                 || {Key, Val} <- Values, Val > Threshold],
  system_monitor_callback:produce(Record, Significant).

%% Node name used in reports: the `node_name' application setting when
%% present, the Erlang node name otherwise.
node_name() ->
  application:get_env(?APP, node_name, node()).
%%--------------------------------------------------------------------
%% @doc Render "the interesting parts" of an erl_top record as text.
%% Heap sizes are multiplied by the emulator word size before being
%% given a human-readable unit.
%%--------------------------------------------------------------------
erl_top_to_str(Proc) ->
  #erl_top{ registered_name    = Name
          , pid                = Pid
          , initial_call       = InitialCall
          , memory             = Memory
          , message_queue_len  = MqLen
          , stack_size         = StackSize
          , heap_size          = HeapSize
          , total_heap_size    = TotalHeapSize
          , current_function   = CurrentFunction
          , current_stacktrace = Stacktrace
          } = Proc,
  WordSize = erlang:system_info(wordsize),
  io_lib:format(
    "registered_name=~p~n"
    "offending_pid=~s~n"
    "initial_call=~s~n"
    "memory=~p (~s)~n"
    "message_queue_len=~p~n"
    "stack_size=~p~n"
    "heap_size=~p (~s)~n"
    "total_heap_size=~p (~s)~n"
    "current_function=~s~n"
    "current_stack:~n~s",
    [ Name
    , Pid
    , fmt_mfa(InitialCall)
    , Memory, fmt_mem(Memory)
    , MqLen
    , StackSize
    , HeapSize, fmt_mem(WordSize * HeapSize)
    , TotalHeapSize, fmt_mem(WordSize * TotalHeapSize)
    , fmt_mfa(CurrentFunction)
    , fmt_stack(Stacktrace)
    ]).

%% Format an amount of bytes with one decimal, using the first unit for
%% which the amount does not exceed 1024 of that unit; "TB" is the
%% fallback for anything larger.
fmt_mem(Mem) ->
  K = 1024,
  Units = [{1, "Bytes"}, {K, "KB"}, {K * K, "MB"}, {K * K * K, "GB"}],
  Fits = fun({Size, _Label}) -> Mem =< Size * K end,
  {Divisor, Label} = find_first(Fits, Units, {K * K * K * K, "TB"}),
  io_lib:format("~.1f ~s", [Mem / Divisor, Label]).

%% Format a stack trace, one frame per line.
fmt_stack(CurrentStack) ->
  [[fmt_mfa(Frame), "\n"] || Frame <- CurrentStack].
336 | 337 | fmt_mfa({Mod, Fun, Arity, Prop}) -> 338 | case proplists:get_value(line, Prop, undefined) of 339 | undefined -> 340 | fmt_mfa({Mod, Fun, Arity}); 341 | Line -> 342 | io_lib:format("~s:~s/~p (Line ~p)", [Mod, Fun, Arity, Line]) 343 | end; 344 | fmt_mfa({Mod, Fun, Arity}) -> 345 | io_lib:format("~s:~s/~p", [Mod, Fun, Arity]); 346 | fmt_mfa(L) -> 347 | io_lib:format("~p", [L]). 348 | 349 | -spec find_first(fun((any()) -> boolean()), [T], Default) -> T | Default. 350 | find_first(Pred, List, Default) -> 351 | case lists:search(Pred, List) of 352 | {value, Elem} -> Elem; 353 | false -> Default 354 | end. 355 | -------------------------------------------------------------------------------- /src/system_monitor_app.erl: -------------------------------------------------------------------------------- 1 | %%-------------------------------------------------------------------------------- 2 | %% Copyright 2020 Klarna Bank AB 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %%-------------------------------------------------------------------------------- 16 | -module(system_monitor_app). 17 | 18 | -behaviour(application). 19 | 20 | -export([start/2, stop/1]). 21 | 22 | start(_Type, _StartArgs) -> 23 | system_monitor_sup:start_link(). 24 | 25 | stop(_State) -> 26 | ok. 
27 | -------------------------------------------------------------------------------- /src/system_monitor_callback.erl: -------------------------------------------------------------------------------- 1 | %%-------------------------------------------------------------------------------- 2 | %% Copyright 2021 Klarna Bank AB 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %%-------------------------------------------------------------------------------- 16 | 17 | -module(system_monitor_callback). 18 | 19 | -export([ produce/2 20 | , is_configured/0 21 | , get_callback_mod/0 22 | ]). 23 | 24 | -include_lib("system_monitor/include/system_monitor.hrl"). 25 | 26 | -callback produce(atom(), list()) -> ok. 27 | 28 | produce(Type, Events) -> 29 | (get_callback_mod()):?FUNCTION_NAME(Type, Events). 30 | 31 | -compile({inline, [get_callback_mod/0]}). 32 | get_callback_mod() -> 33 | application:get_env(?APP, callback_mod, undefined). 34 | 35 | is_configured() -> 36 | get_callback_mod() =/= undefined. 

--------------------------------------------------------------------------------
/src/system_monitor_events.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
%%% @doc
%%% Print BEAM VM events to the logs
%%%
%%% @end
-module(system_monitor_events).

-behaviour(gen_server).

-include("system_monitor.hrl").
-include_lib("kernel/include/logger.hrl").

-export([start_link/0]).

%% gen_server callbacks
-export([ init/1
        , handle_call/3
        , handle_cast/2
        , handle_info/2
        , terminate/2
        ]).

%%--------------------------------------------------------------------
%% @doc
%% Starts the server
%% @end
%%--------------------------------------------------------------------
-spec start_link() -> {ok, pid()}.
start_link() ->
  gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

%%====================================================================
%% gen_server callbacks
%%====================================================================

%% Subscribe this process to the BEAM system monitor; the server keeps
%% no state of its own.
init([]) ->
  setup_system_monitor(),
  {ok, {}}.

handle_call(_Request, _From, State) ->
  {reply, {error, unknown_call}, State}.

handle_cast(_Msg, State) ->
  {noreply, State}.

%% Log every `{monitor, ...}' message delivered by the runtime and hand
%% it to the optional external monitoring module.
handle_info({monitor, PidOrPort, EventKind, Info}, State) ->
  ReferenceData = data_for_reference(PidOrPort),
  InfoTxt = format_system_event_info(Info),
  ?LOG_INFO(
     "sysmon type=~p reference=~p~n~s~n~s"
    , [EventKind, PidOrPort, InfoTxt, ReferenceData]
    , #{domain => [system_monitor]}
    ),
  notify_external_monitoring(EventKind, Info),
  {noreply, State};
handle_info(_Info, State) ->
  {noreply, State}.

terminate(_Reason, _State) ->
  ok.

%%==============================================================================
%% Internal functions
%%==============================================================================

%%--------------------------------------------------------------------
%% @doc: Set the current process as the receiver of the BEAM system
%% events
%%--------------------------------------------------------------------
-spec setup_system_monitor() -> ok.
setup_system_monitor() ->
  {ok, Opts} = application:get_env(?APP, beam_events),
  erlang:system_monitor(self(), Opts),
  ok.

%% Call `Mod:system_monitor_event/2' when the `external_monitoring'
%% application environment key names a module; otherwise do nothing.
notify_external_monitoring(EventKind, Info) ->
  case application:get_env(?APP, external_monitoring) of
    undefined -> ok;
    {ok, Mod} -> Mod:system_monitor_event(EventKind, Info)
  end.

%% Extra text for the log line: the process-top entry for a pid when
%% the top collector has one, an empty string for anything else.
data_for_reference(Pid) when is_pid(Pid) ->
  case system_monitor_top:get_proc_top(Pid) of
    false -> "Proc not in top";
    Top   -> system_monitor:erl_top_to_str(Top)
  end;
data_for_reference(_Port) ->
  "".

%% Format the event payload as "key=value " / "value " chunks. Note
%% that for list payloads the chunks come out in reverse input order.
-spec format_system_event_info(term()) -> io_lib:chars().
format_system_event_info(Info) when is_list(Info) ->
  Format = fun({Key, Value}) -> io_lib:format("~p=~p ", [Key, Value]);
              (Value)        -> io_lib:format("~p ", [Value])
           end,
  lists:reverse(lists:map(Format, Info));
format_system_event_info(Port) when is_port(Port) ->
  format_system_event_info([{port, Port}]);
format_system_event_info(Pid) when is_pid(Pid) ->
  format_system_event_info([{pid_2, Pid}]);
format_system_event_info(Term) ->
  format_system_event_info([{info, Term}]).

%%%_* Emacs ============================================================
%%% Local Variables:
%%% allout-layout: t
%%% erlang-indent-level: 2
%%% End:

--------------------------------------------------------------------------------
/src/system_monitor_pg.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2021 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
-module(system_monitor_pg).

-behaviour(gen_server).
-export([ start_link/0
        , init/1
        , handle_continue/2
        , handle_call/3
        , handle_info/2
        , handle_cast/2
        , format_status/1
        , terminate/2
        ]).

-if(?OTP_RELEASE < 27).
-export([ format_status/2
        ]).
-endif.

-behaviour(system_monitor_callback).
-export([ produce/2 ]).

-include_lib("system_monitor/include/system_monitor.hrl").
-include_lib("kernel/include/logger.hrl").

-define(SERVER, ?MODULE).
-define(FIVE_SECONDS, 5000).
-define(ONE_HOUR, 60*60*1000).

%%%_* API =================================================================
%% system_monitor_callback implementation: hand the events to the
%% server asynchronously.
produce(Type, Events) ->
  gen_server:cast(?SERVER, {produce, Type, Events}).

%%%_* Callbacks =================================================================
start_link() ->
  gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).

%% Trap exits so that the death of the linked DB connection arrives as
%% an 'EXIT' message; the connection itself is opened in
%% handle_continue/2.
init(_Args) ->
  erlang:process_flag(trap_exit, true),
  {ok, #{}, {continue, start_pg}}.

%% Try to connect. On failure schedule a retry in five seconds. In both
%% cases schedule the hourly partition maintenance and start with an
%% empty buffer.
handle_continue(start_pg, State0) ->
  Conn = initialize(),
  if
    Conn =:= undefined ->
      timer:send_after(?FIVE_SECONDS, reinitialize);
    true ->
      ok
  end,
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State0#{connection => Conn, buffer => buffer_new()}}.

%% No synchronous API: every call is answered with `ok'.
handle_call(_Msg, _From, State) ->
  {reply, ok, State}.

%% Out-of-band messages.
%%
%% 'EXIT' (exits are trapped in init/1):
%%   * from the current connection: forget it and retry in 5 seconds;
%%   * from any process while disconnected: retry in 5 seconds;
%%   * a normal exit of any other process: ignored.
%% mk_partitions: hourly partition maintenance, skipped while
%%   disconnected; always re-armed.
%% reinitialize: reconnect attempt. Only acted upon while disconnected:
%%   several `reinitialize' timers can be pending at once (one from
%%   handle_continue/2, one per 'EXIT'), and opening a second
%%   connection would overwrite -- and thereby leak -- the live one.
%% Anything else is logged and dropped instead of crashing the server.
handle_info({'EXIT', Conn, _Reason}, #{connection := Conn} = State) ->
  timer:send_after(?FIVE_SECONDS, reinitialize),
  {noreply, State#{connection => undefined}};
handle_info({'EXIT', _Conn, _Reason}, #{connection := undefined} = State) ->
  timer:send_after(?FIVE_SECONDS, reinitialize),
  {noreply, State};
handle_info({'EXIT', _Conn, normal}, State) ->
  {noreply, State};
handle_info(mk_partitions, #{connection := undefined} = State) ->
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(mk_partitions, #{connection := Conn} = State) ->
  mk_partitions(Conn),
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(reinitialize, #{connection := undefined} = State) ->
  {noreply, State#{connection => initialize()}};
handle_info(reinitialize, State) ->
  %% Already connected: a duplicate timer fired, nothing to do.
  {noreply, State};
handle_info(Unexpected, State) ->
  ?LOG_WARNING( "Ignoring unexpected message: ~p"
              , [Unexpected]
              , #{domain => [system_monitor]}
              ),
  {noreply, State}.

%% Incoming events.
%%
%% While disconnected the events are only buffered. While connected the
%% buffered events plus the new ones are written to the DB and the
%% buffer is reset -- unless this process' own message queue is longer
%% than `max_message_queue_len' (default 1000), in which case the new
%% events are dropped to let the server catch up.
handle_cast({produce, Type, Events}, #{connection := undefined, buffer := Buffer} = State) ->
  {noreply, State#{buffer => buffer_add(Buffer, {Type, Events})}};
handle_cast({produce, Type, Events}, #{connection := Conn, buffer := Buffer} = State) ->
  MaxMsgQueueSize = application:get_env(?APP, max_message_queue_len, 1000),
  case process_info(self(), message_queue_len) of
    {_, N} when N > MaxMsgQueueSize ->
      {noreply, State};
    _ ->
      lists:foreach(fun({Type0, Events0}) ->
                        run_query(Conn, Type0, Events0)
                    end, buffer_to_list(buffer_add(Buffer, {Type, Events}))),
      {noreply, State#{buffer => buffer_new()}}
  end.

%% When the status map carries a `reason' key, replace the buffer with
%% an empty one so that buffered events are not printed; any other
%% status map is returned unchanged.
format_status(Status = #{reason := _Reason, state := State}) ->
  Status#{state => State#{buffer => buffer_new()}};
format_status(Status) ->
  Status.

-if(?OTP_RELEASE < 27).
%% TODO: remove once OTP-25 is the oldest supported OTP version
format_status(normal, [_PDict, State]) ->
  [{data, [{"State", State}]}];
format_status(terminate, [_PDict, State]) ->
  State#{buffer => buffer_new()}.
-endif.

%% Close the DB connection, if there is one, when the server stops.
terminate(_Reason, #{connection := Conn}) ->
  case Conn of
    undefined -> ok;
    _         -> epgsql:close(Conn)
  end.

%%%_* Internal buffer functions ===============================================
%% The buffer is `{Length, Queue}'.
buffer_new() ->
  {0, queue:new()}.

%% Append `Element'. Once the length has reached `max_buffer_size'
%% (default 1000) the oldest element is dropped to make room.
buffer_add({Len, Queue}, Element) ->
  Limit = application:get_env(?APP, max_buffer_size, 1000),
  if
    Len >= Limit ->
      {Len, queue:in(Element, queue:drop(Queue))};
    true ->
      {Len + 1, queue:in(Element, Queue)}
  end.

buffer_to_list({_Len, Queue}) ->
  queue:to_list(Queue).

%%%_* Internal functions ======================================================
%% Insert all `Events' of one `Type' as a single batch.
run_query(Conn, Type, Events) ->
  {ok, Statement} = epgsql:parse(Conn, query(Type)),
  Batch = lists:map(fun(Event) -> {Statement, params(Type, Event)} end, Events),
  Results = epgsql:execute_batch(Conn, Batch),
  %% Crash on failure
  AssertOk = fun({ok, _})    -> ok;
                ({ok, _, _}) -> ok
             end,
  lists:foreach(AssertOk, Results).

%% Open a connection and make sure the partitions exist. Returns the
%% connection, or `undefined' (after logging a warning) when the
%% connection could not be opened.
initialize() ->
  case connect() of
    undefined ->
      log_failed_connection(),
      undefined;
    Connection ->
      mk_partitions(Connection),
      Connection
  end.

%% `undefined' stands for any result of epgsql:connect/1 other than
%% `{ok, Conn}'.
connect() ->
  case epgsql:connect(connect_options()) of
    {ok, Connection} -> Connection;
    _                -> undefined
  end.

%% Connection settings read from the application environment, with
%% defaults for every key.
connect_options() ->
  Env = fun(Key, Default) -> application:get_env(?APP, Key, Default) end,
  #{ host     => Env(db_hostname, "localhost")
   , port     => Env(db_port, 5432)
   , username => Env(db_username, "system_monitor")
   , password => Env(db_password, "system_monitor_password")
   , database => Env(db_name, "system_monitor")
   , timeout  => Env(db_connection_timeout, 5000)
   , codecs   => []
   }.

log_failed_connection() ->
  ?LOG_WARNING("Failed to open connection to the DB.", [], #{domain => [system_monitor]}).

%% Create the daily partitions from today up to `partition_days_ahead'
%% days ahead and drop a ten-day window of partitions older than
%% `partition_days_behind' (both default to 10).
mk_partitions(Conn) ->
  Ahead = application:get_env(system_monitor, partition_days_ahead, 10),
  Behind = application:get_env(system_monitor, partition_days_behind, 10),
  Today = calendar:date_to_gregorian_days(date()),
  ToCreate = lists:seq(Today, Today + Ahead),
  %% Delete 10 days older than partition_days_behind config
  Oldest = Today - Behind - 10,
  ToDelete = lists:seq(Oldest, Oldest + 9),
  _ = [create_partition_tables(Conn, Day) || Day <- ToCreate],
  _ = [delete_partition_tables(Conn, Day) || Day <- ToDelete],
  ok.

%% Names of the tables that are partitioned per day.
partitioned_tables() ->
  [<<"prc">>, <<"app_top">>, <<"fun_top">>, <<"node_role">>].

%% Create the partition (and its ts index) for gregorian day `Day' in
%% every partitioned table; crashes unless both statements succeed.
create_partition_tables(Conn, Day) ->
  From = to_postgres_date(Day),
  To = to_postgres_date(Day + 1),
  _ = [begin
         Query = create_partition_query(Table, Day, From, To),
         [{ok, [], []}, {ok, [], []}] = epgsql:squery(Conn, Query)
       end || Table <- partitioned_tables()],
  ok.

%% Drop the partition for gregorian day `Day' from every partitioned
%% table; crashes unless the statement succeeds.
delete_partition_tables(Conn, Day) ->
  _ = [begin
         Query = delete_partition_query(Table, Day),
         {ok, [], []} = epgsql:squery(Conn, Query)
       end || Table <- partitioned_tables()],
  ok.

%% SQL that creates partition `<Table>_<Day>' covering [From, To) and
%% an index on its ts column. `From' and `To' are date strings.
create_partition_query(Table, Day, From, To) ->
  Partition = <<Table/binary, "_", (integer_to_binary(Day))/binary>>,
  FromBin = list_to_binary(From),
  ToBin = list_to_binary(To),
  <<"CREATE TABLE IF NOT EXISTS ", Partition/binary, " ",
    "PARTITION OF ", Table/binary, " ",
    "FOR VALUES "
    "FROM ('", FromBin/binary, "') TO ('", ToBin/binary, "');"
    "CREATE INDEX IF NOT EXISTS ", Partition/binary, "_ts_idx "
    "ON ", Partition/binary, "(ts);">>.

%% SQL that drops partition `<Table>_<Day>' if it exists.
delete_partition_query(Table, Day) ->
  Suffix = integer_to_binary(Day),
  <<"DROP TABLE IF EXISTS ", Table/binary, "_", Suffix/binary, ";">>.

%% Gregorian day number -> "YYYY-MM-DD" string.
to_postgres_date(GDays) ->
  Date = calendar:gregorian_days_to_date(GDays),
  lists:flatten(io_lib:format("~w-~2..0w-~2..0w", tuple_to_list(Date))).

%% Insert statement for each event type.
query(proc_top) ->
  prc_query();
query(app_top) ->
  app_top_query();
query(fun_top) ->
  fun_top_query();
query(node_role) ->
  node_role_query().

prc_query() ->
  <<"insert into prc (node, ts, pid, dreductions, dmemory, "
    "reductions, memory, message_queue_len, "
    "current_function, initial_call, registered_name, "
    "stack_size, heap_size, total_heap_size, "
    "current_stacktrace, group_leader) "
    "VALUES ($1, $2, $3, $4, $5, $6, $7, $8, "
    "$9, $10, $11, $12, $13, $14, $15, $16);">>.

app_top_query() ->
  <<"insert into app_top (node, ts, application, unit, value) "
    "VALUES ($1, $2, $3, $4, $5);">>.

fun_top_query() ->
  <<"insert into fun_top (node, ts, fun, fun_type, num_processes) "
    "VALUES ($1, $2, $3, $4, $5);">>.

node_role_query() ->
  <<"insert into node_role (node, ts, data) "
    "VALUES ($1, $2, $3);">>.

%% Positional parameters for the insert statement of each event type,
%% in the column order used by query/1.
params(fun_top, {fun_top, Node, TS, Key, Tag, Val}) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , system_monitor:fmt_mfa(Key)
  , Tag
  , Val
  ];
params(app_top, {app_top, Node, TS, Application, Tag, Val}) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , atom_to_list(Application)
  , atom_to_list(Tag)
  , Val
  ];
params(node_role, {node_role, Node, TS, Bin}) ->
  [atom_to_list(Node), ts_to_timestamp(TS), Bin];
params(proc_top, #erl_top{} = Top) ->
  [ atom_to_list(Top#erl_top.node)
  , ts_to_timestamp(Top#erl_top.ts)
  , Top#erl_top.pid
  , Top#erl_top.dreductions
  , Top#erl_top.dmemory
  , Top#erl_top.reductions
  , Top#erl_top.memory
  , Top#erl_top.message_queue_len
  , system_monitor:fmt_mfa(Top#erl_top.current_function)
  , system_monitor:fmt_mfa(Top#erl_top.initial_call)
  , name_to_list(Top#erl_top.registered_name)
  , Top#erl_top.stack_size
  , Top#erl_top.heap_size
  , Top#erl_top.total_heap_size
  , system_monitor:fmt_stack(Top#erl_top.current_stacktrace)
  , Top#erl_top.group_leader
  ].

%% Native-unit system time -> UTC `{Date, Time}'.
ts_to_timestamp(TS) ->
  calendar:system_time_to_universal_time(TS, native).

%% Printable latin-1 strings are passed through unchanged; any other
%% term is rendered with ~p.
name_to_list(Term) ->
  case io_lib:printable_latin1_list(Term) of
    false -> lists:flatten(io_lib:format("~p", [Term]));
    true  -> Term
  end.

--------------------------------------------------------------------------------
/src/system_monitor_sup.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
-module(system_monitor_sup).

%% TODO: Dialyzer doesn't like this one:
%-behaviour(supervisor3).

%% External exports
-export([start_link/0]).

%% supervisor callbacks
-export([init/1, post_init/1]).

%%--------------------------------------------------------------------
%% Macros
%%--------------------------------------------------------------------
-define(SERVER, ?MODULE).
-define(SUP2, system_monitor2_sup).

%%%----------------------------------------------------------------------
%%% API
%%%----------------------------------------------------------------------
start_link() ->
  supervisor3:start_link({local, ?SERVER}, ?MODULE, ?SERVER).

%%%----------------------------------------------------------------------
%%% Callback functions from supervisor
%%%----------------------------------------------------------------------

%% Child spec for a worker module started through its start_link/0.
worker(Module) ->
  server(Module, worker).

%% Child spec with the default shutdown value of 2000.
server(Module, Type) ->
  server(Module, Type, 2000).

%% Child spec for `Module:start_link()' with restart `{permanent, 15}'.
server(Module, Type, Shutdown) ->
  StartMFA = {Module, start_link, []},
  {Module, StartMFA, {permanent, 15}, Shutdown, Type, [Module]}.

post_init(_) ->
  ignore.

init(?SERVER) ->
  %% The top level supervisor *does not allow restarts*; if a component
  %% directly under this supervisor crashes, the entire node will shut
  %% down and restart. Thus, only those components that must never be
  %% unavailable should be directly under this supervisor.
  SecondLevel = { ?SUP2
                , {supervisor3, start_link, [{local, ?SUP2}, ?MODULE, ?SUP2]}
                , permanent
                , 2000
                , supervisor
                , [?MODULE]
                },
  NoRestarts = {one_for_one, 0, 1}, % no restarts allowed!
  {ok, {NoRestarts, [SecondLevel]}};
init(?SUP2) ->
  %% The second-level supervisor allows some restarts. This is where the
  %% normal services live.
  Services = [ worker(Module)
               || Module <- [ system_monitor_top
                            , system_monitor_events
                            , system_monitor
                            ]
             ],
  {ok, {{one_for_one, 10, 20}, Services ++ producer_callback()}}.

%% Child spec list for the configured callback module; empty when no
%% callback module is configured.
producer_callback() ->
  case system_monitor_callback:get_callback_mod() of
    undefined -> [];
    Module    -> [worker(Module)]
  end.

--------------------------------------------------------------------------------
/src/system_monitor_top.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
%%% @doc
%%% Collect Erlang process statistics and push it to the configured destination
%%% @end
-module(system_monitor_top).

-behaviour(gen_server).

-include_lib("system_monitor/include/system_monitor.hrl").

-ifdef(TEST).
-include_lib("proper/include/proper.hrl").
-include_lib("eunit/include/eunit.hrl").
-endif. % TEST

%% API
-export([start_link/0, get_app_top/0, get_abs_app_top/0,
         get_app_memory/0, get_app_processes/0,
         get_function_top/0, get_proc_top/0, get_proc_top/1]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

-export_type([function_top/0]).

-define(SERVER, ?MODULE).

%% Scratch ETS table used while aggregating per-application figures.
-define(TOP_APP_TAB, sysmon_top_app_tab).
-define(TAB_OPTS, [private, named_table, set, {keypos, 1}]).

%% Type and record definitions

%% Server state. The first four fields are read from the application
%% environment in init/1; the rest are refreshed on every
%% `collect_data' round.
-record(state,
        { max_procs        :: integer()          % top_max_procs
        , sample_size      :: non_neg_integer()  % top_sample_size
        , interval         :: integer()          % top_sample_interval
        , num_items        :: integer()          % top_num_items
        , timer            :: timer:tref()       % pending collect_data timer
        , old_data         :: [#pid_info{}]      % previous round's samples
        , last_ts          :: integer()          % os:system_time() of last round
        , proc_top = []    :: [#erl_top{}]
        , app_top = []     :: [#app_top{}]
        , function_top =
            #{ initial_call => []
             , current_function => []
             }             :: function_top()
        }).

%% A "top" is `{SmallestKey, Tree}' where the tree maps a key to the
%% list of samples sharing that key.
-type top() :: {integer(), gb_trees:tree(integer(), [#pid_info{}])}.

-define(PROCESS_INFO_FIELDS_NEW,
        [ initial_call, dictionary, registered_name, group_leader, reductions, memory,
          message_queue_len, current_function]).

-define(PROCESS_INFO_FIELDS_UPDATE,
        [ reductions, memory, message_queue_len, current_function]).

%% Extra process_info/2 items fetched for the processes that made it
%% into the top.
-define(ADDITIONAL_FIELDS,
        [ stack_size
        , heap_size, total_heap_size, current_stacktrace
        ]).

%%%===================================================================
%%% API
%%%===================================================================

%%--------------------------------------------------------------------
%% @doc
%% Get Erlang process top
%% @end
%%--------------------------------------------------------------------
-spec get_proc_top() -> {integer(), [#erl_top{}]}.
get_proc_top() ->
  {ok, Snapshot} = gen_server:call(?SERVER, get_proc_top, infinity),
  Snapshot.


%%--------------------------------------------------------------------
%% @doc
%% Get Erlang process top info for one process
%% @end
%%--------------------------------------------------------------------
-spec get_proc_top(pid()) -> #erl_top{} | false.
get_proc_top(Pid) ->
  Request = {get_proc_top, Pid},
  gen_server:call(?SERVER, Request, infinity).

%%--------------------------------------------------------------------
%% @doc
%% Get relative reduction utilization per application, sorted by
%% reductions
%% @end
%%--------------------------------------------------------------------
-spec get_app_top() -> [{atom(), float()}].
get_app_top() ->
  do_get_app_top(#app_top.red_rel).

%%--------------------------------------------------------------------
%% @doc
%% Get absolute reduction utilization per application, sorted by
%% reductions
%% @end
%%--------------------------------------------------------------------
-spec get_abs_app_top() -> [{atom(), integer()}].
get_abs_app_top() ->
  do_get_app_top(#app_top.red_abs).

%%--------------------------------------------------------------------
%% @doc
%% Get memory utilization per application, sorted by memory
%% @end
%%--------------------------------------------------------------------
-spec get_app_memory() -> [{atom(), integer()}].
get_app_memory() ->
  do_get_app_top(#app_top.memory).

%%--------------------------------------------------------------------
%% @doc
%% Get number of processes spawned by each application
%% @end
%%--------------------------------------------------------------------
-spec get_app_processes() -> [{atom(), integer()}].
get_app_processes() ->
  do_get_app_top(#app_top.processes).

%%--------------------------------------------------------------------
%% @doc
%% Get approximate distribution of initial_call and current_function
%% per process
%% @end
%%--------------------------------------------------------------------
-spec get_function_top() -> function_top().
get_function_top() ->
  {ok, FunctionTop} = gen_server:call(?SERVER, get_function_top, infinity),
  FunctionTop.

%%--------------------------------------------------------------------
%% @doc
%% Starts the server
%% @end
%%--------------------------------------------------------------------
-spec start_link() ->
        {ok, pid()} | ignore | {error, term()}.
start_link() ->
  gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).

%%%===================================================================
%%% gen_server callbacks
%%%===================================================================

%% Read the mandatory settings (crashing when one is missing) and
%% trigger the first collection round immediately.
init([]) ->
  Env = fun(Key) ->
            {ok, Value} = application:get_env(?APP, Key),
            Value
        end,
  MaxProcs   = Env(top_max_procs),
  SampleSize = Env(top_sample_size),
  Interval   = Env(top_sample_interval),
  NumItems   = Env(top_num_items),
  {ok, TRef} = timer:send_after(0, collect_data),
  State = #state{ max_procs   = MaxProcs
                , sample_size = SampleSize
                , interval    = Interval
                , num_items   = NumItems
                , timer       = TRef
                , last_ts     = os:system_time()
                , old_data    = []
                },
  {ok, State}.

%% Serve the most recently collected data; nothing is computed here.
handle_call(get_proc_top, _From, State) ->
  Reply = {ok, {State#state.last_ts, State#state.proc_top}},
  {reply, Reply, State};
handle_call({get_proc_top, Pid}, _From, State) ->
  %% Pids are stored in their printed form in #erl_top{}.
  Entry = lists:keyfind(pid_to_list(Pid), #erl_top.pid, State#state.proc_top),
  {reply, Entry, State};
handle_call(get_app_top, _From, State) ->
  {reply, {ok, State#state.app_top}, State};
handle_call(get_function_top, _From, State) ->
  {reply, {ok, State#state.function_top}, State};
handle_call(_Msg, _From, State) ->
  {reply, {error, bad_call}, State}.

handle_cast(_Msg, State) ->
  {noreply, State}.

%% One collection round.
%%
%% When the node runs more processes than `max_procs' allows, per
%% process data is skipped and a single placeholder entry is published
%% instead. The next round is scheduled so that rounds start roughly
%% `interval' milliseconds apart, but never less than 500 ms after the
%% current one has finished.
handle_info(collect_data, State) ->
  T1 = os:system_time(),
  NumProcesses = erlang:system_info(process_count),
  {NewData, ProcTop, AppTop} =
    case should_calculate_info(NumProcesses, State#state.max_procs) of
      true ->
        T0 = State#state.last_ts,
        Dt = erlang:convert_time_unit(T1 - T0, native, microsecond),
        Pids = lists:sort(processes()),
        OldData = State#state.old_data,
        Deltas = calc_deltas(OldData, Pids, Dt),
        Procs = do_proc_top(Deltas, State, T1),
        Apps = do_app_top(Deltas),
        {Deltas, Procs, Apps};
      false ->
        {[], [fake_erl_top_msg(T1)], []}
    end,
  FunctionTop = process_aggregate(NewData, State#state.sample_size),
  %% Calculate timer interval. Sleep at least half a second between
  %% samples when sysmon is running very slow.
  %% The elapsed time must be in milliseconds: it is subtracted from
  %% `interval' and the result is handed to timer:send_after/2, both
  %% of which are in milliseconds.
  T2 = os:system_time(),
  ElapsedMs = erlang:convert_time_unit(T2 - T1, native, millisecond),
  SleepTime = max(500, State#state.interval - ElapsedMs),
  {ok, TRef} = timer:send_after(SleepTime, collect_data),
  {noreply, State#state{ last_ts      = T1
                       , old_data     = NewData
                       , proc_top     = ProcTop
                       , app_top      = AppTop
                       , function_top = FunctionTop
                       , timer        = TRef
                       }};
handle_info(_Info, State) ->
  {noreply, State}.

terminate(_Reason, _State) ->
  ok.

code_change(_OldVsn, State, _Extra) ->
  {ok, State}.

%%%===================================================================
%%% Internal functions
%%%===================================================================

%%--------------------------------------------------------------------
%% @doc
%% Calculate resource consumption per application
%% @end
%%--------------------------------------------------------------------
-spec do_app_top([#pid_info{}]) -> [#app_top{}].
do_app_top(Deltas) ->
  %% Prepare the temporary table:
  _ = case ets:info(?TOP_APP_TAB) =:= undefined of
        true  -> ets:new(?TOP_APP_TAB, ?TAB_OPTS);
        false -> ets:delete_all_objects(?TOP_APP_TAB)
      end,
  %% Traverse process infos, accumulating per group leader
  %% {GroupLeader, Reductions, Memory, NumProcesses}:
  Account =
    fun(#pid_info{group_leader = GL, dreductions = DR, memory = Mem}) ->
        Increments = [{2, round(DR)}, {3, Mem}, {4, 1}],
        ets:update_counter(?TOP_APP_TAB, GL, Increments, {GL, 0, 0, 0})
    end,
  lists:foreach(Account, Deltas),
  %% Calculate final values:
  TotalReds = lists:sum([Reds || {_, Reds, _, _} <- ets:tab2list(?TOP_APP_TAB)]),
  %% Group leaders that do not belong to any application are lumped
  %% together under the `unknown' application.
  Classify =
    fun({GL, Reds, Mem, Procs}, {Apps, UReds, UMem, UProcs}) ->
        case application_controller:get_application(GL) of
          {ok, App} ->
            Entry = #app_top{ app       = App
                            , red_rel   = divide(Reds, TotalReds)
                            , red_abs   = Reds
                            , memory    = Mem
                            , processes = Procs
                            },
            {[Entry | Apps], UReds, UMem, UProcs};
          undefined ->
            {Apps, UReds + Reds, UMem + Mem, UProcs + Procs}
        end
    end,
  {Known, UnknownReds, UnknownMem, UnknownProcs} =
    ets:foldl(Classify, {[], 0, 0, 0}, ?TOP_APP_TAB),
  Unknown = #app_top{ app       = unknown
                    , red_rel   = divide(UnknownReds, TotalReds)
                    , red_abs   = UnknownReds
                    , memory    = UnknownMem
                    , processes = UnknownProcs
                    },
  [Unknown | Known].

%% Division that yields 0 for an integer-zero denominator.
divide(_Numerator, Denominator) when Denominator =:= 0 ->
  0;
divide(Numerator, Denominator) ->
  Numerator / Denominator.

%%------------------------------------------------------------------------------
%% @doc Produce an aggregate summary of initial call and current function for
%% processes.
%%------------------------------------------------------------------------------
-spec process_aggregate([#pid_info{}], non_neg_integer()) -> function_top().
process_aggregate(ProcInfos0, SampleSize) ->
  Sample = random_sample(ProcInfos0, SampleSize),
  Total = length(Sample),
  InitTab = ets:new(sysmon_init_call, []),
  CurrTab = ets:new(sysmon_curr_fun, []),
  %% Count one more occurrence of Key in Tab.
  Bump = fun(Tab, Key) ->
             ets:update_counter(Tab, Key, {2, 1}, {Key, 0})
         end,
  %% process_info/2 may return 'undefined' in some cases (e.g. native
  %% compiled (HiPE) modules). We collect all of these under
  %% {undefined, undefined, 0}.
  Normalize = fun(undefined) -> {undefined, undefined, 0};
                 (MFA)       -> MFA
              end,
  lists:foreach(
    fun(#pid_info{initial_call = InitCall, current_function = CurrFun}) ->
        Bump(InitTab, InitCall),
        Bump(CurrTab, Normalize(CurrFun))
    end,
    Sample),
  %% Most frequent first, counts turned into a share of the sample.
  Shares = fun(Tab) ->
               Descending = lists:reverse(lists:keysort(2, ets:tab2list(Tab))),
               [{Key, Count / Total} || {Key, Count} <- Descending]
           end,
  InitialCalls = Shares(InitTab),
  CurrentFunctions = Shares(CurrTab),
  ets:delete(InitTab),
  ets:delete(CurrTab),
  #{ initial_call     => InitialCalls
   , current_function => CurrentFunctions
   }.

%%--------------------------------------------------------------------
%% @doc
%% Find processes that take the most resources
%% @end
%%--------------------------------------------------------------------
-spec do_proc_top([#pid_info{}], #state{}, integer()) -> [#erl_top{}].
do_proc_top(Deltas, State, Now) ->
  Limit = State#state.num_items,
  {Seed, Remaining} =
    case length(Deltas) > Limit of
      true  -> lists:split(Limit, Deltas);
      false -> {Deltas, []}
    end,
  %% One top is maintained per ranking criterion:
  Fields = [ #pid_info.dreductions
           , #pid_info.memory
           , #pid_info.dmemory
           , #pid_info.message_queue_len
           ],
  %% Generate initial conditions for the top search using the first
  %% Limit elements:
  Tops0 = [sort_top(Field, Seed) || Field <- Fields],
  %% Iterate through the rest of the processes:
  Tops =
    lists:foldl(
      fun(Delta, Acc) ->
          [ maybe_push_to_top(Field, Delta, Top)
            || {Field, Top} <- lists:zip(Fields, Acc)
          ]
      end,
      Tops0,
      Remaining),
  %% Some pids may appear in more than one group, fix this:
  Unique = lists:usort(lists:flatten([top_to_list(Top) || Top <- Tops])),
  %% Request additional data for the top processes:
  [finalize_proc_info(P, Now) || P <- Unique].

-spec finalize_proc_info(#pid_info{}, integer()) -> #erl_top{}.
%% Convert a #pid_info{} sample into an #erl_top{} record stamped with
%% `Now', fetching ?ADDITIONAL_FIELDS from the process. When the
%% process no longer exists (process_info/2 returns `undefined') the
%% extra fields are filled with placeholder values.
finalize_proc_info(ProcInfo = #pid_info{pid = Pid, group_leader = GL}, Now) ->
  %% Fields that are filled in the same way in both outcomes. Wrapped
  %% in a fun so that nothing is evaluated before process_info/2.
  Common =
    fun() ->
        #erl_top{ node              = system_monitor:node_name()
                , ts                = Now
                , pid               = pid_to_list(Pid)
                , group_leader      = pid_to_list(GL)
                , dreductions       = ProcInfo#pid_info.dreductions
                , dmemory           = ProcInfo#pid_info.dmemory
                , reductions        = ProcInfo#pid_info.reductions
                , memory            = ProcInfo#pid_info.memory
                , message_queue_len = ProcInfo#pid_info.message_queue_len
                }
    end,
  case process_info(Pid, ?ADDITIONAL_FIELDS) of
    [ {stack_size, StackSize}
    , {heap_size, HeapSize}
    , {total_heap_size, TotalHeapSize}
    , {current_stacktrace, Stacktrace}
    ] ->
      %% The innermost stack frame tells what the process is doing:
      CurrentFunction =
        case Stacktrace of
          [] ->
            {unknown, unknown, 0};
          [{Mod, Fun, Arity, _Location} | _] ->
            {Mod, Fun, Arity}
        end,
      (Common())#erl_top{ initial_call       = ProcInfo#pid_info.initial_call
                        , registered_name    = ProcInfo#pid_info.registered_name
                        , stack_size         = StackSize
                        , heap_size          = HeapSize
                        , total_heap_size    = TotalHeapSize
                        , current_stacktrace = Stacktrace
                        , current_function   = CurrentFunction
                        };
    undefined ->
      %% The process is gone; report what was sampled earlier.
      (Common())#erl_top{ initial_call       = {unknown, unknown, 0}
                        , current_function   = {unknown, unknown, 0}
                        , stack_size         = 0
                        , heap_size          = 0
                        , total_heap_size    = 0
                        , current_stacktrace = []
                        }
  end.

-spec maybe_push_to_top(integer(), #pid_info{}, top()) -> top().
%% Offer `Val' to a top structure `{Min, Tree}', where `Tree' maps a
%% metric value to the list of entries having that value and `Min' is
%% the smallest key in `Tree'.
%%
%% When the metric of `Val' (its element number `FieldID') is greater
%% than `Min', exactly one entry with the smallest metric is evicted
%% and `Val' is inserted, so the number of stored entries stays the
%% same. Otherwise the top is returned unchanged.
maybe_push_to_top(FieldID, Val, {OldMin, OldTop}) ->
  Key = element(FieldID, Val),
  if OldMin < Key ->
      {SKey, SVal, Top1} = gb_trees:take_smallest(OldTop),
      Top2 =
        case SVal of
          [_] ->
            %% Only one entry had the smallest key; it is evicted.
            Top1;
          [_|SVal2] ->
            %% Several entries share the smallest key: evict one and
            %% put the remaining ones back as they are. SKey has just
            %% been removed by take_smallest/1, so insert/3 is safe.
            %% gb_insert/3 must not be used here, since it would wrap
            %% the list of entries into one more list.
            gb_trees:insert(SKey, SVal2, Top1)
        end,
      NewTop = gb_insert(Key, Val, Top2),
      {Minimal, _} = gb_trees:smallest(NewTop),
      {Minimal, NewTop};
     true ->
      {OldMin, OldTop}
  end.

%% Build a top structure out of the list `L', keyed by element number
%% `Field' of each entry. `L' must not be empty: gb_trees:smallest/1
%% requires a non-empty tree.
-spec sort_top(integer(), [#pid_info{}]) -> top().
sort_top(Field, L) ->
  Top =
    lists:foldl(
      fun(Val, Acc) ->
          gb_insert(element(Field, Val), Val, Acc)
      end,
      gb_trees:empty(),
      L),
  {Minimal, _} = gb_trees:smallest(Top),
  {Minimal, Top}.

%% Add a single entry `Val' under `Key'. Entries sharing the same key
%% are kept together in a list.
gb_insert(Key, Val, Tree) ->
  case gb_trees:lookup(Key, Tree) of
    none ->
      gb_trees:enter(Key, [Val], Tree);
    {value, Vals} ->
      gb_trees:update(Key, [Val|Vals], Tree)
  end.

%% Returns `false' only when `MaxProcs' is positive and smaller than
%% `NumPids'; a non-positive `MaxProcs' never yields `false'.
-spec should_calculate_info(non_neg_integer(), integer()) -> boolean().
should_calculate_info(NumPids, MaxProcs) ->
  not (MaxProcs < NumPids andalso MaxProcs > 0).

%% Refresh the volatile fields of an existing sample. Returns
%% `undefined' when the process no longer exists.
pid_info_update(PI) ->
  #pid_info{pid = Pid} = PI,
  case should_not_update_memory(PI) of
    true ->
      %% Calling process_info(Pid, memory) can block both system_monitor and the
      %% monitored Pid for a long time, which can degrade system performance.
      %% If it seems dangerous to query memory, don't do that. The memory metric
      %% must be present if Pid's sample is present, so we set it to zero.
      case erlang:process_info(Pid, ?PROCESS_INFO_FIELDS_UPDATE -- [memory]) of
        [ {reductions, Reds}
        , {message_queue_len, MQL}
        , {current_function, CurFun}
        ] ->
          pid_info_update(PI, Reds, _Mem = 0, MQL, CurFun);
        undefined ->
          undefined
      end;
    false ->
      case erlang:process_info(Pid, ?PROCESS_INFO_FIELDS_UPDATE) of
        [ {reductions, Reds}
        , {memory, Mem}
        , {message_queue_len, MQL}
        , {current_function, CurFun}
        ] ->
          pid_info_update(PI, Reds, Mem, MQL, CurFun);
        undefined ->
          undefined
      end
  end.

%% Store freshly read values in the sample.
pid_info_update(PI, Reds, Mem, MQL, CurFun) ->
  PI#pid_info{
    reductions = Reds
  , memory = Mem
  , message_queue_len = MQL
  , current_function = CurFun
  }.

%% `true' when the message queue length seen in the previous sample
%% exceeds the `mql_limit_for_memory' application environment
%% variable. When the variable is not set the result is always `false'.
should_not_update_memory(PI) ->
  %% this relies on number < atom in Erlang's term order
  PI#pid_info.message_queue_len > application:get_env(?APP, mql_limit_for_memory, undefined).

%% Create the first sample for a process. Returns `undefined' when the
%% process no longer exists.
-spec pid_info_new(pid()) -> #pid_info{} | undefined.
pid_info_new(Pid) ->
  case erlang:process_info(Pid, ?PROCESS_INFO_FIELDS_NEW) of
    [ {initial_call, _} = InitialCallProp
    , {dictionary, _} = DictProp
    , {registered_name, RegisteredName}
    , {group_leader, GL}
    , {reductions, Red}
    , {memory, Mem}
    , {message_queue_len, MQ}
    , {current_function, CF}
    ] ->
      #pid_info{
        pid = Pid,
        initial_call = initial_call([InitialCallProp,DictProp]),
        registered_name = RegisteredName,
        group_leader = GL,
        reductions = Red,
        memory = Mem,
        message_queue_len = MQ,
        current_function = CF
      };
    undefined ->
      %% The process has died while we were collecting other data...
      undefined
  end.

%% Compute fresh samples, including per-time-unit deltas, for the
%% processes in `Pids', using `OldData' as the previous samples.
%% Processes that disappeared in the meantime are left out.
calc_deltas(OldData, Pids, Dt) ->
  NewData = calc_deltas(OldData, Pids, [], Dt),
  lists:filter(
    fun(undefined)   -> false;
       (#pid_info{}) -> true
    end,
    NewData).

%% Merge of two lists ordered by pid: the previous samples and the
%% pids that are alive now.
%%
%% Both lists have to be sorted in ascending order. The result of this
%% function is produced with lists:sort/1, so feeding it back as the
%% old data satisfies that.
%% NOTE(review): this presumes `pid' is the first field of #pid_info{}
%% and that the caller passes `Pids' sorted ascending -- confirm.
-spec calc_deltas(PIL, [pid()], PIL, number()) -> PIL
          when PIL :: [#pid_info{}].
calc_deltas([], New, Acc, Dt) ->
  %% The rest of the processes are new
  lists:sort([delta(undefined, pid_info_new(Pid), Dt) || Pid <- New] ++ Acc);
calc_deltas(_Old, [], Acc, _) ->
  %% The rest of the processes have terminated
  lists:sort(Acc);
calc_deltas(Old, Pids, Acc, Dt) ->
  [PI1 = #pid_info{pid = P1} | OldT] = Old,
  [P2 | PidsT] = Pids,
  if P1 < P2 ->
      %% Every remaining live pid is >= P2 > P1, so P1 is not alive
      %% any more: P1 has terminated
      calc_deltas(OldT, Pids, Acc, Dt);
     P1 > P2 ->
      %% Every remaining old sample is >= P1 > P2, so there is no
      %% previous sample for P2: P2 is a new process
      Delta = delta(undefined, pid_info_new(P2), Dt),
      calc_deltas(Old, PidsT, [Delta|Acc], Dt);
     P1 =:= P2 -> %% We already have record of P2
      case pid_info_update(PI1) of
        undefined -> % P1 just terminated
          calc_deltas(OldT, PidsT, Acc, Dt);
        PI2 ->
          Delta = delta(PI1, PI2, Dt),
          calc_deltas(OldT, PidsT, [Delta|Acc], Dt)
      end
  end.

%% All entries stored in a top structure, as one flat list.
-spec top_to_list(top()) -> [#pid_info{}].
top_to_list({_, Top}) ->
  lists:append(gb_trees:values(Top)).

%% Fill in `dreductions' and `dmemory' of the new sample `P2'. When
%% there is no previous sample (`P1' is `undefined') the absolute
%% values of `P2' are used in place of the differences.
-spec delta(undefined, undefined, number()) -> undefined;
           (#pid_info{} | undefined, #pid_info{}, number()) -> #pid_info{}.
delta(undefined, undefined, _Dt) -> undefined;
delta(P1, P2, Dt) ->
  {DRed, DMem} =
    case P1 of
      undefined ->
        { divide(P2#pid_info.reductions, Dt)
        , divide(P2#pid_info.memory, Dt)
        };
      _ ->
        { divide((P2#pid_info.reductions - P1#pid_info.reductions), Dt)
        , divide((P2#pid_info.memory - P1#pid_info.memory), Dt)
        }
    end,
  P2#pid_info
    { dreductions = DRed
    , dmemory     = DMem
    }.

-spec do_get_app_top(integer()) -> [{atom(), number()}].
%% Ask ?SERVER for the per-application data and return `{App, Value}'
%% pairs, where `Value' is element number `FieldId' of each entry,
%% ordered by `Value', largest first.
do_get_app_top(FieldId) ->
  {ok, AppTops} = gen_server:call(?SERVER, get_app_top, infinity),
  Pairs = [{AppTop#app_top.app, element(FieldId, AppTop)} || AppTop <- AppTops],
  Ascending = lists:keysort(2, Pairs),
  lists:reverse(Ascending).

%% Build a placeholder #erl_top{} entry with a made-up pid; its name
%% and call fields say that there are too many processes.
-spec fake_erl_top_msg(integer()) -> #erl_top{}.
fake_erl_top_msg(Now) ->
  FakePid = "<42.42.42>",
  FakeMFA = {too_many, processes, 0},
  NotAvailable = -1,
  #erl_top{ node               = system_monitor:node_name()
          , ts                 = Now
          , pid                = FakePid
          , group_leader       = FakePid
          , dreductions        = 0
          , dmemory            = 0
          , reductions         = NotAvailable
          , memory             = NotAvailable
          , message_queue_len  = NotAvailable
          , initial_call       = FakeMFA
          , registered_name    = error_too_many_processes
          , current_stacktrace = []
          , current_function   = FakeMFA
          , stack_size         = NotAvailable
          , heap_size          = NotAvailable
          , total_heap_size    = NotAvailable
          }.

-spec random_sample(list(A), non_neg_integer()) -> list(A).
%% Every element is kept independently with probability
%% SampleSize/length(L), hence the actual sample size may slightly
%% differ from the SampleSize argument
random_sample([], _SampleSize) ->
  [];
random_sample(L, SampleSize) ->
  KeepProbability = SampleSize/length(L),
  [Elem || Elem <- L, rand:uniform() < KeepProbability].

%% Initial call of a process, given its process_info/2 property list.
%% For processes started via proc_lib the property list is handed to
%% proc_lib:translate_initial_call/1.
-spec initial_call(proplists:proplist()) -> mfa().
initial_call(Info) ->
  RawInitialCall = proplists:get_value(initial_call, Info),
  case RawInitialCall of
    {proc_lib, init_p, 5} -> proc_lib:translate_initial_call(Info);
    _                     -> RawInitialCall
  end.

%%%===================================================================
%%% Tests
%%%===================================================================

-ifdef(TEST).

-dialyzer({nowarn_function, [ maybe_push_to_top_test/0
                            , maybe_push_to_top_same_as_sort_prop/0
                            , initial_call_test/0
                            , initial_call_fallback_test/0
                            ]}).

%% Seed a top with `Top', offer `Val' to it and return the resulting
%% entries in ascending order.
maybe_push_to_top_wrapper(Val, Top) ->
  Seeded = sort_top(1, Top),
  Pushed = maybe_push_to_top(1, Val, Seeded),
  lists:sort(top_to_list(Pushed)).

%% maybe_push_to_top function is just an optimized version
%% of sorting a list and then taking its first N elements.
%%
%% Check that it is indeed true
maybe_push_to_top_same_as_sort_prop() ->
  ?FORALL({Val, Top}, {{number()}, [{number()}]},
          %% A top cannot be built from an empty list (sort_top/2 asks
          %% for the smallest key of the tree), so skip that case:
          ?IMPLIES(Top =/= [],
                   begin
                     NumElems = length(Top),
                     PlainSort = lists:reverse(lists:sort([Val|Top])),
                     %% The NumElems largest elements. Both sides are
                     %% sorted the same way before being compared, so
                     %% that only the contents matter, not the order.
                     Reference = lists:sort(lists:sublist(PlainSort, NumElems)),
                     Result = lists:sort(maybe_push_to_top_wrapper(Val, Top)),
                     Result == Reference
                   end)).

maybe_push_to_top_test() ->
  ?assertEqual(true, proper:quickcheck(
                       proper:numtests(
                         1000,
                         maybe_push_to_top_same_as_sort_prop())
                      )).

%% With both `initial_call' and `dictionary' available, the initial
%% call of a proc_lib process is translated to the real entry point.
initial_call_test() ->
  GetProcInfo = fun(Pid) ->
                    erlang:process_info(Pid, [initial_call, dictionary])
                end,
  Pid1 = spawn(fun() -> timer:sleep(1000) end),
  timer:sleep(100), %% Sleep to avoid race condition
  ?assertEqual( {erlang, apply, 2}
              , initial_call(GetProcInfo(Pid1))
              ),
  Pid2 = proc_lib:spawn(timer, sleep, [1000]),
  timer:sleep(100), %% Sleep to avoid race condition
  ?assertEqual( {timer, sleep, 1}
              , initial_call(GetProcInfo(Pid2))
              ).

%% Without the `dictionary' item the proc_lib initial call cannot be
%% translated and is reported as is.
initial_call_fallback_test() ->
  GetProcInfo = fun(Pid) ->
                    erlang:process_info(Pid, [initial_call])
                end,
  Pid1 = spawn(fun() -> timer:sleep(1000) end),
  timer:sleep(100), %% Sleep to avoid race condition
  ?assertEqual( {erlang, apply, 2}
              , initial_call(GetProcInfo(Pid1))
              ),
  Pid2 = proc_lib:spawn(timer, sleep, [1000]),
  timer:sleep(100), %% Sleep to avoid race condition
  ?assertEqual( {proc_lib, init_p, 5}
              , initial_call(GetProcInfo(Pid2))
              ).

-endif.

%%%_* Emacs ============================================================
%%% Local Variables:
%%% allout-layout: t
%%% erlang-indent-level: 2
%%% End:

--------------------------------------------------------------------------------
/test/system_monitor_tests.erl:
--------------------------------------------------------------------------------
-module(system_monitor_tests).

-include_lib("eunit/include/eunit.hrl").

%% The application and its dependencies can be started.
start_test() ->
  StartResult = application:ensure_all_started(system_monitor),
  ?assertMatch({ok, _}, StartResult),
  application:stop(system_monitor).

%% With `callback_mod' set to system_monitor_pg, a process is
%% registered under that name once the application is running.
callback_is_started_when_configured_test() ->
  application:set_env(system_monitor, callback_mod, system_monitor_pg),
  StartResult = application:ensure_all_started(system_monitor),
  ?assertMatch({ok, _}, StartResult),
  CallbackPid = whereis(system_monitor_pg),
  ?assertNotEqual(undefined, CallbackPid),
  application:stop(system_monitor).

%% With `callback_mod' unset, nothing is registered as
%% system_monitor_pg.
callback_is_started_test() ->
  application:unset_env(system_monitor, callback_mod),
  StartResult = application:ensure_all_started(system_monitor),
  ?assertMatch({ok, _}, StartResult),
  CallbackPid = whereis(system_monitor_pg),
  ?assertEqual(undefined, CallbackPid),
  application:stop(system_monitor).

--------------------------------------------------------------------------------