├── .github
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
└── workflows
│ ├── ci.yml
│ └── repolint.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── README.md
├── doc
├── app_top.png
├── proc_history.png
└── proc_top.png
├── docker
├── db
│ ├── 10-create_users.sql
│ ├── 20-create_db.sql
│ ├── 30-db_schema.sh
│ └── Dockerfile
├── docker-compose.yml
└── grafana
│ ├── Dockerfile
│ ├── dashboards
│ ├── app_top.json
│ ├── file.yml
│ ├── proc_history.json
│ └── proc_top.json
│ └── datasources
│ └── postgres.yml
├── include
└── system_monitor.hrl
├── rebar.config
├── rebar.lock
├── src
├── system_monitor.app.src
├── system_monitor.erl
├── system_monitor_app.erl
├── system_monitor_callback.erl
├── system_monitor_events.erl
├── system_monitor_pg.erl
├── system_monitor_sup.erl
└── system_monitor_top.erl
└── test
└── system_monitor_tests.erl
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | - Demonstrating empathy and kindness toward other people
21 | - Being respectful of differing opinions, viewpoints, and experiences
22 | - Giving and gracefully accepting constructive feedback
23 | - Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | - Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | - The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | - Trolling, insulting or derogatory comments, and personal or political attacks
33 | - Public or private harassment
34 | - Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | - Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | developers@klarna.com.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to this Klarna project
2 |
3 | Are you here to help with this Klarna project? Welcome! Please read the following to better understand how to ask questions or work on something.
4 |
5 | All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md). Please make sure you are welcoming and friendly in all of our spaces.
6 |
7 | ## Get in touch
8 |
9 | - Report bugs, suggest features or view the source code on GitHub.
10 | - If you have any questions concerning this product, please contact developers@klarna.com.
11 |
12 | ## Contributing to development
13 |
14 | At Klarna, we strive toward achieving the highest possible quality for our
15 | products. Therefore, we require you to follow these guidelines if you wish
16 | to contribute.
17 |
18 | Your contribution has to meet the following criteria:
19 |
20 | - It is accompanied by a description regarding what has been changed and why.
21 | - Pull requests should implement a boxed change, meaning they should optimally not try to address many things at once.
22 | - All code and documentation must follow the style specified by
23 | the included configuration.
24 | - New features and bug fixes must have accompanying unit tests.
25 | - All unit tests should pass.
26 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on:
3 | push:
4 | branches:
5 | - '*'
6 | pull_request:
7 | branches:
8 | - master
9 | jobs:
10 | build:
11 | strategy:
12 | matrix:
13 | platform: [ubuntu-latest]
14 | otp-version: [23, 24, 25, 26]
15 | runs-on: ${{ matrix.platform }}
16 | container:
17 | image: erlang:${{ matrix.otp-version }}
18 | steps:
19 | - name: Checkout
20 | uses: actions/checkout@v3
21 |
22 | - name: Cache Dialyzer PLTs
23 | uses: actions/cache@v3
24 | with:
25 | path: |
26 | ~/.cache/rebar3/rebar3_*_plt
27 | _build/**/*_plt
28 | key: ${{ runner.os }}-otp${{ matrix.otp-version }}-dialyzer-${{ hashFiles('rebar.config') }}
29 | restore-keys: |
30 | ${{ runner.os }}-otp${{ matrix.otp-version }}-dialyzer-
31 |
32 | - name: Compile
33 | run: rebar3 do compile
34 |
35 | - name: Analyze
36 | run: rebar3 do xref, dialyzer
37 |
38 | - name: Test
39 | run: rebar3 do eunit, ct
40 |
--------------------------------------------------------------------------------
/.github/workflows/repolint.yml:
--------------------------------------------------------------------------------
1 | name: Klarna repolint
2 |
3 | on:
4 | push:
5 | branches: [master]
6 | pull_request:
7 | branches: [master]
8 |
9 | jobs:
10 | lint:
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | # Checks-out the repository under $GITHUB_WORKSPACE
15 | - uses: actions/checkout@v2
16 |
17 | - name: Install dependencies
18 | run: npm install repolinter log-symbols
19 |
20 | # @TODO Remove when fixed
21 | - name: Fix missing dependency in repolint
22 | run: npm install is-windows
23 |
24 | - name: Use custom rules
25 | run: wget https://raw.githubusercontent.com/klarna-incubator/meta/master/repolint.json
26 |
27 | - name: Run repolint
28 | run: ./node_modules/.bin/repolinter $GITHUB_WORKSPACE
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | *.beam
3 | ebin/
4 | .idea
5 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 | ## [Unreleased] - yyyy-mm-dd
9 |
10 | Batch insert operations in postgres backend.
11 |
12 | ## [2.2.2] - 2022-11-29
13 |
14 | Do not query memory usage of processes that have huge message queues.
15 |
16 | ## [2.2.1] - 2022-09-12
17 |
18 | Fixed a bug which could cause badrecord errors in system\_monitor\_top.
19 |
20 | ## [2.2.0] - 2021-11-05
21 |
22 | Added support for configuring a module to use to send system_monitor events to
23 | an external destination.
24 |
25 | ## [2.1.0] - 2021-10-20
26 |
27 | Data format of system\_monitor\_top is changed to keep static data between
28 | ticks. Since this gen server is started by a supervisor that allows for some
29 | restarts, you can either let the server crash or stop+start this application.
30 |
31 | ## [2.0.0] - 2021-04-07
32 |
33 | Replace Kafka backend with a configurable one that defaults to Postgres
34 |
35 | ## [1.0.0] - 2020-09-02
36 |
37 | Initial version
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PICS=$(patsubst %.uml,%.png,$(wildcard doc/*.uml))
2 |
3 | .PHONY: test
4 | test:
5 | rebar3 do compile, xref, dialyzer, eunit
6 |
7 | doc/%.png: doc/%.uml # render one PlantUML source into the PNG named by the target
8 | plantuml -tpng $<
9 |
10 | .PHONY: doc
11 | doc: $(PICS)
12 | rebar3 edoc
13 |
14 | .PHONY: dev-start
15 | dev-start:
16 | docker-compose -f docker/docker-compose.yml up -d
17 |
18 | .PHONY: dev-stop
19 | dev-stop:
20 | docker-compose -f docker/docker-compose.yml down --rmi all
21 |
22 | .PHONY: clean
23 | clean:
24 | rm -rf _build
25 |
26 | .PHONY: hex-publish
27 | hex-publish: clean
28 | rebar3 as dev hex publish
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # system_monitor
2 | > Erlang telemetry collector
3 |
4 | [![Build Status][ci-image]][ci-url]
5 | [![License][license-image]][license-url]
6 | [![Developed at Klarna][klarna-image]][klarna-url]
7 |
8 | `system_monitor` is a BEAM VM monitoring and introspection application
9 | that helps in troubleshooting live systems. It collects various
10 | information about Erlang processes and applications.
11 | Unlike `observer`, `system_monitor` does not require
12 | connecting to the monitored system via the Erlang distribution protocol,
13 | and can be used to monitor systems with very tight access
14 | restrictions.
15 |
16 | ## Features
17 |
18 | ### Process top
19 |
20 | Information about the top N Erlang processes that consume the most resources
21 | (such as reductions or memory) or have the longest message queues is
22 | presented on the process top dashboard:
23 |
24 | 
25 |
26 | Historical data can be accessed via standard Grafana time
27 | picker. `status` panel can display important information about the
28 | node state. Pids of the processes on that dashboard are clickable
29 | links that lead to the process history dashboard.
30 |
31 | ### Process history
32 | 
33 |
34 | Process history dashboard displays time series data about a certain
35 | Erlang process. Note that some data points can be missing if the
36 | process didn't consume enough resources to appear in the process top.
37 |
38 | ### Application top
39 | 
40 |
41 | Application top dashboard contains various information aggregated per
42 | OTP application.
43 |
44 | ## Usage example
45 |
46 | In order to integrate `system_monitor` into your system, simply add it
47 | to the release apps. Add the following lines to `rebar.config`:
48 |
49 | ```erlang
50 | {deps, [..., system_monitor]}.
51 |
52 | {relx,
53 | [ {release, {my_release, "1.0.0"},
54 | [kernel, sasl, ..., system_monitor]}
55 | ]}.
56 | ```
57 |
58 | To enable export to Postgres:
59 |
60 | ```erlang
61 | application:load(system_monitor),
62 | application:set_env(system_monitor, callback_mod, system_monitor_pg)
63 | ```
64 |
65 | ### Custom node status
66 |
67 | `system_monitor` can export arbitrary node status information that is
68 | deemed important for the operator. This is done by defining a callback
69 | function that returns an HTML-formatted string (or iolist):
70 |
71 | ```erlang
72 | -module(foo).
73 |
74 | -export([node_status/0]).
75 |
76 | node_status() ->
77 | ["my node type
",
78 | case healthy() of
79 | true -> "UP
";
80 | false -> "DEGRADED
"
81 | end,
82 | io_lib:format("very important value=~p", [very_important_value()])
83 | ].
84 | ```
85 |
86 | This callback then needs to be added to the system_monitor application
87 | environment:
88 |
89 | ```erlang
90 | {system_monitor,
91 | [ {node_status_fun, {foo, node_status}}
92 | ...
93 | ]}
94 | ```
95 |
96 | More information about configurable options is found [here](src/system_monitor.app.src).
97 |
98 | ## How it all works out
99 |
100 | System_monitor will spawn several processes that handle different states:
101 |
102 | * `system_monitor_top`
103 | Collects a certain amount of data from the BEAM for a preconfigured number of processes
104 | * `system_monitor_events`
105 | Subscribes to certain types of preconfigured BEAM events such as: busy_port, long_gc, long_schedule etc
106 | * `system_monitor`
107 | Runs a set of preconfigured `monitors` periodically
108 |
109 | ### What are the preconfigured monitors
110 |
111 | * `check_process_count`
112 | Logs if the process_count passes a certain threshold
113 | * `suspect_procs`
114 | Logs if it detects processes with suspiciously high memory
115 | * `report_full_status`
116 | Gets the state from `system_monitor_top` and produces to a backend module
117 | that implements the `system_monitor_callback` behavior, selected by binding
118 | `callback_mod` in the `system_monitor` application environment to that module.
119 | If `callback_mod` is unbound, this monitor is disabled.
120 | The preconfigured backend is Postgres and is implemented via `system_monitor_pg`.
121 |
122 | `system_monitor_pg` allows for Postgres being temporarily down by storing the stats in its own internal buffer.
123 | This buffer is built with a sliding window that will stop the state from growing too big whenever
124 | Postgres is down for too long. On top of this, `system_monitor_pg` has a built-in load
125 | shedding mechanism that protects itself once the message queue length grows bigger than a certain level.
126 |
127 | ## Local development
128 | A Postgres and Grafana cluster can be spun up using `make dev-start` and stopped using `make dev-stop`.
129 | Start `system_monitor` by calling `rebar3 shell` and start the application with `application:ensure_all_started(system_monitor)`.
130 |
131 | At this point a grafana instance will be available on localhost:3000 with default login "admin" and password
132 | "admin" including some predefined dashboards.
133 |
134 | ## Production setup
135 | For production, a Postgres instance has to be set up in the same way as in the Dockerfile for Postgres, in case one chooses to go with a system_monitor -> Postgres setup.
136 |
137 | ## How to contribute
138 |
139 | See our guide on [contributing](.github/CONTRIBUTING.md).
140 |
141 | ## Release History
142 |
143 | See our [changelog](CHANGELOG.md).
144 |
145 | ## License
146 |
147 | Copyright © 2020-2023 Klarna Bank AB
148 |
149 | For license details, see the [LICENSE](LICENSE) file in the root of this project.
150 |
151 |
152 |
153 | [ci-image]: https://img.shields.io/badge/build-passing-brightgreen?style=flat-square
154 | [ci-url]: https://github.com/klarna/system_monitor/actions
155 | [license-image]: https://img.shields.io/badge/license-Apache%202-blue?style=flat-square
156 | [license-url]: http://www.apache.org/licenses/LICENSE-2.0
157 | [klarna-image]: https://img.shields.io/badge/%20-Developed%20at%20Klarna-black?labelColor=ffb3c7&style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAOCAYAAAAmL5yKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEbAAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAALQAAAAAQAAAtAAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAABCgAwAEAAAAAQAAAA4AAAAA0LMKiwAAAAlwSFlzAABuugAAbroB1t6xFwAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDUuNC4wIj4KICAgPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4KICAgICAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgICAgICAgICAgeG1sbnM6dGlmZj0iaHR0cDovL25zLmFkb2JlLmNvbS90aWZmLzEuMC8iPgogICAgICAgICA8dGlmZjpPcmllbnRhdGlvbj4xPC90aWZmOk9yaWVudGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KTMInWQAAAVBJREFUKBVtkz0vREEUhsdXgo5qJXohkUgQ0fgFNFpR2V5ClP6CQu9PiB6lEL1I7B9A4/treZ47c252s97k2ffMmZkz5869m1JKL/AFbzAHaiRbmsIf4BdaMAZqMFsOXNxXkroKbxCPV5l8yHOJLVipn9/vEreLa7FguSN3S2ynA/ATeQuI8tTY6OOY34DQaQnq9mPCDtxoBwuRxPfAvPMWnARlB12KAi6eLTPruOOP4gcl33O6+Sjgc83DJkRH+h2MgorLzaPy68W48BG2S+xYnmAa1L+nOxEduMH3fgjGFvZeVkANZau68B6CrgJxWosFFpF7iG+h5wKZqwt42qIJtARu/ix+gqsosEq8D35o6R3c7OL4lAnTDljEe9B3Qa2BYzmHemDCt6Diwo6JY7E+A82OnN9HuoBruAQvUQ1nSxP4GVzBDRyBfygf6RW2/gD3NmEv+K/DZgAAAABJRU5ErkJggg==
158 | [klarna-url]: https://github.com/klarna-incubator
159 |
--------------------------------------------------------------------------------
/doc/app_top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klarna/system_monitor/5221666bef6dd18013fb4d9cf9a775e77619b47c/doc/app_top.png
--------------------------------------------------------------------------------
/doc/proc_history.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klarna/system_monitor/5221666bef6dd18013fb4d9cf9a775e77619b47c/doc/proc_history.png
--------------------------------------------------------------------------------
/doc/proc_top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klarna/system_monitor/5221666bef6dd18013fb4d9cf9a775e77619b47c/doc/proc_top.png
--------------------------------------------------------------------------------
/docker/db/10-create_users.sql:
--------------------------------------------------------------------------------
1 | CREATE USER system_monitor WITH PASSWORD 'system_monitor_password';
2 | CREATE USER grafana WITH PASSWORD 'system_monitor_password';
3 |
--------------------------------------------------------------------------------
/docker/db/20-create_db.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE system_monitor;
2 |
--------------------------------------------------------------------------------
/docker/db/30-db_schema.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euxo pipefail  # -o requires an option name; a bare "-o" only prints the option table and leaves pipefail off
3 |
4 | psql -v ON_ERROR_STOP=1 --username "system_monitor" --dbname "system_monitor" <<-EOSQL
5 |
6 | -----------------------------------------------------------------------------------
7 | -- prc table
8 | -----------------------------------------------------------------------------------
9 |
10 | create table if not exists prc (
11 | node text not null,
12 | ts timestamp without time zone not null,
13 | pid text not null,
14 | dreductions double precision not null,
15 | dmemory double precision not null,
16 | reductions bigint not null,
17 | memory bigint not null,
18 | message_queue_len bigint not null,
19 | current_function text,
20 | initial_call text,
21 | registered_name text,
22 | stack_size bigint,
23 | heap_size bigint,
24 | total_heap_size bigint,
25 | current_stacktrace text,
26 | group_leader text
27 | ) partition by range(ts);
28 |
29 | alter table prc owner to system_monitor;
30 | grant insert on table prc to system_monitor;
31 | grant select on table prc to grafana;
32 |
33 | -----------------------------------------------------------------------------------
34 | -- app_top table
35 | -----------------------------------------------------------------------------------
36 | DO \$\$
37 | BEGIN
38 | IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'app_top_unit') THEN
39 | CREATE TYPE app_top_unit AS
40 | enum ('reductions', 'memory', 'processes');
41 | END IF;
42 | END\$\$;
43 |
44 | create table if not exists app_top (
45 | node text,
46 | ts timestamp without time zone not null,
47 | application text,
48 | unit app_top_unit,
49 | value numeric
50 | ) partition by range(ts);
51 |
52 | alter table app_top owner to system_monitor;
53 | grant insert on table app_top to system_monitor;
54 | grant select on table app_top to grafana;
55 |
56 | -----------------------------------------------------------------------------------
57 | -- fun_top table
58 | -----------------------------------------------------------------------------------
59 | DO \$\$
60 | BEGIN
61 | IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'fun_type') THEN
62 | CREATE TYPE fun_type AS
63 | enum ('initial_call', 'current_function');
64 | END IF;
65 | END\$\$;
66 |
67 | create table if not exists fun_top (
68 | node text,
69 | ts timestamp without time zone not null,
70 | fun text,
71 | fun_type fun_type,
72 | num_processes numeric
73 | ) partition by range(ts);
74 |
75 | alter table fun_top owner to system_monitor;
76 | grant insert on table fun_top to system_monitor;
77 | grant select on table fun_top to grafana;
78 |
79 | -----------------------------------------------------------------------------------
80 | -- node_role table
81 | -----------------------------------------------------------------------------------
82 |
83 | create table if not exists node_role (
84 | node text not null,
85 | ts timestamp without time zone not null,
86 | data text
87 | ) partition by range(ts);
88 |
89 | alter table node_role owner to system_monitor;
90 | grant delete on table node_role to system_monitor;
91 | grant select on table node_role to system_monitor;
92 | grant insert on table node_role to system_monitor;
93 | grant select on table node_role to grafana;
94 |
95 | create index if not exists node_role_ts_idx on node_role(ts);
96 |
97 | -----------------------------------------------------------------------------------
98 | -- node table
99 | -----------------------------------------------------------------------------------
100 |
101 | create table if not exists node (
102 | node text not null primary key
103 | );
104 |
105 | alter table node owner to system_monitor;
106 | grant select on table node to system_monitor;
107 | grant insert on table node to system_monitor;
108 | grant select on table node to grafana;
109 |
110 | create or replace function update_nodes()
111 | returns trigger
112 | language plpgsql as
113 | \$\$
114 | begin
115 | insert into node(node) values (NEW.node) on conflict do nothing;
116 | return null;
117 | end;
118 | \$\$;
119 |
120 | drop trigger if exists update_nodes_trigger on node_role;
121 | create trigger update_nodes_trigger
122 | after insert on node_role
123 | for each row
124 | execute procedure update_nodes();
125 |
126 | EOSQL
127 |
--------------------------------------------------------------------------------
/docker/db/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM postgres:12.4
2 |
3 | COPY *.sql /docker-entrypoint-initdb.d/
4 | COPY *.sh /docker-entrypoint-initdb.d/
5 |
--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3.6"
2 |
3 | services:
4 | db:
5 | build: db
6 | ports:
7 | - 5432:5432
8 | environment:
9 | - POSTGRES_PASSWORD=system_monitor_password
10 |
11 | grafana:
12 | build: grafana
13 | depends_on: [db]
14 | ports:
15 | - 3000:3000
16 |
--------------------------------------------------------------------------------
/docker/grafana/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM grafana/grafana:latest
2 |
3 | COPY datasources/*.yml /etc/grafana/provisioning/datasources/
4 | COPY dashboards/*.json /var/lib/grafana/dashboards/
5 | COPY dashboards/*.yml /etc/grafana/provisioning/dashboards/
6 |
--------------------------------------------------------------------------------
/docker/grafana/dashboards/app_top.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": true,
16 | "gnetId": null,
17 | "graphTooltip": 0,
18 | "id": 1,
19 | "iteration": 1596192927850,
20 | "links": [],
21 | "panels": [
22 | {
23 | "aliasColors": {},
24 | "bars": true,
25 | "dashLength": 10,
26 | "dashes": false,
27 | "datasource": "Postgres",
28 | "fill": 1,
29 | "fillGradient": 0,
30 | "gridPos": {
31 | "h": 9,
32 | "w": 12,
33 | "x": 0,
34 | "y": 0
35 | },
36 | "hiddenSeries": false,
37 | "id": 6,
38 | "legend": {
39 | "avg": false,
40 | "current": false,
41 | "max": false,
42 | "min": false,
43 | "show": true,
44 | "total": false,
45 | "values": false
46 | },
47 | "lines": false,
48 | "linewidth": 1,
49 | "links": [],
50 | "nullPointMode": "null",
51 | "options": {
52 | "dataLinks": []
53 | },
54 | "percentage": false,
55 | "pointradius": 5,
56 | "points": false,
57 | "renderer": "flot",
58 | "repeat": null,
59 | "repeatDirection": "h",
60 | "seriesOverrides": [],
61 | "spaceLength": 10,
62 | "stack": true,
63 | "steppedLine": false,
64 | "targets": [
65 | {
66 | "alias": "",
67 | "format": "time_series",
68 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n application AS \"metric\",\n log(avg(value))\nFROM\n app_top, tp\nWHERE\n unit = 'reductions' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY application,time\nORDER BY time",
69 | "refId": "A"
70 | }
71 | ],
72 | "thresholds": [],
73 | "timeFrom": null,
74 | "timeRegions": [],
75 | "timeShift": null,
76 | "title": "reductions per application (log scale)",
77 | "tooltip": {
78 | "shared": false,
79 | "sort": 2,
80 | "value_type": "individual"
81 | },
82 | "type": "graph",
83 | "xaxis": {
84 | "buckets": null,
85 | "mode": "time",
86 | "name": null,
87 | "show": true,
88 | "values": []
89 | },
90 | "yaxes": [
91 | {
92 | "format": "short",
93 | "label": null,
94 | "logBase": 1,
95 | "max": null,
96 | "min": null,
97 | "show": true
98 | },
99 | {
100 | "format": "short",
101 | "label": null,
102 | "logBase": 1,
103 | "max": null,
104 | "min": null,
105 | "show": true
106 | }
107 | ],
108 | "yaxis": {
109 | "align": false,
110 | "alignLevel": null
111 | }
112 | },
113 | {
114 | "aliasColors": {},
115 | "bars": true,
116 | "dashLength": 10,
117 | "dashes": false,
118 | "datasource": "Postgres",
119 | "fill": 1,
120 | "fillGradient": 0,
121 | "gridPos": {
122 | "h": 9,
123 | "w": 12,
124 | "x": 12,
125 | "y": 0
126 | },
127 | "hiddenSeries": false,
128 | "id": 8,
129 | "legend": {
130 | "avg": false,
131 | "current": false,
132 | "max": false,
133 | "min": false,
134 | "show": true,
135 | "total": false,
136 | "values": false
137 | },
138 | "lines": false,
139 | "linewidth": 1,
140 | "links": [],
141 | "nullPointMode": "null",
142 | "options": {
143 | "dataLinks": []
144 | },
145 | "percentage": false,
146 | "pointradius": 5,
147 | "points": false,
148 | "renderer": "flot",
149 | "seriesOverrides": [],
150 | "spaceLength": 10,
151 | "stack": true,
152 | "steppedLine": false,
153 | "targets": [
154 | {
155 | "alias": "",
156 | "format": "time_series",
157 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n application AS \"metric\",\n avg(value)\nFROM\n app_top, tp\nWHERE\n unit = 'memory' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY application,time\nORDER BY time",
158 | "refId": "A"
159 | }
160 | ],
161 | "thresholds": [],
162 | "timeFrom": null,
163 | "timeRegions": [],
164 | "timeShift": null,
165 | "title": "memory per application",
166 | "tooltip": {
167 | "shared": false,
168 | "sort": 2,
169 | "value_type": "individual"
170 | },
171 | "type": "graph",
172 | "xaxis": {
173 | "buckets": null,
174 | "mode": "time",
175 | "name": null,
176 | "show": true,
177 | "values": []
178 | },
179 | "yaxes": [
180 | {
181 | "format": "decbytes",
182 | "label": null,
183 | "logBase": 1,
184 | "max": null,
185 | "min": null,
186 | "show": true
187 | },
188 | {
189 | "format": "short",
190 | "label": null,
191 | "logBase": 1,
192 | "max": null,
193 | "min": null,
194 | "show": true
195 | }
196 | ],
197 | "yaxis": {
198 | "align": false,
199 | "alignLevel": null
200 | }
201 | },
202 | {
203 | "aliasColors": {},
204 | "bars": true,
205 | "dashLength": 10,
206 | "dashes": false,
207 | "datasource": "Postgres",
208 | "fill": 1,
209 | "fillGradient": 0,
210 | "gridPos": {
211 | "h": 9,
212 | "w": 8,
213 | "x": 0,
214 | "y": 9
215 | },
216 | "hiddenSeries": false,
217 | "id": 9,
218 | "legend": {
219 | "avg": false,
220 | "current": false,
221 | "max": false,
222 | "min": false,
223 | "show": true,
224 | "total": false,
225 | "values": false
226 | },
227 | "lines": false,
228 | "linewidth": 1,
229 | "links": [],
230 | "nullPointMode": "null",
231 | "options": {
232 | "dataLinks": []
233 | },
234 | "percentage": false,
235 | "pointradius": 5,
236 | "points": false,
237 | "renderer": "flot",
238 | "seriesOverrides": [],
239 | "spaceLength": 10,
240 | "stack": true,
241 | "steppedLine": false,
242 | "targets": [
243 | {
244 | "alias": "",
245 | "format": "time_series",
246 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n application AS \"metric\",\n avg(value)\nFROM\n app_top, tp\nWHERE\n unit = 'processes' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY application,time\nORDER BY time",
247 | "refId": "A"
248 | }
249 | ],
250 | "thresholds": [],
251 | "timeFrom": null,
252 | "timeRegions": [],
253 | "timeShift": null,
254 | "title": "processes per application",
255 | "tooltip": {
256 | "shared": false,
257 | "sort": 0,
258 | "value_type": "individual"
259 | },
260 | "type": "graph",
261 | "xaxis": {
262 | "buckets": null,
263 | "mode": "time",
264 | "name": null,
265 | "show": true,
266 | "values": []
267 | },
268 | "yaxes": [
269 | {
270 | "format": "short",
271 | "label": null,
272 | "logBase": 1,
273 | "max": null,
274 | "min": null,
275 | "show": true
276 | },
277 | {
278 | "format": "short",
279 | "label": null,
280 | "logBase": 1,
281 | "max": null,
282 | "min": null,
283 | "show": true
284 | }
285 | ],
286 | "yaxis": {
287 | "align": false,
288 | "alignLevel": null
289 | }
290 | },
291 | {
292 | "aliasColors": {
293 | "application_master:main_loop/2": "#eab839",
294 | "dets:open_file_loop2/2": "#0a50a1",
295 | "dist_util:con_loop/2": "#eab839",
296 | "mochiweb_http:request/3": "#e5ac0e",
297 | "prim_inet:accept0/2": "#890f02"
298 | },
299 | "bars": true,
300 | "dashLength": 10,
301 | "dashes": false,
302 | "datasource": "Postgres",
303 | "fill": 1,
304 | "fillGradient": 0,
305 | "gridPos": {
306 | "h": 9,
307 | "w": 7,
308 | "x": 8,
309 | "y": 9
310 | },
311 | "hiddenSeries": false,
312 | "id": 2,
313 | "legend": {
314 | "avg": false,
315 | "current": false,
316 | "max": false,
317 | "min": false,
318 | "show": true,
319 | "total": false,
320 | "values": false
321 | },
322 | "lines": false,
323 | "linewidth": 1,
324 | "links": [],
325 | "nullPointMode": "null",
326 | "options": {
327 | "dataLinks": []
328 | },
329 | "percentage": true,
330 | "pointradius": 5,
331 | "points": false,
332 | "renderer": "flot",
333 | "seriesOverrides": [],
334 | "spaceLength": 10,
335 | "stack": true,
336 | "steppedLine": false,
337 | "targets": [
338 | {
339 | "alias": "",
340 | "format": "time_series",
341 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n fun AS \"metric\",\n avg(num_processes)\nFROM\n fun_top, tp\nWHERE\n fun_type = 'current_function' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY fun,time\nORDER BY time",
342 | "refId": "A"
343 | }
344 | ],
345 | "thresholds": [],
346 | "timeFrom": null,
347 | "timeRegions": [],
348 | "timeShift": null,
349 | "title": "% of processes executing function",
350 | "tooltip": {
351 | "shared": false,
352 | "sort": 1,
353 | "value_type": "individual"
354 | },
355 | "type": "graph",
356 | "xaxis": {
357 | "buckets": null,
358 | "mode": "time",
359 | "name": null,
360 | "show": true,
361 | "values": []
362 | },
363 | "yaxes": [
364 | {
365 | "format": "short",
366 | "label": null,
367 | "logBase": 1,
368 | "max": null,
369 | "min": null,
370 | "show": true
371 | },
372 | {
373 | "format": "short",
374 | "label": null,
375 | "logBase": 1,
376 | "max": null,
377 | "min": null,
378 | "show": true
379 | }
380 | ],
381 | "yaxis": {
382 | "align": false,
383 | "alignLevel": null
384 | }
385 | },
386 | {
387 | "aliasColors": {},
388 | "bars": true,
389 | "dashLength": 10,
390 | "dashes": false,
391 | "datasource": "Postgres",
392 | "fill": 1,
393 | "fillGradient": 0,
394 | "gridPos": {
395 | "h": 9,
396 | "w": 9,
397 | "x": 15,
398 | "y": 9
399 | },
400 | "hiddenSeries": false,
401 | "id": 4,
402 | "legend": {
403 | "avg": false,
404 | "current": false,
405 | "max": false,
406 | "min": false,
407 | "show": true,
408 | "total": false,
409 | "values": false
410 | },
411 | "lines": false,
412 | "linewidth": 1,
413 | "links": [],
414 | "nullPointMode": "null",
415 | "options": {
416 | "dataLinks": []
417 | },
418 | "percentage": true,
419 | "pointradius": 5,
420 | "points": false,
421 | "renderer": "flot",
422 | "seriesOverrides": [],
423 | "spaceLength": 10,
424 | "stack": true,
425 | "steppedLine": false,
426 | "targets": [
427 | {
428 | "alias": "",
429 | "format": "time_series",
430 | "rawSql": "WITH tp(tp) AS (VALUES(($__unixEpochTo()-$__unixEpochFrom())/100))\nSELECT\n (extract(epoch from ts)/tp)::bigint*tp AS time,\n fun AS \"metric\",\n avg(num_processes)\nFROM\n fun_top, tp\nWHERE\n fun_type = 'initial_call' AND\n node = '[[node]]' AND\n ts BETWEEN $__timeFrom() AND $__timeTo()\nGROUP BY fun,time\nORDER BY time",
431 | "refId": "A"
432 | }
433 | ],
434 | "thresholds": [],
435 | "timeFrom": null,
436 | "timeRegions": [],
437 | "timeShift": null,
438 | "title": "% of processes started from this function",
439 | "tooltip": {
440 | "shared": true,
441 | "sort": 0,
442 | "value_type": "individual"
443 | },
444 | "type": "graph",
445 | "xaxis": {
446 | "buckets": null,
447 | "mode": "time",
448 | "name": null,
449 | "show": true,
450 | "values": []
451 | },
452 | "yaxes": [
453 | {
454 | "format": "short",
455 | "label": null,
456 | "logBase": 1,
457 | "max": null,
458 | "min": null,
459 | "show": true
460 | },
461 | {
462 | "format": "short",
463 | "label": null,
464 | "logBase": 1,
465 | "max": null,
466 | "min": null,
467 | "show": true
468 | }
469 | ],
470 | "yaxis": {
471 | "align": false,
472 | "alignLevel": null
473 | }
474 | }
475 | ],
476 | "refresh": false,
477 | "schemaVersion": 22,
478 | "style": "dark",
479 | "tags": [
480 | "erlang",
481 | "top"
482 | ],
483 | "templating": {
484 | "list": [
485 | {
486 | "allValue": null,
487 | "current": {
488 | "tags": [],
489 | "text": "",
490 | "value": ""
491 | },
492 | "datasource": "Postgres",
493 | "definition": "SELECT DISTINCT node FROM node ORDER BY node ASC;",
494 | "hide": 0,
495 | "includeAll": false,
496 | "label": null,
497 | "multi": false,
498 | "name": "node",
499 | "options": [
500 | {
501 | "selected": true,
502 | "text": "",
503 | "value": ""
504 | }
505 | ],
506 | "query": "SELECT DISTINCT node FROM node ORDER BY node ASC;",
507 | "refresh": 1,
508 | "regex": "",
509 | "skipUrlSync": false,
510 | "sort": 0,
511 | "tagValuesQuery": "",
512 | "tags": [],
513 | "tagsQuery": "",
514 | "type": "query",
515 | "useTags": false
516 | }
517 | ]
518 | },
519 | "time": {
520 | "from": "now-30m",
521 | "to": "now"
522 | },
523 | "timepicker": {
524 | "refresh_intervals": [
525 | "5s",
526 | "10s",
527 | "30s",
528 | "1m",
529 | "5m",
530 | "15m",
531 | "30m",
532 | "1h",
533 | "2h",
534 | "1d"
535 | ],
536 | "time_options": [
537 | "5m",
538 | "15m",
539 | "1h",
540 | "6h",
541 | "12h",
542 | "24h",
543 | "2d",
544 | "7d",
545 | "30d"
546 | ]
547 | },
548 | "timezone": "",
549 | "title": "Erlang applications top",
550 | "uid": "tw4QVxniz",
551 | "version": 5
552 | }
553 |
--------------------------------------------------------------------------------
/docker/grafana/dashboards/file.yml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | providers:
4 | - name: 'file'
5 | type: file
6 | disableDeletion: false
7 | editable: true
8 | updateIntervalSeconds: 10
9 | allowUiUpdates: true
10 | options:
11 | path: /var/lib/grafana/dashboards
12 | foldersFromFilesStructure: true
13 |
--------------------------------------------------------------------------------
/docker/grafana/dashboards/proc_history.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": true,
16 | "gnetId": null,
17 | "graphTooltip": 0,
18 | "id": 15,
19 | "iteration": 1596189996375,
20 | "links": [],
21 | "panels": [
22 | {
23 | "aliasColors": {},
24 | "bars": false,
25 | "dashLength": 10,
26 | "dashes": false,
27 | "datasource": "Postgres",
28 | "fill": 1,
29 | "fillGradient": 0,
30 | "gridPos": {
31 | "h": 9,
32 | "w": 24,
33 | "x": 0,
34 | "y": 0
35 | },
36 | "hiddenSeries": false,
37 | "id": 4,
38 | "legend": {
39 | "avg": false,
40 | "current": false,
41 | "max": false,
42 | "min": false,
43 | "show": true,
44 | "total": false,
45 | "values": false
46 | },
47 | "lines": true,
48 | "linewidth": 0,
49 | "links": [],
50 | "nullPointMode": "null as zero",
51 | "options": {
52 | "dataLinks": []
53 | },
54 | "percentage": false,
55 | "pointradius": 0.5,
56 | "points": true,
57 | "renderer": "flot",
58 | "seriesOverrides": [],
59 | "spaceLength": 10,
60 | "stack": false,
61 | "steppedLine": false,
62 | "targets": [
63 | {
64 | "alias": "",
65 | "format": "time_series",
66 | "rawSql": "SELECT\n $__time(ts),\n dreductions\nFROM\n prc\nWHERE\n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n )\nORDER BY ts\n",
67 | "refId": "A"
68 | }
69 | ],
70 | "thresholds": [],
71 | "timeFrom": null,
72 | "timeRegions": [],
73 | "timeShift": null,
74 | "title": "Reductions",
75 | "tooltip": {
76 | "shared": true,
77 | "sort": 0,
78 | "value_type": "individual"
79 | },
80 | "type": "graph",
81 | "xaxis": {
82 | "buckets": null,
83 | "mode": "time",
84 | "name": null,
85 | "show": true,
86 | "values": []
87 | },
88 | "yaxes": [
89 | {
90 | "format": "short",
91 | "label": null,
92 | "logBase": 1,
93 | "max": null,
94 | "min": null,
95 | "show": true
96 | },
97 | {
98 | "format": "short",
99 | "label": null,
100 | "logBase": 1,
101 | "max": null,
102 | "min": null,
103 | "show": true
104 | }
105 | ],
106 | "yaxis": {
107 | "align": false,
108 | "alignLevel": null
109 | }
110 | },
111 | {
112 | "aliasColors": {},
113 | "bars": false,
114 | "dashLength": 10,
115 | "dashes": false,
116 | "datasource": "Postgres",
117 | "fill": 1,
118 | "fillGradient": 0,
119 | "gridPos": {
120 | "h": 9,
121 | "w": 24,
122 | "x": 0,
123 | "y": 9
124 | },
125 | "hiddenSeries": false,
126 | "id": 2,
127 | "legend": {
128 | "avg": false,
129 | "current": false,
130 | "max": false,
131 | "min": false,
132 | "show": true,
133 | "total": false,
134 | "values": false
135 | },
136 | "lines": true,
137 | "linewidth": 0,
138 | "links": [],
139 | "nullPointMode": "null",
140 | "options": {
141 | "dataLinks": []
142 | },
143 | "percentage": false,
144 | "pointradius": 0.5,
145 | "points": true,
146 | "renderer": "flot",
147 | "seriesOverrides": [],
148 | "spaceLength": 10,
149 | "stack": true,
150 | "steppedLine": false,
151 | "targets": [
152 | {
153 | "alias": "",
154 | "format": "time_series",
155 | "rawSql": "SELECT\n $__time(ts),\n stack_size, heap_size, total_heap_size, memory\nFROM\n prc\nWHERE\n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n ) \nORDER BY ts\n",
156 | "refId": "A"
157 | }
158 | ],
159 | "thresholds": [],
160 | "timeFrom": null,
161 | "timeRegions": [],
162 | "timeShift": null,
163 | "title": "Memory",
164 | "tooltip": {
165 | "shared": true,
166 | "sort": 0,
167 | "value_type": "individual"
168 | },
169 | "type": "graph",
170 | "xaxis": {
171 | "buckets": null,
172 | "mode": "time",
173 | "name": null,
174 | "show": true,
175 | "values": []
176 | },
177 | "yaxes": [
178 | {
179 | "format": "decbytes",
180 | "label": null,
181 | "logBase": 1,
182 | "max": null,
183 | "min": null,
184 | "show": true
185 | },
186 | {
187 | "format": "short",
188 | "label": null,
189 | "logBase": 1,
190 | "max": null,
191 | "min": null,
192 | "show": true
193 | }
194 | ],
195 | "yaxis": {
196 | "align": false,
197 | "alignLevel": null
198 | }
199 | },
200 | {
201 | "aliasColors": {},
202 | "bars": false,
203 | "dashLength": 10,
204 | "dashes": false,
205 | "datasource": "Postgres",
206 | "fill": 1,
207 | "fillGradient": 0,
208 | "gridPos": {
209 | "h": 9,
210 | "w": 24,
211 | "x": 0,
212 | "y": 18
213 | },
214 | "hiddenSeries": false,
215 | "id": 6,
216 | "legend": {
217 | "avg": false,
218 | "current": false,
219 | "max": false,
220 | "min": false,
221 | "show": true,
222 | "total": false,
223 | "values": false
224 | },
225 | "lines": true,
226 | "linewidth": 0,
227 | "links": [],
228 | "nullPointMode": "null",
229 | "options": {
230 | "dataLinks": []
231 | },
232 | "percentage": false,
233 | "pointradius": 0.5,
234 | "points": true,
235 | "renderer": "flot",
236 | "seriesOverrides": [],
237 | "spaceLength": 10,
238 | "stack": false,
239 | "steppedLine": false,
240 | "targets": [
241 | {
242 | "alias": "",
243 | "format": "time_series",
244 | "rawSql": "SELECT\n $__time(ts), message_queue_len\nFROM prc\nWHERE\n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n )\nORDER BY ts",
245 | "refId": "A"
246 | }
247 | ],
248 | "thresholds": [],
249 | "timeFrom": null,
250 | "timeRegions": [],
251 | "timeShift": null,
252 | "title": "Message queue",
253 | "tooltip": {
254 | "shared": true,
255 | "sort": 0,
256 | "value_type": "individual"
257 | },
258 | "type": "graph",
259 | "xaxis": {
260 | "buckets": null,
261 | "mode": "time",
262 | "name": null,
263 | "show": true,
264 | "values": []
265 | },
266 | "yaxes": [
267 | {
268 | "format": "short",
269 | "label": null,
270 | "logBase": 1,
271 | "max": null,
272 | "min": null,
273 | "show": true
274 | },
275 | {
276 | "format": "short",
277 | "label": null,
278 | "logBase": 1,
279 | "max": null,
280 | "min": null,
281 | "show": true
282 | }
283 | ],
284 | "yaxis": {
285 | "align": false,
286 | "alignLevel": null
287 | }
288 | },
289 | {
290 | "columns": [],
291 | "datasource": "Postgres",
292 | "fontSize": "100%",
293 | "gridPos": {
294 | "h": 8,
295 | "w": 24,
296 | "x": 0,
297 | "y": 27
298 | },
299 | "id": 8,
300 | "links": [],
301 | "options": {},
302 | "pageSize": null,
303 | "scroll": true,
304 | "showHeader": true,
305 | "sort": {
306 | "col": 0,
307 | "desc": true
308 | },
309 | "styles": [
310 | {
311 | "alias": "Time",
312 | "align": "auto",
313 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
314 | "pattern": "Time",
315 | "type": "date"
316 | },
317 | {
318 | "alias": "Time",
319 | "align": "auto",
320 | "colorMode": null,
321 | "colors": [
322 | "rgba(245, 54, 54, 0.9)",
323 | "rgba(237, 129, 40, 0.89)",
324 | "rgba(50, 172, 45, 0.97)"
325 | ],
326 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
327 | "decimals": 2,
328 | "pattern": "ts",
329 | "thresholds": [],
330 | "type": "date",
331 | "unit": "short"
332 | },
333 | {
334 | "alias": "Current function",
335 | "align": "auto",
336 | "colorMode": null,
337 | "colors": [
338 | "rgba(245, 54, 54, 0.9)",
339 | "rgba(237, 129, 40, 0.89)",
340 | "rgba(50, 172, 45, 0.97)"
341 | ],
342 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
343 | "decimals": 2,
344 | "pattern": "current_function",
345 | "preserveFormat": false,
346 | "thresholds": [],
347 | "type": "string",
348 | "unit": "short"
349 | },
350 | {
351 | "alias": "",
352 | "align": "auto",
353 | "colorMode": null,
354 | "colors": [
355 | "rgba(245, 54, 54, 0.9)",
356 | "rgba(237, 129, 40, 0.89)",
357 | "rgba(50, 172, 45, 0.97)"
358 | ],
359 | "decimals": 2,
360 | "pattern": "/.*/",
361 | "thresholds": [],
362 | "type": "number",
363 | "unit": "short"
364 | }
365 | ],
366 | "targets": [
367 | {
368 | "alias": "",
369 | "format": "table",
370 | "rawSql": "SELECT ts, registered_name, pid, current_function FROM prc \nWHERE \n (ts BETWEEN $__timeFrom() AND $__timeTo()) AND \n node='[[node]]' AND\n ( ('[[pid]]' != '' AND pid='[[pid]]') OR -- pid is set\n ('[[regname]]' != '' AND registered_name='[[regname]]') OR -- registered_name is set\n ('[[pid]]' = '' AND '[[regname]]' = '') -- none is set, show all processes\n )\n",
371 | "refId": "A"
372 | }
373 | ],
374 | "title": "Current function",
375 | "transform": "table",
376 | "type": "table"
377 | }
378 | ],
379 | "refresh": false,
380 | "schemaVersion": 22,
381 | "style": "dark",
382 | "tags": [],
383 | "templating": {
384 | "list": [
385 | {
386 | "allValue": null,
387 | "datasource": "Postgres",
388 | "definition": "",
389 | "hide": 0,
390 | "includeAll": false,
391 | "label": null,
392 | "multi": false,
393 | "name": "node",
394 | "options": [],
395 | "query": "SELECT DISTINCT node FROM node ORDER BY node ASC;",
396 | "refresh": 1,
397 | "regex": "",
398 | "skipUrlSync": false,
399 | "sort": 0,
400 | "tagValuesQuery": "",
401 | "tags": [],
402 | "tagsQuery": "",
403 | "type": "query",
404 | "useTags": false
405 | },
406 | {
407 | "current": {
408 | "text": "undefined",
409 | "value": "undefined"
410 | },
411 | "hide": 0,
412 | "label": "Registered name",
413 | "name": "regname",
414 | "options": [
415 | {
416 | "selected": false,
417 | "text": "",
418 | "value": ""
419 | }
420 | ],
421 | "query": "",
422 | "skipUrlSync": false,
423 | "type": "constant"
424 | },
425 | {
426 | "current": {
427 | "text": "<17096.28649.6884> ",
428 | "value": "<17096.28649.6884> "
429 | },
430 | "hide": 0,
431 | "label": null,
432 | "name": "pid",
433 | "options": [
434 | {
435 | "selected": false,
436 | "text": "",
437 | "value": ""
438 | }
439 | ],
440 | "query": "",
441 | "skipUrlSync": false,
442 | "type": "constant"
443 | }
444 | ]
445 | },
446 | "time": {
447 | "from": "2020-07-31T09:51:33.904Z",
448 | "to": "2020-07-31T10:06:31.905Z"
449 | },
450 | "timepicker": {
451 | "refresh_intervals": [
452 | "5s",
453 | "10s",
454 | "30s",
455 | "1m",
456 | "5m",
457 | "15m",
458 | "30m",
459 | "1h",
460 | "2h",
461 | "1d"
462 | ],
463 | "time_options": [
464 | "5m",
465 | "15m",
466 | "1h",
467 | "6h",
468 | "12h",
469 | "24h",
470 | "2d",
471 | "7d",
472 | "30d"
473 | ]
474 | },
475 | "timezone": "",
476 | "title": "Process history",
477 | "uid": "P2OSAsRmz",
478 | "version": 5
479 | }
480 |
--------------------------------------------------------------------------------
/docker/grafana/dashboards/proc_top.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": true,
16 | "gnetId": null,
17 | "graphTooltip": 0,
18 | "id": 3,
19 | "iteration": 1617800341462,
20 | "links": [],
21 | "panels": [
22 | {
23 | "aliasColors": {},
24 | "bars": false,
25 | "dashLength": 10,
26 | "dashes": false,
27 | "datasource": "Postgres",
28 | "fieldConfig": {
29 | "defaults": {},
30 | "overrides": []
31 | },
32 | "fill": 1,
33 | "fillGradient": 0,
34 | "gridPos": {
35 | "h": 7,
36 | "w": 20,
37 | "x": 0,
38 | "y": 0
39 | },
40 | "hiddenSeries": false,
41 | "id": 8,
42 | "legend": {
43 | "avg": false,
44 | "current": false,
45 | "max": false,
46 | "min": false,
47 | "show": true,
48 | "total": false,
49 | "values": false
50 | },
51 | "lines": true,
52 | "linewidth": 1,
53 | "nullPointMode": "null",
54 | "options": {
55 | "alertThreshold": true
56 | },
57 | "percentage": false,
58 | "pluginVersion": "7.5.3",
59 | "pointradius": 2,
60 | "points": false,
61 | "renderer": "flot",
62 | "seriesOverrides": [],
63 | "spaceLength": 10,
64 | "stack": false,
65 | "steppedLine": false,
66 | "targets": [
67 | {
68 | "format": "time_series",
69 | "group": [],
70 | "metricColumn": "none",
71 | "queryType": "randomWalk",
72 | "rawQuery": false,
73 | "rawSql": "SELECT\n ts AS \"time\",\n dreductions\nFROM prc\nWHERE\n $__timeFilter(ts)\nORDER BY 1",
74 | "refId": "A",
75 | "select": [
76 | [
77 | {
78 | "params": [
79 | "dreductions"
80 | ],
81 | "type": "column"
82 | }
83 | ]
84 | ],
85 | "table": "prc",
86 | "timeColumn": "ts",
87 | "timeColumnType": "timestamp",
88 | "where": [
89 | {
90 | "name": "$__timeFilter",
91 | "params": [],
92 | "type": "macro"
93 | }
94 | ]
95 | }
96 | ],
97 | "thresholds": [],
98 | "timeFrom": null,
99 | "timeRegions": [],
100 | "timeShift": null,
101 | "title": "Time series",
102 | "tooltip": {
103 | "shared": true,
104 | "sort": 0,
105 | "value_type": "individual"
106 | },
107 | "type": "graph",
108 | "xaxis": {
109 | "buckets": null,
110 | "mode": "time",
111 | "name": null,
112 | "show": true,
113 | "values": []
114 | },
115 | "yaxes": [
116 | {
117 | "format": "short",
118 | "label": null,
119 | "logBase": 1,
120 | "max": null,
121 | "min": null,
122 | "show": true
123 | },
124 | {
125 | "format": "short",
126 | "label": null,
127 | "logBase": 1,
128 | "max": null,
129 | "min": null,
130 | "show": true
131 | }
132 | ],
133 | "yaxis": {
134 | "align": false,
135 | "alignLevel": null
136 | }
137 | },
138 | {
139 | "columns": [],
140 | "datasource": "Postgres",
141 | "fieldConfig": {
142 | "defaults": {},
143 | "overrides": []
144 | },
145 | "fontSize": "180%",
146 | "gridPos": {
147 | "h": 7,
148 | "w": 4,
149 | "x": 20,
150 | "y": 0
151 | },
152 | "id": 6,
153 | "links": [],
154 | "pageSize": 3,
155 | "scroll": false,
156 | "showHeader": true,
157 | "sort": {
158 | "col": null,
159 | "desc": false
160 | },
161 | "styles": [
162 | {
163 | "alias": "",
164 | "align": "auto",
165 | "colorMode": null,
166 | "colors": [
167 | "rgba(245, 54, 54, 0.9)",
168 | "rgba(237, 129, 40, 0.89)",
169 | "rgba(50, 172, 45, 0.97)"
170 | ],
171 | "decimals": 2,
172 | "pattern": "/.*/",
173 | "preserveFormat": true,
174 | "sanitize": true,
175 | "thresholds": [],
176 | "type": "string",
177 | "unit": "short"
178 | }
179 | ],
180 | "targets": [
181 | {
182 | "alias": "",
183 | "format": "table",
184 | "group": [],
185 | "metricColumn": "none",
186 | "rawQuery": true,
187 | "rawSql": "SELECT data as status \n FROM node_role WHERE node='[[node]]'\n AND ts=(SELECT max(ts) FROM node_role WHERE node='[[node]]' AND ts > $__timeFrom());",
188 | "refId": "A",
189 | "select": [
190 | [
191 | {
192 | "params": [
193 | "value"
194 | ],
195 | "type": "column"
196 | }
197 | ]
198 | ],
199 | "timeColumn": "time",
200 | "where": [
201 | {
202 | "name": "$__timeFilter",
203 | "params": [],
204 | "type": "macro"
205 | }
206 | ]
207 | }
208 | ],
209 | "transform": "table",
210 | "type": "table-old"
211 | },
212 | {
213 | "columns": [],
214 | "datasource": "Postgres",
215 | "fieldConfig": {
216 | "defaults": {},
217 | "overrides": []
218 | },
219 | "fontSize": "100%",
220 | "gridPos": {
221 | "h": 24,
222 | "w": 24,
223 | "x": 0,
224 | "y": 7
225 | },
226 | "hideTimeOverride": true,
227 | "id": 2,
228 | "links": [],
229 | "maxPerRow": 2,
230 | "pageSize": null,
231 | "repeat": "node",
232 | "repeatDirection": "v",
233 | "scopedVars": {
234 | "node": {
235 | "selected": true,
236 | "text": "nonode@nohost",
237 | "value": "nonode@nohost"
238 | }
239 | },
240 | "scroll": true,
241 | "showHeader": true,
242 | "sort": {
243 | "col": 4,
244 | "desc": true
245 | },
246 | "styles": [
247 | {
248 | "$$hashKey": "object:167",
249 | "alias": "",
250 | "align": "auto",
251 | "colorMode": "cell",
252 | "colors": [
253 | "rgba(50, 172, 45, 0)",
254 | "rgba(237, 129, 40, 0.89)",
255 | "rgba(245, 54, 54, 0.9)"
256 | ],
257 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
258 | "decimals": 2,
259 | "link": true,
260 | "linkTooltip": "Process history",
261 | "linkUrl": "",
262 | "pattern": "dreductions",
263 | "thresholds": [
264 | "1000000",
265 | "100000000"
266 | ],
267 | "type": "number",
268 | "unit": "short"
269 | },
270 | {
271 | "$$hashKey": "object:168",
272 | "alias": "",
273 | "align": "auto",
274 | "colorMode": "cell",
275 | "colors": [
276 | "rgba(5, 5, 5, 0)",
277 | "rgba(237, 129, 40, 0.89)",
278 | "rgba(245, 54, 54, 0.9)"
279 | ],
280 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
281 | "decimals": 0,
282 | "pattern": "message_queue_len",
283 | "thresholds": [
284 | "10",
285 | "100"
286 | ],
287 | "type": "number",
288 | "unit": "short"
289 | },
290 | {
291 | "$$hashKey": "object:169",
292 | "alias": "",
293 | "align": "auto",
294 | "colorMode": "cell",
295 | "colors": [
296 | "rgba(50, 172, 45, 0.97)",
297 | "rgba(237, 129, 40, 0)",
298 | "rgba(245, 54, 54, 0.9)"
299 | ],
300 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
301 | "decimals": 2,
302 | "pattern": "dmemory",
303 | "thresholds": [
304 | "-1000000",
305 | "100000000"
306 | ],
307 | "type": "number",
308 | "unit": "Bps"
309 | },
310 | {
311 | "$$hashKey": "object:170",
312 | "alias": "",
313 | "align": "auto",
314 | "colorMode": null,
315 | "colors": [
316 | "rgba(245, 54, 54, 0.9)",
317 | "rgba(237, 129, 40, 0.89)",
318 | "rgba(50, 172, 45, 0.97)"
319 | ],
320 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
321 | "decimals": 2,
322 | "link": true,
323 | "linkTargetBlank": true,
324 | "linkTooltip": "history",
325 | "linkUrl": "/d/P2OSAsRmz/process-history?orgId=1&var-node=[[node]]&var-pid=${__cell:raw}&var-regname=undefined&from=$__from&to=$__to",
326 | "pattern": "pid",
327 | "preserveFormat": false,
328 | "sanitize": false,
329 | "thresholds": [],
330 | "type": "string",
331 | "unit": "short"
332 | },
333 | {
334 | "$$hashKey": "object:171",
335 | "alias": "Current function ",
336 | "align": "auto",
337 | "colorMode": null,
338 | "colors": [
339 | "rgba(245, 54, 54, 0.9)",
340 | "rgba(237, 129, 40, 0.89)",
341 | "rgba(50, 172, 45, 0.97)"
342 | ],
343 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
344 | "decimals": 2,
345 | "pattern": "current_function",
346 | "preserveFormat": true,
347 | "thresholds": [],
348 | "type": "string",
349 | "unit": "short"
350 | },
351 | {
352 | "$$hashKey": "object:172",
353 | "alias": "Initial call ",
354 | "align": "auto",
355 | "colorMode": null,
356 | "colors": [
357 | "rgba(245, 54, 54, 0.9)",
358 | "rgba(237, 129, 40, 0.89)",
359 | "rgba(50, 172, 45, 0.97)"
360 | ],
361 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
362 | "decimals": 2,
363 | "pattern": "initial_call",
364 | "preserveFormat": true,
365 | "thresholds": [],
366 | "type": "string",
367 | "unit": "short"
368 | },
369 | {
370 | "$$hashKey": "object:173",
371 | "alias": "",
372 | "align": "auto",
373 | "colorMode": "cell",
374 | "colors": [
375 | "rgba(50, 172, 45, 0)",
376 | "rgba(237, 129, 40, 0.89)",
377 | "rgba(245, 54, 54, 0.9)"
378 | ],
379 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
380 | "decimals": 2,
381 | "pattern": "memory",
382 | "thresholds": [
383 | "100000000",
384 | "10000000000"
385 | ],
386 | "type": "number",
387 | "unit": "decbytes"
388 | },
389 | {
390 | "$$hashKey": "object:174",
391 | "alias": "Registered name ",
392 | "align": "auto",
393 | "colorMode": null,
394 | "colors": [
395 | "rgba(245, 54, 54, 0.9)",
396 | "rgba(237, 129, 40, 0.89)",
397 | "rgba(50, 172, 45, 0.97)"
398 | ],
399 | "dateFormat": "YYYY-MM-DD HH:mm:ss",
400 | "decimals": 2,
401 | "pattern": "registered_name",
402 | "thresholds": [],
403 | "type": "string",
404 | "unit": "short"
405 | },
406 | {
407 | "$$hashKey": "object:175",
408 | "alias": "",
409 | "align": "auto",
410 | "colorMode": null,
411 | "colors": [
412 | "rgba(245, 54, 54, 0.9)",
413 | "rgba(237, 129, 40, 0.89)",
414 | "rgba(50, 172, 45, 0.97)"
415 | ],
416 | "decimals": 2,
417 | "pattern": "/.*/",
418 | "thresholds": [],
419 | "type": "number",
420 | "unit": "short"
421 | }
422 | ],
423 | "targets": [
424 | {
425 | "alias": "",
426 | "format": "table",
427 | "group": [],
428 | "metricColumn": "none",
429 | "rawQuery": true,
430 | "rawSql": "SELECT pid, registered_name, reductions, dreductions, message_queue_len, memory, dmemory, stack_size, heap_size, total_heap_size, initial_call, current_function\nFROM prc\nWHERE node='[[node]]'\nAND ts=(SELECT max(ts) FROM node_role WHERE node='[[node]]' AND ts > $__timeFrom() AND ts < $__timeTo());",
431 | "refId": "A",
432 | "select": [
433 | [
434 | {
435 | "params": [
436 | "value"
437 | ],
438 | "type": "column"
439 | }
440 | ]
441 | ],
442 | "timeColumn": "time",
443 | "where": [
444 | {
445 | "name": "$__timeFilter",
446 | "params": [],
447 | "type": "macro"
448 | }
449 | ]
450 | }
451 | ],
452 | "timeFrom": "10s",
453 | "timeShift": "1s",
454 | "title": "Top [[node]]",
455 | "transform": "table",
456 | "transparent": true,
457 | "type": "table-old"
458 | }
459 | ],
460 | "refresh": "5s",
461 | "schemaVersion": 27,
462 | "style": "dark",
463 | "tags": [
464 | "erlang",
465 | "top"
466 | ],
467 | "templating": {
468 | "list": [
469 | {
470 | "allValue": null,
471 | "current": {
472 | "selected": false,
473 | "text": "nonode@nohost",
474 | "value": "nonode@nohost"
475 | },
476 | "datasource": "Postgres",
477 | "definition": "SELECT DISTINCT node FROM node ORDER BY node ASC;",
478 | "description": null,
479 | "error": null,
480 | "hide": 0,
481 | "includeAll": false,
482 | "label": null,
483 | "multi": false,
484 | "name": "node",
485 | "options": [],
486 | "query": "SELECT DISTINCT node FROM node ORDER BY node ASC;",
487 | "refresh": 1,
488 | "regex": "",
489 | "skipUrlSync": false,
490 | "sort": 0,
491 | "tagValuesQuery": "",
492 | "tags": [],
493 | "tagsQuery": "",
494 | "type": "query",
495 | "useTags": false
496 | }
497 | ]
498 | },
499 | "time": {
500 | "from": "now-15m",
501 | "to": "now-2s"
502 | },
503 | "timepicker": {
504 | "nowDelay": "2s",
505 | "refresh_intervals": [
506 | "1s",
507 | "2s",
508 | "5s",
509 | "10s",
510 | ""
511 | ],
512 | "time_options": [
513 | "5m",
514 | "15m",
515 | "1h",
516 | "6h",
517 | "12h",
518 | "24h",
519 | "2d",
520 | "7d",
521 | "30d"
522 | ]
523 | },
524 | "timezone": "",
525 | "title": "Erlang top",
526 | "uid": "V5HktsRik",
527 | "version": 2
528 | }
529 |
--------------------------------------------------------------------------------
/docker/grafana/datasources/postgres.yml:
--------------------------------------------------------------------------------
1 | # config file version
2 | apiVersion: 1
3 |
4 | datasources:
5 | - name: Postgres
6 | type: postgres
7 | url: db:5432
8 | database: system_monitor
9 | user: grafana
10 | secureJsonData:
11 | password: "system_monitor_password"
12 | jsonData:
13 | sslmode: "disable"
14 | timescaledb: false
15 |
--------------------------------------------------------------------------------
/include/system_monitor.hrl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2020 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | -ifndef(SYSTEM_MONITOR_HRL).
17 | -define(SYSTEM_MONITOR_HRL, true).
18 |
19 | -define(APP, system_monitor).
20 |
21 | -type function_top() ::
22 | #{ initial_call => [{mfa(), number()}]
23 | , current_function => [{mfa(), number()}]
24 | }.
25 |
26 | -record(pid_info,
27 | { pid :: pid()
28 | , initial_call :: mfa() | undefined
29 | , registered_name :: atom() | []
30 | , current_function :: mfa() | undefined
31 | , reductions :: integer()
32 | , dreductions :: number() | undefined
33 | , memory :: integer()
34 | , dmemory :: number() | undefined
35 | , message_queue_len :: integer()
36 | , group_leader :: pid()
37 | }).
38 |
39 | -record(erl_top,
40 | { node :: node()
41 | , ts :: integer()
42 | , pid :: string()
43 | , dreductions :: integer()
44 | , dmemory :: integer()
45 | , reductions :: integer()
46 | , memory :: integer() %% bytes
47 | , message_queue_len :: integer()
48 | , current_function :: mfa()
49 | , initial_call :: mfa()
50 | , registered_name :: atom() | []
51 | , stack_size :: integer()
52 | , heap_size :: integer() %% words
53 | , total_heap_size :: integer() %% words
54 | , current_stacktrace :: list()
55 | , group_leader :: list()
56 | }).
57 |
58 | -record(app_top,
59 | { app :: atom()
60 | , red_abs :: integer()
61 | , red_rel :: float()
62 | , memory :: integer()
63 | , processes :: integer()
64 | }).
65 |
66 | -endif.
67 |
--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
1 | %% -*- mode:erlang -*-
2 | {erl_opts,
3 | [debug_info, warnings_as_errors]}.
4 |
5 | {deps,
6 | [ {supervisor3, "1.1.9"}
7 | , {epgsql, "4.7.0"}
8 | ]}.
9 |
10 | {dialyzer, [{warnings, [unknown]}]}.
11 |
12 | {profiles,
13 | [ {test, [ {deps, [{proper, "1.2.0"}]}
14 | , {cover_enabled, true}
15 | ]}
16 | , {dev,
17 | [{plugins, [rebar3_hex]}]}
18 | ]}.
19 |
20 | {xref_checks,
21 | [ undefined_function_calls
22 | , deprecated_function_calls
23 | ]
24 | }.
25 |
--------------------------------------------------------------------------------
/rebar.lock:
--------------------------------------------------------------------------------
1 | {"1.2.0",
2 | [{<<"epgsql">>,{pkg,<<"epgsql">>,<<"4.7.0">>},0},
3 | {<<"supervisor3">>,{pkg,<<"supervisor3">>,<<"1.1.9">>},0}]}.
4 | [
5 | {pkg_hash,[
6 | {<<"epgsql">>, <<"98361A63E49AE14DF57CBDA8495058D42ABD3A316F822D1F990A40259026FE5E">>},
7 | {<<"supervisor3">>, <<"F1A3CC12FB6197526F548E79C9FE2B4AF0C74EFB8A687917B3B1EBE5E9C9368D">>}]},
8 | {pkg_hash_ext,[
9 | {<<"epgsql">>, <<"90B0145D302AB133D957EA46A884E6E37E847E6E47DEAF93104314D2AD8CB5BB">>},
10 | {<<"supervisor3">>, <<"71B177C08F8CAB9EC8ECB81C1AA28A23BBC24AAC4B468C2DB69840229D78D5C4">>}]}
11 | ].
12 |
--------------------------------------------------------------------------------
/src/system_monitor.app.src:
--------------------------------------------------------------------------------
1 | %% -*- mode: erlang -*-
2 | %%--------------------------------------------------------------------------------
3 | %% Copyright 2020 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 | {application, system_monitor,
18 | [ {description, "Monitoring app that exports Erlang VM introspection data to any backend. "
19 | "Defaults to Postgres"}
20 | , {licenses, ["Apache 2.0"]}
21 | , {vsn, "git"}
22 | , {registered, []}
23 | , {modules, []}
24 | , {mod, {system_monitor_app, []}}
25 | , {applications, [kernel, stdlib, supervisor3, epgsql]}
26 | , {env,
27 | [ %% Specifies how many topmost processes should be reported per
28 | %% category (such as `top_memory', `top_reductions', etc.)
29 | {top_num_items, 10}
30 | %% Specifies how often process top should be collected (in ms):
31 | , {top_sample_interval, 2000}
32 | %% Specifies sample size for the approximate metrics, such as
33 | %% 'percentage of processes started by an app', and 'percentage
34 | %% of processes running a function':
35 | , {top_sample_size, 3000}
36 | %% Stop reporting exact process data when the number of
37 | %% processes is above this threshold, in order to avoid
38 | %% hammering the VM with introspection BIFs (this doesn't affect
39 | %% approximate monitors that rely on sampling):
40 | , {top_max_procs, 15000}
41 | %% By default system_monitor tries to collect process dictionary
42 | %% of the topmost processes in order to determine start function
43 | %% more precisely. In theory this can cause problems if process
44 | %% dictionary contains very large amounts of data. This option
45 | %% allows falling back to safety:
46 | , {collect_process_dictionary, true}
47 | %% Don't report values to `app_top' and `fun_top' below the
48 | %% threshold as insignificant:
49 | , {top_significance_threshold,
50 | #{ current_function => 0.01 % 1 percent
51 | , initial_call => 0.01 % 1 percent
52 | , processes => 100 % number of processes
53 | , reductions => 0.01 % 1 percent
54 | , memory => 10000 % words
55 | }}
56 |
57 | %% Specify node-specific healthcheck function as `{module(),
58 | %% function()}', for example: `{my_app, node_status}'. This
59 | %% function should return an HTML-formatted status report:
60 | , {node_status_fun, undefined}
61 | %% List of additional status check functions:
62 | , {status_checks, []}
63 |
64 | %% BEAM event settings:
65 | , {beam_events,
66 | [ busy_port
67 | , busy_dist_port
68 | , {long_gc, 500}
69 | , {long_schedule, 500}
70 | ]}
71 | , {suspect_procs_max_memory, 524288000} %% 500 MB
72 | , {suspect_procs_max_message_queue_len, 5000}
73 | , {suspect_procs_max_total_heap_size, 524288000} %% 500 MB
74 |
75 | %% Don't query memory if message_queue_len is longer than this:
76 | , {mql_limit_for_memory, 100000}
77 | ]}
78 | ]}.
79 |
--------------------------------------------------------------------------------
/src/system_monitor.erl:
--------------------------------------------------------------------------------
1 | %% -*- mode: erlang -*-
2 | %%--------------------------------------------------------------------------------
3 | %% Copyright 2021 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 | %% @private
18 | -module(system_monitor).
19 |
20 | -behaviour(gen_server).
21 |
22 | %%--------------------------------------------------------------------
23 | %% Include files
24 | %%--------------------------------------------------------------------
25 |
26 | -include_lib("system_monitor/include/system_monitor.hrl").
27 | -include_lib("kernel/include/logger.hrl").
28 |
29 | %% API
30 | -export([start_link/0]).
31 |
32 | -export([reset/0]).
33 |
34 | -export([ report_full_status/0
35 | , check_process_count/0
36 | , suspect_procs/0
37 | , erl_top_to_str/1
38 | , start_top/0
39 | , stop_top/0
40 | , fmt_mfa/1
41 | , fmt_stack/1
42 | , node_name/0
43 | ]).
44 |
45 | %% gen_server callbacks
46 | -export([ init/1
47 | , handle_call/3
48 | , handle_cast/2
49 | , handle_info/2
50 | , terminate/2
51 | ]).
52 |
53 | -define(SERVER, ?MODULE).
54 | -define(TICK_INTERVAL, 1000).
55 |
56 | -record(state, { monitors = []
57 | , timer_ref
58 | }).
59 |
60 | %%====================================================================
61 | %% API
62 | %%====================================================================
63 | %%--------------------------------------------------------------------
64 | %% @doc Starts the server
65 | %%--------------------------------------------------------------------
66 | -spec start_link() -> {ok, pid()} | ignore | {error, term()}.
67 | start_link() -> gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
68 |
69 | %%--------------------------------------------------------------------
70 | %% @doc Start printing erlang top to console
71 | %%--------------------------------------------------------------------
72 | -spec start_top() -> ok.
73 | start_top() ->
74 | application:set_env(?APP, top_printing, group_leader()).
75 |
76 | %%--------------------------------------------------------------------
77 | %% @doc Stop printing erlang top to console
78 | %%--------------------------------------------------------------------
79 | -spec stop_top() -> ok.
80 | stop_top() ->
81 | application:set_env(?APP, top_printing, false).
82 |
83 | %%--------------------------------------------------------------------
84 | %% @doc Reset monitors
85 | %%--------------------------------------------------------------------
86 | -spec reset() -> ok.
87 | reset() ->
88 | gen_server:cast(?SERVER, reset).
89 |
90 | %%====================================================================
91 | %% gen_server callbacks
92 | %%====================================================================
93 |
94 | init([]) ->
95 | {ok, Timer} = timer:send_interval(?TICK_INTERVAL, {self(), tick}),
96 | {ok, #state{monitors = init_monitors(), timer_ref = Timer}}.
97 |
98 | handle_call(_Request, _From, State) ->
99 | {reply, {error, unknown_call}, State}.
100 |
101 | handle_cast(reset, State) ->
102 | {noreply, State#state{monitors = init_monitors()}};
103 | handle_cast(_Msg, State) ->
104 | {noreply, State}.
105 |
106 | handle_info({Self, tick}, State) when Self =:= self() ->
107 |   Monitors = [tick_monitor(M) || {_, _, _, _, _} = M <- State#state.monitors],
108 |   {noreply, State#state{monitors = Monitors}};
109 | handle_info(_Info, State) ->
110 |   {noreply, State}.
111 |
112 | %% Run a monitor whose countdown expires on this tick, else just count down.
113 | tick_monitor({Module, Function, RunOnTerminate, TicksReset, 1}) ->
114 |   try
115 |     apply(Module, Function, [])
116 |   catch
117 |     EC:Error:Stack ->
118 |       error_logger:warning_msg(
119 |         "system_monitor ~p crashed:~n~p:~p~nStacktrace: ~p~n",
120 |         [{Module, Function}, EC, Error, Stack])
121 |   end,
122 |   {Module, Function, RunOnTerminate, TicksReset, TicksReset};
123 | tick_monitor({Module, Function, RunOnTerminate, TicksReset, Ticks}) ->
124 |   {Module, Function, RunOnTerminate, TicksReset, Ticks - 1}.
125 |
126 | -spec terminate(term(), #state{}) -> any().
127 | terminate(_Reason, State) ->
128 |   %% One last check: run every monitor flagged RunOnTerminate (entries are 5-tuples).
129 |   [apply(Module, Function, []) ||
130 |     {Module, Function, true, _TicksReset, _Ticks} <- State#state.monitors].
131 |
132 | %%==============================================================================
133 | %% Internal functions
134 | %%==============================================================================
135 |
136 | %%------------------------------------------------------------------------------
137 | %% @doc Returns the list of initiated monitors.
138 | %%------------------------------------------------------------------------------
139 | -spec init_monitors() -> [{module(), function(), boolean(), pos_integer(), pos_integer()}].
140 | init_monitors() ->
141 |   Arm = fun({Mod, Fun, AtTerminate, Period}) -> {Mod, Fun, AtTerminate, Period, Period} end,
142 |   [Arm(Monitor) || {_, _, _, _} = Monitor <- monitors()].
143 |
144 | %%------------------------------------------------------------------------------
145 | %% @doc Returns the list of monitors. The format is
146 | %% {Module, Function, RunMonitorAtTerminate, NumberOfTicks}.
147 | %% RunMonitorAtTerminate determines whether the monitor is to be run in
148 | %% the terminate gen_server callback.
149 | %% ... and NumberOfTicks is the number of ticks between invocations of
150 | %% the monitor in question. So, if NumberOfTicks is 3600, the monitor is
151 | %% to be run once every hour, as there is a tick every second.
152 | %%------------------------------------------------------------------------------
153 | -spec monitors() -> [{module(), function(), boolean(), pos_integer()}].
154 | monitors() ->
155 |   {ok, AdditionalMonitors} = application:get_env(system_monitor, status_checks),
156 |   MaybeReportFullStatusMonitor =
157 |     case system_monitor_callback:is_configured() of
158 |       true ->
159 |         {ok, TopInterval} = application:get_env(?APP, top_sample_interval),
160 |         %% At least one tick: a period of 0 would count down past zero and never fire.
161 |         [{?MODULE, report_full_status, false, max(1, TopInterval div 1000)}];
162 |       false -> []
163 |     end,
164 |   [{?MODULE, check_process_count, true, 2},
165 |    {?MODULE, suspect_procs, true, 5}]
166 |     ++ MaybeReportFullStatusMonitor
167 |     ++ AdditionalMonitors.
168 |
169 | %%------------------------------------------------------------------------------
170 | %% Monitor for number of processes
171 | %%------------------------------------------------------------------------------
172 |
173 | %%------------------------------------------------------------------------------
174 | %% @doc Check the number of processes and log a warning with the
175 | %% process count if it is above the `top_max_procs' threshold.
176 | %%------------------------------------------------------------------------------
177 | -spec check_process_count() -> ok.
178 | check_process_count() ->
179 |   {ok, Limit} = application:get_env(?APP, top_max_procs),
180 |   Count = erlang:system_info(process_count),
181 |   if Count > Limit ->
182 |       ?LOG_WARNING(
183 |         "Abnormal process count (~p).~n"
184 |         , [Count]
185 |         , #{domain => [system_monitor]}
186 |         );
187 |     true -> ok  % at or below the limit: nothing to report
188 |   end.
189 |
190 |
191 | %%------------------------------------------------------------------------------
192 | %% Monitor for processes with suspect stats
193 | %%------------------------------------------------------------------------------
194 | suspect_procs() ->
195 |   {_TS, ProcTop} = system_monitor_top:get_proc_top(),
196 |   %% Each limit may be unset, in which case it is read as `undefined'.
197 |   Limits = list_to_tuple([application:get_env(?APP, Key, undefined)
198 |                           || Key <- [suspect_procs_max_memory,
199 |                                      suspect_procs_max_message_queue_len,
200 |                                      suspect_procs_max_total_heap_size]]),
201 |   Suspects = [Proc || Proc <- ProcTop, is_suspect_proc(Proc, Limits)],
202 |   lists:foreach(fun log_suspect_proc/1, Suspects).
203 |
204 | is_suspect_proc(Proc, {MaxMemory, MaxMqLen, MaxTotalHeapSize}) ->
205 |   #erl_top{memory = Memory,
206 |            message_queue_len = MessageQueueLen,
207 |            total_heap_size = TotalHeapSize} = Proc,
208 |   %% A limit of `undefined' disables the corresponding check.
209 |   exceeds(MaxMemory, Memory)
210 |     orelse exceeds(MaxMqLen, MessageQueueLen)
211 |     orelse exceeds(MaxTotalHeapSize, TotalHeapSize).
212 |
213 | %% True when Limit is set and Value has reached or passed it.
214 | exceeds(undefined, _Value) ->
215 |   false;
216 | exceeds(Limit, Value) ->
217 |   Value >= Limit.
218 |
219 | log_suspect_proc(Proc) ->
220 |   %% Render the process first so that the log call only interpolates a string.
221 |   Rendered = erl_top_to_str(Proc),
222 |   ?LOG_WARNING("Suspect Proc~n~s", [Rendered], #{domain => [system_monitor]}).
223 |
224 | %%------------------------------------------------------------------------------
225 | %% @doc Report top processes
226 | %%------------------------------------------------------------------------------
227 | -spec report_full_status() -> ok.
228 | report_full_status() ->
229 |   %% The same `TS' is stamped on every report produced for this
230 |   %% interval, so it can serve as the key when looking the related
231 |   %% events up later.
232 |   {TS, ProcTop} = system_monitor_top:get_proc_top(),
233 |   system_monitor_callback:produce(proc_top, ProcTop),
234 |   report_app_top(TS),
235 |   %% The node status goes out last: it "seals" the report for this
236 |   %% interval.
237 |   Status = iolist_to_binary(node_status()),
238 |   system_monitor_callback:produce(node_role, [{node_role, node_name(), TS, Status}]).
239 |
240 | %% Output of the configured `node_status_fun'; empty when the function
241 | %% is not configured or raises.
242 | node_status() ->
243 |   case application:get_env(?APP, node_status_fun) of
244 |     {ok, {Module, Function}} ->
245 |       try Module:Function()
246 |       catch _:_ -> <<>>
247 |       end;
248 |     _ ->
249 |       <<>>
250 |   end.
251 |
252 | %%------------------------------------------------------------------------------
253 | %% @doc Calculate reductions per application.
254 | %%------------------------------------------------------------------------------
255 | -spec report_app_top(integer()) -> ok.
256 | report_app_top(TS) ->
257 |   Publish = fun(Record, Tag, Values) -> present_results(Record, Tag, Values, TS) end,
258 |   %% Per-application figures: reductions, memory and process count.
259 |   Publish(app_top, reductions, system_monitor_top:get_abs_app_top()),
260 |   Publish(app_top, memory, system_monitor_top:get_app_memory()),
261 |   Publish(app_top, processes, system_monitor_top:get_app_processes()),
262 |   %% Per-function figures; both keys must be present in the returned map.
263 |   #{ current_function := CurrentFunction
264 |    , initial_call := InitialCall
265 |    } = system_monitor_top:get_function_top(),
266 |   Publish(fun_top, current_function, CurrentFunction),
267 |   Publish(fun_top, initial_call, InitialCall),
268 |   ok.
269 |
270 | %%--------------------------------------------------------------------
271 | %% @doc Push app_top or fun_top information to the configured destination
272 | %%--------------------------------------------------------------------
273 | present_results(Record, Tag, Values, TS) ->
274 |   {ok, Thresholds} = application:get_env(?APP, top_significance_threshold),
275 |   %% Tags without a configured threshold are compared against 0.
276 |   Threshold = maps:get(Tag, Thresholds, 0),
277 |   %% Keep only the `{Key, Val}' pairs strictly above the threshold and
278 |   %% wrap each one into the tuple handed to the callback.
279 |   Significant =
280 |     [{Record, node_name(), TS, Key, Tag, Val}
281 |      || {Key, Val} <- Values, Val > Threshold],
282 |   system_monitor_callback:produce(Record, Significant).
283 |
284 | node_name() ->
285 | application:get_env(?APP, node_name, node()).
286 |
287 | %%--------------------------------------------------------------------
288 | %% @doc Formats "the interesting parts" of erl_top as a printable string
289 | %%--------------------------------------------------------------------
290 | erl_top_to_str(Proc) ->
291 |   #erl_top{pid = Pid,
292 |            registered_name = RegisteredName,
293 |            initial_call = InitialCall,
294 |            current_function = CurrentFunction,
295 |            current_stacktrace = CurrentStack,
296 |            memory = Memory,
297 |            message_queue_len = MessageQueueLength,
298 |            stack_size = StackSize,
299 |            heap_size = HeapSize,
300 |            total_heap_size = TotalHeapSize} = Proc,
301 |   %% Heap sizes are multiplied by the word size before being passed to fmt_mem/1.
302 |   WordSize = erlang:system_info(wordsize),
303 |   %% One `key=value' line per field, followed by the formatted stack.
304 |   io_lib:format(
305 |     "registered_name=~p~n"
306 |     "offending_pid=~s~n"
307 |     "initial_call=~s~n"
308 |     "memory=~p (~s)~n"
309 |     "message_queue_len=~p~n"
310 |     "stack_size=~p~n"
311 |     "heap_size=~p (~s)~n"
312 |     "total_heap_size=~p (~s)~n"
313 |     "current_function=~s~n"
314 |     "current_stack:~n~s",
315 |     [ RegisteredName
316 |     , Pid
317 |     , fmt_mfa(InitialCall)
318 |     , Memory, fmt_mem(Memory)
319 |     , MessageQueueLength
320 |     , StackSize
321 |     , HeapSize, fmt_mem(WordSize * HeapSize)
322 |     , TotalHeapSize, fmt_mem(WordSize * TotalHeapSize)
323 |     , fmt_mfa(CurrentFunction)
324 |     , fmt_stack(CurrentStack)
325 |     ]).
326 |
327 | fmt_mem(Mem) ->
328 |   Units = [{1, "Bytes"}, {1024, "KB"}, {1024 * 1024, "MB"}, {1024 * 1024 * 1024, "GB"}],
329 |   %% First unit in which Mem is at most 1024; anything larger is shown in TB.
330 |   Fits = fun({Scale, _Name}) -> Mem =< Scale * 1024 end,
331 |   {Divisor, Unit} = find_first(Fits, Units, {1024 * 1024 * 1024 * 1024, "TB"}),
332 |   io_lib:format("~.1f ~s", [Mem / Divisor, Unit]).
333 |
334 | fmt_stack(CurrentStack) ->
335 |   lists:map(fun(Frame) -> [fmt_mfa(Frame), "\n"] end, CurrentStack).
336 |
337 | fmt_mfa({Mod, Fun, Arity, Prop}) ->
338 |   %% A 4-tuple carries a property list: append the `line' entry when it has one.
339 |   case proplists:get_value(line, Prop) of
340 |     undefined -> fmt_mfa({Mod, Fun, Arity});
341 |     Line      -> io_lib:format("~s:~s/~p (Line ~p)", [Mod, Fun, Arity, Line])
342 |   end;
343 | fmt_mfa({Mod, Fun, Arity}) ->
344 |   io_lib:format("~s:~s/~p", [Mod, Fun, Arity]);
345 | fmt_mfa(Other) ->
346 |   %% Anything that is not a 3- or 4-tuple is printed as a plain term.
347 |   io_lib:format("~p", [Other]).
348 |
%% Return the first element of List for which Pred returns `true',
%% or Default when no element satisfies it.
-spec find_first(fun((any()) -> boolean()), [T], Default) -> T | Default.
find_first(Pred, [], Default) when is_function(Pred, 1) ->
  Default;
find_first(Pred, [Candidate | Rest], Default) ->
  case Pred(Candidate) of
    true  -> Candidate;
    false -> find_first(Pred, Rest, Default)
  end.
355 |
--------------------------------------------------------------------------------
/src/system_monitor_app.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2020 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | -module(system_monitor_app).
17 |
18 | -behaviour(application).
19 |
20 | -export([start/2, stop/1]).
21 |
%% application callback: boot the top-level supervisor. Both
%% arguments are ignored.
start(_StartType, _Args) ->
  system_monitor_sup:start_link().

%% application callback: nothing to clean up.
stop(_AppState) ->
  ok.
27 |
--------------------------------------------------------------------------------
/src/system_monitor_callback.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2021 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 |
17 | -module(system_monitor_callback).
18 |
19 | -export([ produce/2
20 | , is_configured/0
21 | , get_callback_mod/0
22 | ]).
23 |
24 | -include_lib("system_monitor/include/system_monitor.hrl").
25 |
26 | -callback produce(atom(), list()) -> ok.
27 |
%% Forward the events to `produce/2' of the configured callback
%% module. No check is made here that a module is configured; see
%% is_configured/0.
produce(Type, Events) ->
  Mod = get_callback_mod(),
  Mod:produce(Type, Events).
30 |
-compile({inline, [get_callback_mod/0]}).
%% Read the `callback_mod' application environment variable;
%% `undefined' when it is not set.
get_callback_mod() ->
  case application:get_env(?APP, callback_mod) of
    {ok, Mod} -> Mod;
    undefined -> undefined
  end.
34 |
%% `true' when a callback module is configured (i.e. the
%% `callback_mod' setting is anything other than `undefined').
is_configured() ->
  case get_callback_mod() of
    undefined -> false;
    _Mod      -> true
  end.
37 |
--------------------------------------------------------------------------------
/src/system_monitor_events.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2020 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | %%% @doc
17 | %%% Print BEAM VM events to the logs
18 | %%%
19 | %%% @end
20 | -module(system_monitor_events).
21 |
22 | -behaviour(gen_server).
23 |
24 | -include("system_monitor.hrl").
25 | -include_lib("kernel/include/logger.hrl").
26 |
27 | -export([start_link/0]).
28 |
29 | %% gen_server callbacks
30 | -export([ init/1
31 | , handle_call/3
32 | , handle_cast/2
33 | , handle_info/2
34 | , terminate/2
35 | ]).
36 |
%%--------------------------------------------------------------------
%% @doc
%% Start the event server, linked to the caller and registered
%% locally under the module name.
%% @end
%%--------------------------------------------------------------------
-spec start_link() -> {ok, pid()}.
start_link() ->
  ServerName = {local, ?MODULE},
  gen_server:start_link(ServerName, ?MODULE, [], []).
45 |
46 | %%====================================================================
47 | %% gen_server callbacks
48 | %%====================================================================
49 |
%% Subscribe this process to the BEAM system monitor events. The
%% server keeps no state, hence the empty tuple.
init([]) ->
  ok = setup_system_monitor(),
  {ok, {}}.
53 |
%% No synchronous API: every call is answered with an error.
handle_call(_Request, _From, State) ->
  Reply = {error, unknown_call},
  {reply, Reply, State}.

%% No asynchronous API: casts are ignored.
handle_cast(_Cast, State) ->
  {noreply, State}.
59 |
%% A `monitor' message from the BEAM system monitor is logged together
%% with whatever system_monitor_top knows about the process, and then
%% forwarded to the optional `external_monitoring' module. Any other
%% message is ignored.
handle_info({monitor, PidOrPort, EventKind, Info}, State) ->
  %% Both texts are computed up front, independently of whether the
  %% logger macro ends up emitting the message.
  ReferenceTxt = data_for_reference(PidOrPort),
  InfoTxt = format_system_event_info(Info),
  ?LOG_INFO( "sysmon type=~p reference=~p~n~s~n~s"
           , [EventKind, PidOrPort, InfoTxt, ReferenceTxt]
           , #{domain => [system_monitor]}
           ),
  notify_external_monitoring(EventKind, Info),
  {noreply, State};
handle_info(_Other, State) ->
  {noreply, State}.

%% Call `system_monitor_event/2' of the module configured under
%% `external_monitoring', if any.
notify_external_monitoring(EventKind, Info) ->
  case application:get_env(?APP, external_monitoring) of
    undefined -> ok;
    {ok, Mod} -> Mod:system_monitor_event(EventKind, Info)
  end.
75 |
%% Nothing to release on shutdown.
terminate(_Why, _ServerState) ->
  ok.
78 |
79 | %%==============================================================================
80 | %% Internal functions
81 | %%==============================================================================
82 |
%%--------------------------------------------------------------------
%% @doc: Register the calling process as the receiver of the BEAM
%% system monitor events listed in the `beam_events' application
%% environment variable. Crashes with `badmatch' if that variable is
%% not set.
%%--------------------------------------------------------------------
-spec setup_system_monitor() -> ok.
setup_system_monitor() ->
  {ok, MonitorOpts} = application:get_env(?APP, beam_events),
  _Previous = erlang:system_monitor(self(), MonitorOpts),
  ok.
92 |
%% Extra context for the log line: for a pid, the formatted
%% system_monitor_top entry (or a note that it has none); for
%% anything else, an empty string.
data_for_reference(Pid) when is_pid(Pid) ->
  describe_top_entry(system_monitor_top:get_proc_top(Pid));
data_for_reference(_Port) ->
  "".

%% Helper for data_for_reference/1.
describe_top_entry(false) ->
  "Proc not in top";
describe_top_entry(ProcErlTop) ->
  system_monitor:erl_top_to_str(ProcErlTop).
100 |
%% Format the `Info' part of a system monitor message.
%% A list is rendered item by item (`Key=Value ' for pairs, `Value '
%% otherwise); note that the rendered items come out in the reverse
%% order of the input list. A bare port, pid or other term is wrapped
%% as `port', `pid_2' or `info' respectively and rendered the same way.
-spec format_system_event_info(term()) -> io_lib:chars().
format_system_event_info(Info) when is_list(Info) ->
  Render =
    fun({Key, Value}) -> io_lib:format("~p=~p ", [Key, Value]);
       (Value)        -> io_lib:format("~p ", [Value])
    end,
  lists:reverse(lists:map(Render, Info));
format_system_event_info(Port) when is_port(Port) ->
  format_system_event_info([{port, Port}]);
format_system_event_info(Pid) when is_pid(Pid) ->
  format_system_event_info([{pid_2, Pid}]);
format_system_event_info(Term) ->
  format_system_event_info([{info, Term}]).
117 |
118 | %%%_* Emacs ============================================================
119 | %%% Local Variables:
120 | %%% allout-layout: t
121 | %%% erlang-indent-level: 2
122 | %%% End:
123 |
--------------------------------------------------------------------------------
/src/system_monitor_pg.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2021 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | -module(system_monitor_pg).
17 |
18 | -behaviour(gen_server).
19 | -export([ start_link/0
20 | , init/1
21 | , handle_continue/2
22 | , handle_call/3
23 | , handle_info/2
24 | , handle_cast/2
25 | , format_status/1
26 | , terminate/2
27 | ]).
28 |
29 | -if(?OTP_RELEASE < 27).
30 | -export([ format_status/2
31 | ]).
32 | -endif.
33 |
34 | -behaviour(system_monitor_callback).
35 | -export([ produce/2 ]).
36 |
37 | -include_lib("system_monitor/include/system_monitor.hrl").
38 | -include_lib("kernel/include/logger.hrl").
39 |
40 | -define(SERVER, ?MODULE).
41 | -define(FIVE_SECONDS, 5000).
42 | -define(ONE_HOUR, 60*60*1000).
43 |
44 | %%%_* API =================================================================
%% system_monitor_callback implementation: hand the events over to
%% the registered server asynchronously.
produce(Type, Events) ->
  Request = {produce, Type, Events},
  gen_server:cast(?SERVER, Request).
47 |
48 | %%%_* Callbacks =================================================================
%% Start the server, linked to the caller and registered locally as
%% ?SERVER.
start_link() ->
  ServerName = {local, ?SERVER},
  gen_server:start_link(ServerName, ?MODULE, [], []).
51 |
%% Trap exits so that the death of a linked process arrives as an
%% 'EXIT' message (see handle_info/2), and defer the actual set-up to
%% handle_continue(start_pg, _).
init(_Args) ->
  _WasTrapping = erlang:process_flag(trap_exit, true),
  InitialState = #{},
  {ok, InitialState, {continue, start_pg}}.
55 |
%% First connection attempt. When it fails a retry is scheduled in
%% five seconds; in either case the hourly `mk_partitions' tick is
%% started and the state gets its `connection' and `buffer' keys.
handle_continue(start_pg, State) ->
  Conn = initialize(),
  schedule_retry_unless_connected(Conn),
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State#{connection => Conn, buffer => buffer_new()}}.

%% Helper for handle_continue/2.
schedule_retry_unless_connected(undefined) ->
  timer:send_after(?FIVE_SECONDS, reinitialize);
schedule_retry_unless_connected(_Conn) ->
  ok.
66 |
%% No synchronous API: every call is simply acknowledged with `ok'.
handle_call(_Request, _Caller, State) ->
  {reply, ok, State}.
69 |
%% Out-of-band messages:
%%
%%   {'EXIT', Pid, Reason} - a linked process died (exits are trapped
%%                           in init/1). If it was the current
%%                           connection, or there is no connection at
%%                           the moment, a reconnect is scheduled.
%%   mk_partitions         - hourly tick; maintains the partitions
%%                           when connected, and is always re-armed.
%%   reinitialize          - try to (re)connect.
%%
%% `reinitialize' is scheduled from more than one place
%% (handle_continue/2 and both 'EXIT' clauses below), so one of those
%% timers can fire after a connection has already been established.
%% Such a stale message is ignored; acting on it would open a second
%% connection and drop the only reference to the first.
%%
%% Anything else is logged and dropped instead of crashing the server
%% (a crash would throw away the buffered events).
handle_info({'EXIT', Conn, _Reason}, #{connection := Conn} = State) ->
  timer:send_after(?FIVE_SECONDS, reinitialize),
  {noreply, State#{connection => undefined}};
handle_info({'EXIT', _Pid, _Reason}, #{connection := undefined} = State) ->
  timer:send_after(?FIVE_SECONDS, reinitialize),
  {noreply, State};
handle_info({'EXIT', _Pid, normal}, State) ->
  {noreply, State};
handle_info(mk_partitions, #{connection := undefined} = State) ->
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(mk_partitions, #{connection := Conn} = State) ->
  mk_partitions(Conn),
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(reinitialize, #{connection := undefined} = State) ->
  {noreply, State#{connection => initialize()}};
handle_info(reinitialize, State) ->
  %% Stale retry timer: we are already connected.
  {noreply, State};
handle_info(Unexpected, State) ->
  ?LOG_WARNING( "Unexpected message: ~p"
              , [Unexpected]
              , #{domain => [system_monitor]}
              ),
  {noreply, State}.
87 |
%% Incoming events.
%% Without a connection they are appended to the buffer. With a
%% connection, the buffered events plus the new one are written out
%% and the buffer is reset - unless this process' own message queue
%% is longer than `max_message_queue_len' (default 1000), in which
%% case the new event is dropped and the buffer left untouched.
handle_cast({produce, Type, Events}, #{connection := undefined, buffer := Buffer} = State) ->
  Buffered = buffer_add(Buffer, {Type, Events}),
  {noreply, State#{buffer => Buffered}};
handle_cast({produce, Type, Events}, #{connection := Conn, buffer := Buffer} = State) ->
  QueueLimit = application:get_env(?APP, max_message_queue_len, 1000),
  {message_queue_len, QueueLen} = process_info(self(), message_queue_len),
  case QueueLen > QueueLimit of
    true ->
      {noreply, State};
    false ->
      Pending = buffer_to_list(buffer_add(Buffer, {Type, Events})),
      Flush = fun({PendingType, PendingEvents}) ->
                  run_query(Conn, PendingType, PendingEvents)
              end,
      lists:foreach(Flush, Pending),
      {noreply, State#{buffer => buffer_new()}}
  end.
101 |
%% gen_server format_status/1 callback. When the status map carries a
%% `reason' (and a `state'), the event buffer inside the state is
%% replaced by an empty one before it is reported; any other status
%% map is returned unchanged.
format_status(#{reason := _, state := State} = Status) ->
  Trimmed = State#{buffer => buffer_new()},
  Status#{state => Trimmed};
format_status(Status) ->
  Status.
106 |
-if(?OTP_RELEASE < 27).
%% Legacy gen_server format_status/2 callback.
%% On `terminate' the event buffer is replaced by an empty one; on
%% `normal' the state is reported as is.
%% TODO: remove once OTP-25 is the oldest supported OTP version
format_status(terminate, [_PDict, State]) ->
  State#{buffer => buffer_new()};
format_status(normal, [_PDict, State]) ->
  [{data, [{"State", State}]}].
-endif.
114 |
%% Close the database connection, if there is one.
%%
%% The state may not have a `connection' key yet: init/1 returns an
%% empty map and the key is only added once
%% handle_continue(start_pg, _) has returned. If that callback
%% crashes, terminate/2 is invoked with the empty map; it must not
%% fail with `function_clause' there, or the original crash reason
%% would be obscured. Hence the catch-all second clause.
terminate(_Reason, #{connection := Conn}) when Conn =/= undefined ->
  epgsql:close(Conn);
terminate(_Reason, _State) ->
  ok.
119 |
120 | %%%_* Internal buffer functions ===============================================
%% An empty buffer: `{Length, Queue}' with the length tracked
%% alongside the queue.
buffer_new() ->
  Empty = queue:new(),
  {0, Empty}.
123 |
%% Append Element to the buffer. Once the buffer holds
%% `max_buffer_size' (default 1000) elements, the oldest element is
%% dropped to make room, so the length stays constant.
buffer_add({Len, Queue}, Element) ->
  Limit = application:get_env(?APP, max_buffer_size, 1000),
  if
    Len >= Limit ->
      {Len, queue:in(Element, queue:drop(Queue))};
    true ->
      {Len + 1, queue:in(Element, Queue)}
  end.
130 |
%% The buffered elements, oldest first.
buffer_to_list({_Len, Queue}) ->
  queue:to_list(Queue).
133 |
134 | %%%_* Internal functions ======================================================
%% Insert all Events of the given Type as one batch.
%% The statement for the type is parsed once and executed with the
%% parameters of each event. Any result that is not `{ok, _}' or
%% `{ok, _, _}' deliberately crashes the caller.
run_query(Conn, Type, Events) ->
  {ok, Statement} = epgsql:parse(Conn, query(Type)),
  ToBatchItem = fun(Event) -> {Statement, params(Type, Event)} end,
  Batch = lists:map(ToBatchItem, Events),
  %% Crash on failure
  AssertOk = fun({ok, _})    -> ok;
                ({ok, _, _}) -> ok
             end,
  lists:foreach(AssertOk, epgsql:execute_batch(Conn, Batch)).
146 |
%% Open a database connection and make sure the partitions exist.
%% Returns the connection, or `undefined' when connecting failed (the
%% failure, including its reason, is logged as a warning).
initialize() ->
  case connect() of
    {ok, Conn} ->
      mk_partitions(Conn),
      Conn;
    {error, Reason} ->
      log_failed_connection(Reason),
      undefined
  end.

%% Try to connect with the configured options.
%% Returns `{ok, Conn}' or `{error, Reason}'. Anything epgsql returns
%% that is not `{ok, Conn}' is treated as a failure; the reason is
%% kept so that it can be reported.
connect() ->
  case epgsql:connect(connect_options()) of
    {ok, Conn} ->
      {ok, Conn};
    {error, Reason} ->
      {error, Reason};
    Other ->
      {error, Other}
  end.

%% Connection options, read from the application environment with the
%% defaults shown below.
connect_options() ->
  #{host => application:get_env(?APP, db_hostname, "localhost"),
    port => application:get_env(?APP, db_port, 5432),
    username => application:get_env(?APP, db_username, "system_monitor"),
    password => application:get_env(?APP, db_password, "system_monitor_password"),
    database => application:get_env(?APP, db_name, "system_monitor"),
    timeout => application:get_env(?APP, db_connection_timeout, 5000),
    codecs => []}.

%% Only the failure reason is logged; the connection options (which
%% contain the password) are never included.
log_failed_connection(Reason) ->
  ?LOG_WARNING( "Failed to open connection to the DB: ~p"
              , [Reason]
              , #{domain => [system_monitor]}
              ).
176 |
%% Partition maintenance.
%% Creates the daily partitions for today and the next
%% `partition_days_ahead' days (default 10), and drops the partitions
%% of the ten days immediately preceding the `partition_days_behind'
%% (default 10) retention window. Days are Gregorian day numbers based
%% on the local date.
%%
%% The settings are read through ?APP, like every other
%% application:get_env call in this module.
mk_partitions(Conn) ->
  DaysAhead = application:get_env(?APP, partition_days_ahead, 10),
  DaysBehind = application:get_env(?APP, partition_days_behind, 10),
  Today = calendar:date_to_gregorian_days(date()),
  ToCreate = lists:seq(Today, Today + DaysAhead),
  %% Delete 10 days older than partition_days_behind config
  ToDelete = lists:seq(Today - DaysBehind - 10, Today - DaysBehind - 1),
  lists:foreach(fun(Day) -> create_partition_tables(Conn, Day) end, ToCreate),
  lists:foreach(fun(Day) -> delete_partition_tables(Conn, Day) end, ToDelete).
186 |
%% Create the partition (and its `ts' index) covering the given
%% Gregorian day for each of the partitioned tables. Crashes unless
%% both statements of every query succeed.
create_partition_tables(Conn, Day) ->
  From = to_postgres_date(Day),
  To = to_postgres_date(Day + 1),
  Create =
    fun(Table) ->
        Sql = create_partition_query(Table, Day, From, To),
        [{ok, [], []}, {ok, [], []}] = epgsql:squery(Conn, Sql)
    end,
  lists:foreach(Create, [<<"prc">>, <<"app_top">>, <<"fun_top">>, <<"node_role">>]).
196 |
%% Drop the partition of the given Gregorian day for each of the
%% partitioned tables. Crashes unless every statement succeeds.
delete_partition_tables(Conn, Day) ->
  Drop =
    fun(Table) ->
        Sql = delete_partition_query(Table, Day),
        {ok, [], []} = epgsql:squery(Conn, Sql)
    end,
  lists:foreach(Drop, [<<"prc">>, <<"app_top">>, <<"fun_top">>, <<"node_role">>]).
204 |
%% SQL (two statements) that creates the partition `Table_Day' of
%% Table for the half-open range [From, To) and an index on its `ts'
%% column. Table is a binary, Day an integer, From/To date strings.
create_partition_query(Table, Day, From, To) ->
  Partition = <<Table/binary, "_", (integer_to_binary(Day))/binary>>,
  iolist_to_binary(
    [ "CREATE TABLE IF NOT EXISTS ", Partition, " "
    , "PARTITION OF ", Table, " "
    , "FOR VALUES "
    , "FROM ('", list_to_binary(From), "') TO ('", list_to_binary(To), "');"
    , "CREATE INDEX IF NOT EXISTS "
    , Partition, "_ts_idx "
    , "ON ", Partition, "(ts);"
    ]).
213 |
%% SQL that drops the partition `Table_Day', if it exists.
delete_partition_query(Table, Day) ->
  DayBin = integer_to_binary(Day),
  iolist_to_binary(["DROP TABLE IF EXISTS ", Table, "_", DayBin, ";"]).
216 |
%% Gregorian day number -> flat "YYYY-MM-DD" string (month and day
%% zero-padded to two digits).
to_postgres_date(GDays) ->
  Date = calendar:gregorian_days_to_date(GDays),
  Chars = io_lib:format("~w-~2..0w-~2..0w", tuple_to_list(Date)),
  lists:flatten(Chars).
220 |
%% Insert statement for each event type. Note that `proc_top' events
%% go to the `prc' table.
query(proc_top)  -> prc_query();
query(app_top)   -> app_top_query();
query(fun_top)   -> fun_top_query();
query(node_role) -> node_role_query().

%% 16 parameters, in the order produced by params(proc_top, _).
prc_query() ->
  <<"insert into prc (node, ts, pid, dreductions, dmemory, reductions, "
    "memory, message_queue_len, current_function, initial_call, "
    "registered_name, stack_size, heap_size, total_heap_size, "
    "current_stacktrace, group_leader) "
    "VALUES ($1, $2, $3, $4, $5, $6, $7, $8, "
    "$9, $10, $11, $12, $13, $14, $15, $16);">>.

%% 5 parameters, in the order produced by params(app_top, _).
app_top_query() ->
  <<"insert into app_top (node, ts, application, unit, value) "
    "VALUES ($1, $2, $3, $4, $5);">>.

%% 5 parameters, in the order produced by params(fun_top, _).
fun_top_query() ->
  <<"insert into fun_top (node, ts, fun, fun_type, num_processes) "
    "VALUES ($1, $2, $3, $4, $5);">>.

%% 3 parameters, in the order produced by params(node_role, _).
node_role_query() ->
  <<"insert into node_role (node, ts, data) VALUES ($1, $2, $3);">>.
244 |
%% Turn one event into the parameter list of the statement returned
%% by query/1 for the same type. The node name is sent as a string and
%% the timestamp is converted with ts_to_timestamp/1; function
%% references and the stack trace are formatted by system_monitor.
params(fun_top, {fun_top, Node, TS, Key, Tag, Val}) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , system_monitor:fmt_mfa(Key)
  , Tag
  , Val
  ];
params(app_top, {app_top, Node, TS, Application, Tag, Val}) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , atom_to_list(Application)
  , atom_to_list(Tag)
  , Val
  ];
params(node_role, {node_role, Node, TS, Bin}) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , Bin
  ];
params(proc_top, #erl_top{ node               = Node
                         , ts                 = TS
                         , pid                = Pid
                         , dreductions        = DeltaReds
                         , dmemory            = DeltaMem
                         , reductions         = Reds
                         , memory             = Mem
                         , message_queue_len  = QueueLen
                         , current_function   = CurrFun
                         , initial_call       = InitCall
                         , registered_name    = RegName
                         , stack_size         = StackSize
                         , heap_size          = HeapSize
                         , total_heap_size    = TotalHeapSize
                         , current_stacktrace = Stacktrace
                         , group_leader       = GroupLeader
                         }) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , Pid
  , DeltaReds
  , DeltaMem
  , Reds
  , Mem
  , QueueLen
  , system_monitor:fmt_mfa(CurrFun)
  , system_monitor:fmt_mfa(InitCall)
  , name_to_list(RegName)
  , StackSize
  , HeapSize
  , TotalHeapSize
  , system_monitor:fmt_stack(Stacktrace)
  , GroupLeader
  ].
289 |
%% Convert a system time in `native' units to a UTC
%% `{{Y, M, D}, {H, Min, S}}' tuple.
ts_to_timestamp(TS) ->
  Unit = native,
  calendar:system_time_to_universal_time(TS, Unit).
292 |
%% A printable Latin-1 string (including the empty list) is returned
%% unchanged; any other term is printed with ~p into a flat string.
name_to_list(Term) ->
  IsPrintable = io_lib:printable_latin1_list(Term),
  if
    IsPrintable -> Term;
    true        -> lists:flatten(io_lib:format("~p", [Term]))
  end.
300 |
--------------------------------------------------------------------------------
/src/system_monitor_sup.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2020 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | -module(system_monitor_sup).
17 |
18 | %% TODO: Dialyzer doesn't like this one:
19 | %-behaviour(supervisor3).
20 |
21 | %% External exports
22 | -export([start_link/0]).
23 |
24 | %% supervisor callbacks
25 | -export([init/1, post_init/1]).
26 |
27 | %%--------------------------------------------------------------------
28 | %% Macros
29 | %%--------------------------------------------------------------------
30 | -define(SERVER, ?MODULE).
31 | -define(SUP2, system_monitor2_sup).
32 |
33 | %%%----------------------------------------------------------------------
34 | %%% API
35 | %%%----------------------------------------------------------------------
%% Start the top-level supervisor, registered locally as ?SERVER.
%% ?SERVER is also the init/1 argument that selects the top-level
%% child specification.
start_link() ->
  SupName = {local, ?SERVER},
  supervisor3:start_link(SupName, ?MODULE, ?SERVER).
38 |
39 | %%%----------------------------------------------------------------------
40 | %%% Callback functions from supervisor
41 | %%%----------------------------------------------------------------------
42 |
%% Child specification helpers. The child id, the callback module and
%% the module started via `start_link/0' are all `Name'; the restart
%% type is `{permanent, 15}'.

%% Same as server/3 with a shutdown value of 2000.
server(Name, Type) ->
  server(Name, Type, 2000).

server(Name, Type, Shutdown) ->
  StartMFA = {Name, start_link, []},
  Restart = {permanent, 15},
  {Name, StartMFA, Restart, Shutdown, Type, [Name]}.

%% Child specification for a worker process.
worker(Name) ->
  server(Name, worker).
50 |
%% supervisor3 callback; always answers `ignore'.
post_init(_Arg) ->
  ignore.
53 |
%% Two-level supervision tree, both levels implemented by this module
%% and told apart by the init/1 argument.
init(?SERVER) ->
  %% The top level supervisor *does not allow restarts*; if a component
  %% directly under this supervisor crashes, the entire node will shut
  %% down and restart. Thus, only those components that must never be
  %% unavailable should be directly under this supervisor.
  SecondSupStart = {supervisor3, start_link, [{local, ?SUP2}, ?MODULE, ?SUP2]},
  SecondSup = {?SUP2, SecondSupStart, permanent, 2000, supervisor, [?MODULE]},
  NoRestarts = {one_for_one, 0, 1}, % no restarts allowed!
  {ok, {NoRestarts, [SecondSup]}};
init(?SUP2) ->
  %% The second-level supervisor allows some restarts. This is where the
  %% normal services live.
  Services = [ worker(Mod)
               || Mod <- [ system_monitor_top
                         , system_monitor_events
                         , system_monitor
                         ]
             ],
  {ok, {{one_for_one, 10, 20}, Services ++ producer_callback()}}.
77 |
%% Child specification (as a zero- or one-element list) for the
%% configured callback module, which is started as a worker.
producer_callback() ->
  producer_callback(system_monitor_callback:get_callback_mod()).

producer_callback(undefined) -> [];
producer_callback(Mod)       -> [worker(Mod)].
83 |
--------------------------------------------------------------------------------
/src/system_monitor_top.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2020 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | %%% @doc
17 | %%% Collect Erlang process statistics and push it to the configured destination
18 | %%% @end
19 | -module(system_monitor_top).
20 |
21 | -behaviour(gen_server).
22 |
23 | -include_lib("system_monitor/include/system_monitor.hrl").
24 |
25 | -ifdef(TEST).
26 | -include_lib("proper/include/proper.hrl").
27 | -include_lib("eunit/include/eunit.hrl").
28 | -endif. % TEST
29 |
30 | %% API
31 | -export([start_link/0, get_app_top/0, get_abs_app_top/0,
32 | get_app_memory/0, get_app_processes/0,
33 | get_function_top/0, get_proc_top/0, get_proc_top/1]).
34 |
35 | %% gen_server callbacks
36 | -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
37 | terminate/2, code_change/3]).
38 |
39 | -export_type([function_top/0]).
40 |
41 | -define(SERVER, ?MODULE).
42 |
43 | -define(TOP_APP_TAB, sysmon_top_app_tab).
44 | -define(TAB_OPTS, [private, named_table, set, {keypos, 1}]).
45 |
46 | %% Type and record definitions
47 |
48 | -record(state,
49 | { max_procs :: integer()
50 | , sample_size :: non_neg_integer()
51 | , interval :: integer()
52 | , num_items :: integer()
53 | , timer :: timer:tref()
54 | , old_data :: [#pid_info{}]
55 | , last_ts :: integer()
56 | , proc_top = [] :: [#erl_top{}]
57 | , app_top = [] :: [#app_top{}]
58 | , function_top =
59 | #{ initial_call => []
60 | , current_function => []
61 | } :: function_top()
62 | }).
63 |
64 | -type top() :: {integer(), gb_trees:tree(integer(), [#pid_info{}])}.
65 |
66 | -define(PROCESS_INFO_FIELDS_NEW,
67 | [ initial_call, dictionary, registered_name, group_leader, reductions, memory,
68 | message_queue_len, current_function]).
69 |
70 | -define(PROCESS_INFO_FIELDS_UPDATE,
71 | [ reductions, memory, message_queue_len, current_function]).
72 |
73 | -define(ADDITIONAL_FIELDS,
74 | [ stack_size
75 | , heap_size, total_heap_size, current_stacktrace
76 | ]).
77 |
78 | %%%===================================================================
79 | %%% API
80 | %%%===================================================================
81 |
%%--------------------------------------------------------------------
%% @doc
%% Get the latest Erlang process top together with the timestamp of
%% the snapshot it was taken from. Blocks until the server replies.
%% @end
%%--------------------------------------------------------------------
-spec get_proc_top() -> {integer(), [#erl_top{}]}.
get_proc_top() ->
  {ok, Snapshot} = gen_server:call(?SERVER, get_proc_top, infinity),
  Snapshot.
91 |
92 |
%%--------------------------------------------------------------------
%% @doc
%% Get the process top entry of a single process, or `false' when the
%% process is not part of the latest top. Blocks until the server
%% replies.
%% @end
%%--------------------------------------------------------------------
-spec get_proc_top(pid()) -> #erl_top{} | false.
get_proc_top(Pid) ->
  Request = {get_proc_top, Pid},
  gen_server:call(?SERVER, Request, infinity).
101 |
%%--------------------------------------------------------------------
%% @doc
%% Get relative reduction utilization per application, sorted by
%% reductions (the `red_rel' field of `#app_top{}')
%% @end
%%--------------------------------------------------------------------
-spec get_app_top() -> [{atom(), float()}].
get_app_top() ->
  Field = #app_top.red_rel,
  do_get_app_top(Field).

%%--------------------------------------------------------------------
%% @doc
%% Get absolute reduction utilization per application, sorted by
%% reductions (the `red_abs' field of `#app_top{}')
%% @end
%%--------------------------------------------------------------------
-spec get_abs_app_top() -> [{atom(), integer()}].
get_abs_app_top() ->
  Field = #app_top.red_abs,
  do_get_app_top(Field).

%%--------------------------------------------------------------------
%% @doc
%% Get memory utilization per application, sorted by memory
%% (the `memory' field of `#app_top{}')
%% @end
%%--------------------------------------------------------------------
-spec get_app_memory() -> [{atom(), integer()}].
get_app_memory() ->
  Field = #app_top.memory,
  do_get_app_top(Field).

%%--------------------------------------------------------------------
%% @doc
%% Get number of processes spawned by each application
%% (the `processes' field of `#app_top{}')
%% @end
%%--------------------------------------------------------------------
-spec get_app_processes() -> [{atom(), integer()}].
get_app_processes() ->
  Field = #app_top.processes,
  do_get_app_top(Field).
139 |
%%--------------------------------------------------------------------
%% @doc
%% Get approximate distribution of initial_call and current_function
%% per process. Blocks until the server replies.
%% @end
%%--------------------------------------------------------------------
-spec get_function_top() -> function_top().
get_function_top() ->
  {ok, FunctionTop} = gen_server:call(?SERVER, get_function_top, infinity),
  FunctionTop.
150 |
%%--------------------------------------------------------------------
%% @doc
%% Start the server, linked to the caller and registered locally as
%% ?SERVER
%% @end
%%--------------------------------------------------------------------
-spec start_link() ->
        {ok, pid()} | ignore | {error, term()}.
start_link() ->
  ServerName = {local, ?SERVER},
  gen_server:start_link(ServerName, ?MODULE, [], []).
160 |
161 | %%%===================================================================
162 | %%% gen_server callbacks
163 | %%%===================================================================
164 |
%% Read the mandatory settings (a missing one crashes with
%% `badmatch'), trigger the first data collection immediately and
%% start with empty process data.
init([]) ->
  Setting = fun(Key) ->
                {ok, Value} = application:get_env(?APP, Key),
                Value
            end,
  MaxProcs   = Setting(top_max_procs),
  SampleSize = Setting(top_sample_size),
  Interval   = Setting(top_sample_interval),
  NumItems   = Setting(top_num_items),
  {ok, TRef} = timer:send_after(0, collect_data),
  State = #state{ max_procs   = MaxProcs
                , sample_size = SampleSize
                , interval    = Interval
                , num_items   = NumItems
                , timer       = TRef
                , last_ts     = os:system_time()
                , old_data    = []
                },
  {ok, State}.
179 |
%% Synchronous queries against the latest snapshot held in the state.
%% The single-process lookup matches on the pid in its
%% `pid_to_list/1' form. Unknown requests get `{error, bad_call}'.
handle_call(get_proc_top, _From, State) ->
  Snapshot = {State#state.last_ts, State#state.proc_top},
  {reply, {ok, Snapshot}, State};
handle_call({get_proc_top, Pid}, _From, State) ->
  Entry = lists:keyfind(pid_to_list(Pid), #erl_top.pid, State#state.proc_top),
  {reply, Entry, State};
handle_call(get_app_top, _From, State) ->
  {reply, {ok, State#state.app_top}, State};
handle_call(get_function_top, _From, State) ->
  {reply, {ok, State#state.function_top}, State};
handle_call(_Unknown, _From, State) ->
  {reply, {error, bad_call}, State}.
196 |
%% No asynchronous API: casts are ignored.
handle_cast(_Cast, State) ->
  {noreply, State}.
199 |
%% Periodic data collection, driven by the `collect_data' message.
%%
%% When should_calculate_info/2 allows it for the current number of
%% processes, the per-process deltas since the previous sample are
%% computed and turned into the process and application tops;
%% otherwise a single placeholder entry is stored and the old data is
%% discarded. The function top is then aggregated from the new data
%% and the next collection is scheduled.
%%
%% Any other message is ignored.
handle_info(collect_data, State) ->
  T1 = os:system_time(),
  NumProcesses = erlang:system_info(process_count),
  {ProcTop, AppTop, NewData} =
    case should_calculate_info(NumProcesses, State#state.max_procs) of
      true ->
        T0 = State#state.last_ts,
        Dt = erlang:convert_time_unit(T1 - T0, native, microsecond),
        Pids = lists:sort(processes()),
        Deltas = calc_deltas(State#state.old_data, Pids, Dt),
        {do_proc_top(Deltas, State, T1), do_app_top(Deltas), Deltas};
      false ->
        {[fake_erl_top_msg(T1)], [], []}
    end,
  FunctionTop = process_aggregate(NewData, State#state.sample_size),
  %% Calculate timer interval. Sleep at least half a second between
  %% samples when sysmon is running very slow.
  %%
  %% The elapsed time must be in milliseconds here: it is subtracted
  %% from `interval', and the result (as well as the 500 floor) is
  %% passed to timer:send_after/2, which takes milliseconds. Using
  %% microseconds would make the difference negative for almost any
  %% sample and pin the sleep time to the 500 ms floor.
  T2 = os:system_time(),
  ElapsedMs = erlang:convert_time_unit(T2 - T1, native, millisecond),
  SleepTime = max(500, State#state.interval - ElapsedMs),
  {ok, TRef} = timer:send_after(SleepTime, collect_data),
  {noreply, State#state{ last_ts      = T1
                       , old_data     = NewData
                       , proc_top     = ProcTop
                       , app_top      = AppTop
                       , function_top = FunctionTop
                       , timer        = TRef
                       }};
handle_info(_Info, State) ->
  {noreply, State}.
233 |
%% Nothing to release on shutdown.
terminate(_Why, _ServerState) ->
  ok.

%% No state migration is needed on code upgrade.
code_change(_FromVsn, ServerState, _Extra) ->
  {ok, ServerState}.
239 |
240 | %%%===================================================================
241 | %%% Internal functions
242 | %%%===================================================================
243 |
%%--------------------------------------------------------------------
%% @doc
%% Calculate resource consumption per application.
%%
%% Reduction deltas, memory and process counts are summed per group
%% leader in a scratch ETS table; each group leader is then mapped to
%% its application. Everything whose group leader belongs to no
%% application is accumulated into a single `unknown' entry, which is
%% always the first element of the result.
%% @end
%%--------------------------------------------------------------------
-spec do_app_top([#pid_info{}]) -> [#app_top{}].
do_app_top(Deltas) ->
  %% Prepare the temporary table: create it on first use, empty it
  %% otherwise.
  case ets:info(?TOP_APP_TAB) of
    undefined -> ets:new(?TOP_APP_TAB, ?TAB_OPTS);
    _Exists   -> ets:delete_all_objects(?TOP_APP_TAB)
  end,
  %% Traverse process infos; table rows are
  %% {GroupLeader, Reductions, Memory, Processes}:
  Accumulate =
    fun(#pid_info{group_leader = Leader, dreductions = DReds, memory = Memory}) ->
        ets:update_counter( ?TOP_APP_TAB
                          , Leader
                          , [{2, round(DReds)}, {3, Memory}, {4, 1}]
                          , {Leader, 0, 0, 0}
                          )
    end,
  lists:foreach(Accumulate, Deltas),
  %% Calculate final values:
  SumReds = fun({_Leader, Reds, _Mem, _Procs}, Sum) -> Sum + Reds end,
  TotalReds = ets:foldl(SumReds, 0, ?TOP_APP_TAB),
  Classify =
    fun({Leader, Reds, Mem, Procs}, {Known, OrphanReds, OrphanMem, OrphanProcs}) ->
        case application_controller:get_application(Leader) of
          undefined ->
            {Known, OrphanReds + Reds, OrphanMem + Mem, OrphanProcs + Procs};
          {ok, App} ->
            Entry = #app_top{ app       = App
                            , red_rel   = divide(Reds, TotalReds)
                            , red_abs   = Reds
                            , memory    = Mem
                            , processes = Procs
                            },
            {[Entry | Known], OrphanReds, OrphanMem, OrphanProcs}
        end
    end,
  {KnownApps, UnknownReds, UnknownMem, UnknownProcs} =
    ets:foldl(Classify, {[], 0, 0, 0}, ?TOP_APP_TAB),
  Unknown = #app_top{ app       = unknown
                    , red_rel   = divide(UnknownReds, TotalReds)
                    , red_abs   = UnknownReds
                    , memory    = UnknownMem
                    , processes = UnknownProcs
                    },
  [Unknown | KnownApps].
310 |
%% Divide A by B, returning 0 instead of failing when the divisor is
%% zero. The guard uses `==' so that a float zero (0.0) is treated the
%% same way as the integer 0; a literal `0' pattern would not match it
%% and the division would raise `badarith'.
divide(_A, B) when B == 0 ->
  0;
divide(A, B) ->
  A / B.
315 |
%%------------------------------------------------------------------------------
%% @doc Summarise which initial calls and current functions are most
%% common among a random sample of processes.
%%
%% Returns a map with the keys `initial_call' and `current_function'.
%% Each value is a list of `{MFA, Share}' pairs sorted by descending
%% share, where the share is the fraction of sampled processes having
%% that MFA.
%%------------------------------------------------------------------------------
-spec process_aggregate([#pid_info{}], non_neg_integer()) -> function_top().
process_aggregate(ProcInfos0, SampleSize) ->
  Sample = random_sample(ProcInfos0, SampleSize),
  Total = length(Sample),
  %% Two throw-away counter tables, deleted before returning:
  InitCallTab = ets:new(sysmon_init_call, []),
  CurrFunTab = ets:new(sysmon_curr_fun, []),
  Bump = fun(Tab, Key) ->
             ets:update_counter(Tab, Key, {2, 1}, {Key, 0})
         end,
  %% process_info/2 may return 'undefined' as the current function in
  %% some cases (e.g. native compiled (HiPE) modules). All of these
  %% are counted under {undefined, undefined, 0}.
  Normalize = fun(undefined) -> {undefined, undefined, 0};
                 (MFA)       -> MFA
              end,
  lists:foreach(
    fun(#pid_info{initial_call = InitCall, current_function = CurrFun}) ->
        Bump(InitCallTab, InitCall),
        Bump(CurrFunTab, Normalize(CurrFun))
    end,
    Sample),
  %% Turn raw counts into shares, biggest first. When the sample is
  %% empty both tables are empty too, so no division takes place.
  Shares = fun(Tab) ->
               Descending = lists:reverse(lists:keysort(2, ets:tab2list(Tab))),
               [{Key, Count / Total} || {Key, Count} <- Descending]
           end,
  InitialCalls = Shares(InitCallTab),
  CurrentFunctions = Shares(CurrFunTab),
  ets:delete(InitCallTab),
  ets:delete(CurrFunTab),
  #{ initial_call     => InitialCalls
   , current_function => CurrentFunctions
   }.
350 |
%%--------------------------------------------------------------------
%% @doc
%% Find the processes that take the most resources.
%%
%% A separate "top" is maintained for each of the reduction delta,
%% memory, memory delta and message queue length fields, each holding
%% up to `State#state.num_items' entries. The union of these tops is
%% returned, enriched with additional process information.
%% @end
%%--------------------------------------------------------------------
-spec do_proc_top([#pid_info{}], #state{}, integer()) -> [#erl_top{}].
do_proc_top(Deltas, State, Now) ->
  Limit = State#state.num_items,
  {Seed, Remaining} =
    case length(Deltas) > Limit of
      true  -> lists:split(Limit, Deltas);
      false -> {Deltas, []}
    end,
  Fields = [ #pid_info.dreductions
           , #pid_info.memory
           , #pid_info.dmemory
           , #pid_info.message_queue_len
           ],
  %% The first `Limit' samples form the initial top of every field:
  InitialTops = [{Field, sort_top(Field, Seed)} || Field <- Fields],
  %% Every remaining sample is offered to each of the tops:
  Offer = fun(Delta, Tops) ->
              [ {Field, maybe_push_to_top(Field, Delta, Top)}
                || {Field, Top} <- Tops ]
          end,
  FinalTops = lists:foldl(Offer, InitialTops, Remaining),
  %% The same process may show up in several tops; keep it only once:
  Candidates =
    lists:usort(lists:flatten([top_to_list(Top) || {_Field, Top} <- FinalTops])),
  %% Request additional data for the selected processes:
  lists:map(fun(PidInfo) -> finalize_proc_info(PidInfo, Now) end, Candidates).
392 |
%% Build the #erl_top{} record for one sampled process.
%%
%% The sample is complemented with the ?ADDITIONAL_FIELDS queried from
%% the live process. If the process is gone by now (process_info/2
%% returns `undefined'), the sampled counters are kept, the initial call
%% and current function are reported as {unknown, unknown, 0}, the sizes
%% as 0, and registered_name is left at the record default.
-spec finalize_proc_info(#pid_info{}, integer()) -> #erl_top{}.
finalize_proc_info(ProcInfo = #pid_info{pid = Pid, group_leader = GL}, Now) ->
  Extra = process_info(Pid, ?ADDITIONAL_FIELDS),
  #pid_info{ initial_call      = InitialCall
           , registered_name   = Name
           , reductions        = Reds
           , dreductions       = DReds
           , memory            = Mem
           , dmemory           = DMem
           , message_queue_len = MQL
           } = ProcInfo,
  case Extra of
    undefined ->
      #erl_top{ node               = system_monitor:node_name()
              , ts                 = Now
              , pid                = pid_to_list(Pid)
              , group_leader       = pid_to_list(GL)
              , dreductions        = DReds
              , dmemory            = DMem
              , reductions         = Reds
              , memory             = Mem
              , message_queue_len  = MQL
              , initial_call       = {unknown, unknown, 0}
              , current_function   = {unknown, unknown, 0}
              , stack_size         = 0
              , heap_size          = 0
              , total_heap_size    = 0
              , current_stacktrace = []
              };
    [ {stack_size, Stack}
    , {heap_size, Heap}
    , {total_heap_size, Total}
    , {current_stacktrace, Stacktrace}
    ] ->
      %% The current function is the topmost stack frame, if any:
      CurrentFunction =
        case Stacktrace of
          [{Module, Function, Arity, _Location} | _] ->
            {Module, Function, Arity};
          [] ->
            {unknown, unknown, 0}
        end,
      #erl_top{ node               = system_monitor:node_name()
              , ts                 = Now
              , pid                = pid_to_list(Pid)
              , group_leader       = pid_to_list(GL)
              , dreductions        = DReds
              , dmemory            = DMem
              , reductions         = Reds
              , memory             = Mem
              , message_queue_len  = MQL
              , initial_call       = InitialCall
              , registered_name    = Name
              , stack_size         = Stack
              , heap_size          = Heap
              , total_heap_size    = Total
              , current_stacktrace = Stacktrace
              , current_function   = CurrentFunction
              }
  end.
443 |
%% Offer Val to a top. A top is a pair {Minimal, Tree}, where Tree is a
%% gb_tree mapping a key (the value of field FieldID) to the list of all
%% entries having that key, and Minimal is the smallest key in the tree.
%%
%% If Val's key is strictly greater than the current minimum, one entry
%% with the smallest key is evicted and Val is inserted, so the number
%% of entries stays constant. Otherwise the top is returned unchanged.
-spec maybe_push_to_top(integer(), #pid_info{}, top()) -> top().
maybe_push_to_top(FieldID, Val, {OldMin, OldTop}) ->
  Key = element(FieldID, Val),
  if OldMin < Key ->
      {SKey, SVal, Top1} = gb_trees:take_smallest(OldTop),
      Top2 =
        case SVal of
          [_] ->
            %% The evicted entry was the only one with this key:
            Top1;
          [_|SVal2] ->
            %% Put the remaining entries back as they are. SVal2 is
            %% already a list, so it must not go through gb_insert/3,
            %% which would wrap it in another list; the nested group
            %% would then be evicted as a whole by a later push.
            gb_trees:insert(SKey, SVal2, Top1)
        end,
      NewTop = gb_insert(Key, Val, Top2),
      {Minimal, _} = gb_trees:smallest(NewTop),
      {Minimal, NewTop};
     true ->
      {OldMin, OldTop}
  end.

%% Build the initial top for field Field from the list L.
%% L must not be empty: an empty tree has no smallest key.
-spec sort_top(integer(), [#pid_info{}]) -> top().
sort_top(Field, L) ->
  Top =
    lists:foldl(
      fun(Val, Acc) ->
          gb_insert(element(Field, Val), Val, Acc)
      end,
      gb_trees:empty(),
      L),
  {Minimal, _} = gb_trees:smallest(Top),
  {Minimal, Top}.

%% Add a single entry Val under Key, keeping the entries that are
%% already stored under the same key.
gb_insert(Key, Val, Tree) ->
  case gb_trees:lookup(Key, Tree) of
    none ->
      gb_trees:enter(Key, [Val], Tree);
    {value, Vals} ->
      gb_trees:update(Key, [Val|Vals], Tree)
  end.
481 |
%% Decide whether per-process information should be collected.
%% Collection is skipped only when a positive limit MaxProcs is set and
%% the number of processes exceeds it; a limit of zero or less means
%% "no limit".
-spec should_calculate_info(non_neg_integer(), integer()) -> boolean().
should_calculate_info(NumPids, MaxProcs) ->
  MaxProcs >= NumPids orelse MaxProcs =< 0.
485 |
%% Refresh the volatile fields of an existing sample from the live
%% process. Returns `undefined' if the process no longer exists.
%%
%% Calling process_info(Pid, memory) can block both system_monitor and
%% the monitored Pid for a long time, which can degrade system
%% performance. When should_not_update_memory/1 says it is dangerous,
%% memory is left out of the query. The memory metric must be present
%% if the sample is present, so it is set to zero in that case.
pid_info_update(PI) ->
  #pid_info{pid = Pid} = PI,
  SkipMemory = should_not_update_memory(PI),
  Fields =
    case SkipMemory of
      true  -> ?PROCESS_INFO_FIELDS_UPDATE -- [memory];
      false -> ?PROCESS_INFO_FIELDS_UPDATE
    end,
  case {SkipMemory, erlang:process_info(Pid, Fields)} of
    {_, undefined} ->
      undefined;
    {true, [ {reductions, Reds}
           , {message_queue_len, MQL}
           , {current_function, CurFun}
           ]} ->
      pid_info_update(PI, Reds, _Mem = 0, MQL, CurFun);
    {false, [ {reductions, Reds}
            , {memory, Mem}
            , {message_queue_len, MQL}
            , {current_function, CurFun}
            ]} ->
      pid_info_update(PI, Reds, Mem, MQL, CurFun)
  end.
515 |
%% Return PI with its reductions, memory, message queue length and
%% current function replaced by the given values; all other fields are
%% kept.
pid_info_update(PI, Reds, Mem, MQL, CurFun) ->
  PI#pid_info{ current_function  = CurFun
             , message_queue_len = MQL
             , memory            = Mem
             , reductions        = Reds
             }.
523 |
%% True when the previously sampled message queue length exceeds the
%% `mql_limit_for_memory' application environment value. When the value
%% is not configured the result is always false.
should_not_update_memory(#pid_info{message_queue_len = MQL}) ->
  case application:get_env(?APP, mql_limit_for_memory) of
    undefined ->
      false;
    {ok, Limit} ->
      %% A non-numeric limit (an atom) never triggers: this relies on
      %% number < atom in Erlang's term order.
      MQL > Limit
  end.
527 |
%% Take the first sample of a process that has no previous record.
%% Returns `undefined' if the process does not exist any more.
-spec pid_info_new(pid()) -> #pid_info{} | undefined.
pid_info_new(Pid) ->
  Info = erlang:process_info(Pid, ?PROCESS_INFO_FIELDS_NEW),
  case Info of
    undefined ->
      %% The process has died while we were collecting other data...
      undefined;
    [ {initial_call, _} = InitialCallProp
    , {dictionary, _} = DictProp
    , {registered_name, RegisteredName}
    , {group_leader, GroupLeader}
    , {reductions, Reductions}
    , {memory, Memory}
    , {message_queue_len, QueueLen}
    , {current_function, CurrentFunction}
    ] ->
      %% The dictionary is only needed to resolve the initial call:
      InitialCall = initial_call([InitialCallProp, DictProp]),
      #pid_info{ pid               = Pid
               , initial_call      = InitialCall
               , registered_name   = RegisteredName
               , group_leader      = GroupLeader
               , reductions        = Reductions
               , memory            = Memory
               , message_queue_len = QueueLen
               , current_function  = CurrentFunction
               }
  end.
554 |
%% Compute fresh samples with deltas for Pids, given the previous
%% samples OldData and the elapsed time Dt. Processes that vanished
%% while being sampled (represented by `undefined') are dropped from
%% the result.
calc_deltas(OldData, Pids, Dt) ->
  IsSample = fun(#pid_info{}) -> true;
                (undefined)   -> false
             end,
  [Sample || Sample <- calc_deltas(OldData, Pids, [], Dt), IsSample(Sample)].
562 |
%% Walk the old samples and the current pid list in lockstep, collecting
%% a new sample (with deltas) for every pid that is processed. The
%% result is sorted and may contain `undefined' for processes that died
%% before they could be sampled.
%%
%% NOTE(review): the walk presumably expects both lists to be sorted by
%% pid in the same order; neither the caller nor the record layout is
%% visible here. If both are ascending, `P1 > P2' would mean that P2 is
%% new rather than that P1 has terminated -- confirm against the caller.
-spec calc_deltas(PIL, [pid()], PIL, number()) -> PIL
          when PIL :: [#pid_info{}].
calc_deltas([], New, Acc, Dt) ->
  %% No old samples left: the rest of the processes are new
  Fresh = [delta(undefined, pid_info_new(Pid), Dt) || Pid <- New],
  lists:sort(Fresh ++ Acc);
calc_deltas(_Old, [], Acc, _Dt) ->
  %% No pids left: the rest of the old processes have terminated
  lists:sort(Acc);
calc_deltas([#pid_info{pid = P1} | OldT], [P2 | _] = Pids, Acc, Dt)
  when P1 > P2 ->
  %% Treated as "P1 has terminated": drop the old record
  calc_deltas(OldT, Pids, Acc, Dt);
calc_deltas([#pid_info{pid = P1} | _] = Old, [P2 | PidsT], Acc, Dt)
  when P1 < P2 ->
  %% Treated as "P2 is a new process": sample it from scratch
  Delta = delta(undefined, pid_info_new(P2), Dt),
  calc_deltas(Old, PidsT, [Delta | Acc], Dt);
calc_deltas([#pid_info{pid = P} = PI1 | OldT], [P | PidsT], Acc, Dt) ->
  %% We already have a record of this process
  NewAcc =
    case pid_info_update(PI1) of
      undefined -> % the process just terminated
        Acc;
      PI2 ->
        [delta(PI1, PI2, Dt) | Acc]
    end,
  calc_deltas(OldT, PidsT, NewAcc, Dt).
588 |
%% Flatten a top into the plain list of its entries, in ascending key
%% order (the order in which gb_trees:values/1 returns the groups).
-spec top_to_list(top()) -> [#pid_info{}].
top_to_list({_Minimal, Tree}) ->
  Groups = gb_trees:values(Tree),
  lists:append(Groups).
592 |
%% Fill in the rate fields (dreductions, dmemory) of the new sample P2.
%% With a previous sample P1 the rate is the difference divided by Dt;
%% without one (a new process) the absolute value is divided by Dt.
%% If the new sample itself is missing, `undefined' is returned.
-spec delta(undefined, undefined, number()) -> undefined;
           (#pid_info{} | undefined, #pid_info{}, number()) -> #pid_info{}.
delta(undefined, undefined, _Dt) ->
  undefined;
delta(undefined, P2, Dt) ->
  DRed = divide(P2#pid_info.reductions, Dt),
  DMem = divide(P2#pid_info.memory, Dt),
  P2#pid_info{dreductions = DRed, dmemory = DMem};
delta(P1, P2, Dt) ->
  DRed = divide(P2#pid_info.reductions - P1#pid_info.reductions, Dt),
  DMem = divide(P2#pid_info.memory - P1#pid_info.memory, Dt),
  P2#pid_info{dreductions = DRed, dmemory = DMem}.
609 |
%% Fetch the latest application top from the server and return
%% `{App, Value}' pairs sorted by descending Value, where Value is the
%% element at position FieldId of each #app_top{} record.
-spec do_get_app_top(integer()) -> [{atom(), number()}].
do_get_app_top(FieldId) ->
  {ok, AppTops} = gen_server:call(?SERVER, get_app_top, infinity),
  Pairs = lists:map(
            fun(AppTop) ->
                {AppTop#app_top.app, element(FieldId, AppTop)}
            end,
            AppTops),
  lists:reverse(lists:keysort(2, Pairs)).
616 |
%% Build a placeholder #erl_top{} entry that stands in for real process
%% data. It is recognisable by its made-up pid, the registered name
%% `error_too_many_processes' and counters set to -1.
-spec fake_erl_top_msg(integer()) -> #erl_top{}.
fake_erl_top_msg(Now) ->
  FakePid = "<42.42.42>",
  FakeMFA = {too_many, processes, 0},
  #erl_top{ node               = system_monitor:node_name()
          , ts                 = Now
          , pid                = FakePid
          , group_leader       = FakePid
          , registered_name    = error_too_many_processes
          , initial_call       = FakeMFA
          , current_function   = FakeMFA
          , current_stacktrace = []
          , dreductions        = 0
          , dmemory            = 0
          , reductions         = -1
          , memory             = -1
          , message_queue_len  = -1
          , stack_size         = -1
          , heap_size          = -1
          , total_heap_size    = -1
          }.
636 |
%% Pick a random subset of L. Every element is kept independently with
%% probability SampleSize/length(L), so the actual sample size may
%% differ slightly from SampleSize. The order of elements is preserved.
-spec random_sample(list(A), non_neg_integer()) -> list(A).
random_sample([], _SampleSize) ->
  [];
random_sample(L, SampleSize) ->
  KeepProbability = SampleSize / length(L),
  [Elem || Elem <- L, rand:uniform() < KeepProbability].
644 |
%% Extract the initial call from a process_info proplist. Processes
%% started via proc_lib all report {proc_lib, init_p, 5}; for those the
%% real entry point is resolved with proc_lib:translate_initial_call/1,
%% which is handed the whole proplist.
-spec initial_call(proplists:proplist()) -> mfa().
initial_call(Info) ->
  RawCall = proplists:get_value(initial_call, Info),
  if RawCall =:= {proc_lib, init_p, 5} ->
      proc_lib:translate_initial_call(Info);
     true ->
      RawCall
  end.
653 |
654 | %%%===================================================================
655 | %%% Tests
656 | %%%===================================================================
657 |
658 | -ifdef(TEST).
659 |
660 | -dialyzer({nowarn_function, [ maybe_push_to_top_test/0
661 | , maybe_push_to_top_same_as_sort_prop/0
662 | , initial_call_test/0
663 | , initial_call_fallback_test/0
664 | ]}).
665 |
%% Test helper: build a top from the list Top (keyed on the first tuple
%% element), offer Val to it and return the resulting entries sorted in
%% ascending order.
maybe_push_to_top_wrapper(Val, Top) ->
  Pushed = maybe_push_to_top(1, Val, sort_top(1, Top)),
  lists:sort(top_to_list(Pushed)).
670 |
%% maybe_push_to_top function is just an optimized version
%% of sorting a list and then taking its largest N elements.
%%
%% Check that it is indeed true.
%%
%% The initial top must not be empty, because sort_top/2 needs a
%% smallest element; hence the non_empty generator.
maybe_push_to_top_same_as_sort_prop() ->
  ?FORALL({Val, Top}, {{number()}, non_empty(list({number()}))},
          begin
            NumElems = length(Top),
            PlainSort = lists:reverse(lists:sort([Val|Top])),
            %% maybe_push_to_top_wrapper/2 returns its result in
            %% ascending order, so the reference (the NumElems largest
            %% elements) is brought into the same order before comparing.
            Reference = lists:sort(lists:sublist(PlainSort, NumElems)),
            Result = maybe_push_to_top_wrapper(Val, Top),
            Result == Reference
          end).
684 |
%% EUnit entry point: run the maybe_push_to_top property 1000 times.
maybe_push_to_top_test() ->
  Property = proper:numtests(1000, maybe_push_to_top_same_as_sort_prop()),
  ?assertEqual(true, proper:quickcheck(Property)).
691 |
%% initial_call/1 reports the raw initial call for a plain process and
%% the translated one for a proc_lib process, provided the process
%% dictionary is part of the process info.
initial_call_test() ->
  Fields = [initial_call, dictionary],
  PlainPid = spawn(fun() -> timer:sleep(1000) end),
  timer:sleep(100), %% Sleep to avoid race condition
  PlainInfo = erlang:process_info(PlainPid, Fields),
  ?assertEqual({erlang, apply, 2}, initial_call(PlainInfo)),
  ProcLibPid = proc_lib:spawn(timer, sleep, [1000]),
  timer:sleep(100), %% Sleep to avoid race condition
  ProcLibInfo = erlang:process_info(ProcLibPid, Fields),
  ?assertEqual({timer, sleep, 1}, initial_call(ProcLibInfo)).
706 |
%% Without the process dictionary in the process info, the initial call
%% of a proc_lib process cannot be translated and {proc_lib, init_p, 5}
%% is reported as is.
initial_call_fallback_test() ->
  Fields = [initial_call],
  PlainPid = spawn(fun() -> timer:sleep(1000) end),
  timer:sleep(100), %% Sleep to avoid race condition
  PlainInfo = erlang:process_info(PlainPid, Fields),
  ?assertEqual({erlang, apply, 2}, initial_call(PlainInfo)),
  ProcLibPid = proc_lib:spawn(timer, sleep, [1000]),
  timer:sleep(100), %% Sleep to avoid race condition
  ProcLibInfo = erlang:process_info(ProcLibPid, Fields),
  ?assertEqual({proc_lib, init_p, 5}, initial_call(ProcLibInfo)).
721 |
722 | -endif.
723 |
724 | %%%_* Emacs ============================================================
725 | %%% Local Variables:
726 | %%% allout-layout: t
727 | %%% erlang-indent-level: 2
728 | %%% End:
729 |
--------------------------------------------------------------------------------
/test/system_monitor_tests.erl:
--------------------------------------------------------------------------------
1 | -module(system_monitor_tests).
2 |
3 | -include_lib("eunit/include/eunit.hrl").
4 |
%% The application and all of its dependencies can be started.
start_test() ->
  Started = application:ensure_all_started(system_monitor),
  ?assertMatch({ok, _}, Started),
  application:stop(system_monitor).
8 |
%% With `callback_mod' set to system_monitor_pg, a process registered
%% as system_monitor_pg exists once the application is started.
callback_is_started_when_configured_test() ->
  application:set_env(system_monitor, callback_mod, system_monitor_pg),
  Started = application:ensure_all_started(system_monitor),
  ?assertMatch({ok, _}, Started),
  CallbackPid = whereis(system_monitor_pg),
  ?assertNotEqual(undefined, CallbackPid),
  application:stop(system_monitor).
14 |
%% With `callback_mod' unset, no process is registered as
%% system_monitor_pg after the application is started.
callback_is_started_test() ->
  application:unset_env(system_monitor, callback_mod),
  Started = application:ensure_all_started(system_monitor),
  ?assertMatch({ok, _}, Started),
  CallbackPid = whereis(system_monitor_pg),
  ?assertEqual(undefined, CallbackPid),
  application:stop(system_monitor).
20 |
--------------------------------------------------------------------------------