├── .github
└── workflows
│ └── ci.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── README.md
├── doc
├── app_top.png
├── proc_history.png
└── proc_top.png
├── include
└── system_monitor.hrl
├── rebar.config
├── rebar.lock
├── src
├── sysmon_int.hrl
├── system_monitor.app.src
├── system_monitor.erl
├── system_monitor_app.erl
├── system_monitor_callback.erl
├── system_monitor_collector.erl
├── system_monitor_dummy.erl
├── system_monitor_events.erl
├── system_monitor_lib.erl
├── system_monitor_pg.erl
├── system_monitor_sup.erl
└── system_monitor_top.erl
└── test
└── sysmon_SUITE.erl
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push, pull_request]
3 | jobs:
4 | test:
5 | runs-on: ubuntu-latest
6 | strategy:
7 | matrix:
8 | erlang:
9 | - otp: "24"
10 | rebar3: "3.20"
11 | - otp: "25"
12 | rebar3: "3.22"
13 | - otp: "26"
14 | rebar3: "3.22"
15 | - otp: "27"
16 | rebar3: "3.24"
17 |
18 | steps:
19 | - uses: actions/checkout@v4
20 |
21 | - name: Install Erlang/OTP
22 | uses: erlef/setup-beam@v1
23 | with:
24 | otp-version: ${{ matrix.erlang.otp }}
25 | rebar3-version: ${{ matrix.erlang.rebar3 }}
26 |
27 | - name: Run tests
28 | run: make
29 |
30 | - name: Archive common test results
31 | if: ${{ always() }}
32 | uses: actions/upload-artifact@v4
33 | with:
34 | name: CT results
35 | path: _build/test/logs/**
36 | retention-days: 1
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | *.beam
3 | ebin/
4 | .idea
5 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 | ## [3.0.2] - 2022-02-08
9 |
10 | Drop support for postgres replay queue. Batch insert operations in
11 | postgres backend.
12 |
13 | ## [3.0.0] - 2022-02-06
14 |
15 | Optimized top collection for systems with millions of processes.
16 | Added "very important processes" feature: some registered processes
17 | are always collected to the top. Added CI and improved test suite.
18 | Major refactoring. Hard fork from the [Klarna version](https://github.com/klarna-incubator/system_monitor).
19 |
20 | Warning: the table schema has changed! See: [example schema](https://github.com/k32/grafana-dashboards/blob/master/postgres/20-schema.sql)
21 |
22 | ## [2.2.0] - 2021-11-05
23 |
24 | Added support for configuring a module to use to send system_monitor events to
25 | an external destination.
26 |
27 | ## [2.1.0] - 2021-10-20
28 |
29 | Data format of system\_monitor\_top is changed to keep static data between
30 | ticks. Since this gen server is started by a supervisor that allows for some
31 | restarts, you can either let the server crash or stop+start this application.
32 |
33 | ## [2.0.0] - 2021-04-07
34 |
35 | Replace Kafka backend with a configurable one that defaults to Postgres
36 |
37 | ## [1.0.0] - 2020-09-02
38 |
39 | Initial version
40 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all
2 | all:
3 | rebar3 do compile, dialyzer, eunit, ct --readable=false, cover
4 |
5 | .PHONY: clean
6 | clean:
7 | rm -rf _build
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # system_monitor
2 | > Erlang telemetry collector
3 |
4 | `system_monitor` is a BEAM VM monitoring and introspection application
5 | that helps troubleshooting live systems. It collects various
6 | information about Erlang and Elixir processes and applications.
7 |
8 | Unlike `observer`, `system_monitor` does not require connecting to
9 | the monitored system via Erlang distribution protocol, and can be used
10 | to monitor systems with very tight access restrictions. It can happily
11 | monitor systems with millions of processes.
12 |
13 | By default the data is stored in a Postgres database, and visualized
14 | using Grafana. Ready to use docker images of
15 | [Postgres](https://github.com/k32/grafana-dashboards/pkgs/container/sysmon-postgres)
16 | with the necessary schema and
17 | [Grafana](https://github.com/k32/grafana-dashboards/pkgs/container/sysmon-grafana)
18 | with the dashboards are provided. See
19 | [documentation](https://github.com/k32/grafana-dashboards).
20 |
21 | ## Features
22 |
23 | ### Process top
24 |
25 | Information about top N Erlang processes consuming the most resources
26 | (such as reductions or memory), or having the longest message queues, is
27 | presented on process top dashboard:
28 |
29 | 
30 |
31 | Historical data can be accessed via standard Grafana time
32 | picker. `status` panel can display important information about the
33 | node state. Pids of the processes on that dashboard are clickable
34 | links that lead to the process history dashboard.
35 |
36 | ### Process history
37 | 
38 |
39 | Process history dashboard displays time series data about a certain
40 | Erlang process. Note that some data points can be missing if the
41 | process didn't consume enough resources to appear in the process top.
42 |
43 | ### Application top
44 | 
45 |
46 | Application top dashboard contains various information aggregated per
47 | OTP application.
48 |
49 | ## Usage example
50 |
51 | In order to integrate `system_monitor` into your system, simply add it
52 | to the release apps. Add the following lines to `rebar.config`:
53 |
54 | ```erlang
55 | {deps,
56 | [ {system_monitor, {git, "https://github.com/k32/system_monitor", {tag, "3.0.2"}}}
57 | ]}.
58 |
59 | {relx,
60 | [ {release, {my_release, "1.0.0"},
61 | [kernel, sasl, ..., system_monitor]}
62 | ]}.
63 | ```
64 |
65 | Or to `mix.exs` for Elixir:
66 |
67 | ```elixir
68 | defp deps() do
69 | [
70 | {:system_monitor, github: "k32/system_monitor", tag: "3.0.2"}
71 | ]
72 | end
73 | ```
74 |
75 | To enable export to Postgres:
76 |
77 | ```erlang
78 | application:load(system_monitor),
79 | application:set_env(system_monitor, callback_mod, system_monitor_pg)
80 | ```
81 |
82 | ### Custom node status
83 |
84 | `system_monitor` can export arbitrary node status information that is
85 | deemed important for the operator. This is done by defining a callback
86 | function that returns an HTML-formatted string (or iolist):
87 |
88 | ```erlang
89 | -module(foo).
90 |
91 | -export([node_status/0]).
92 |
93 | node_status() ->
94 | ["my node type<br/>",
95 | case healthy() of
96 | true -> "UP<br/>";
97 | false -> "DEGRADED<br/>"
98 | end,
99 | io_lib:format("very important value=~p", [very_important_value()])
100 | ].
101 | ```
102 |
103 | This callback then needs to be added to the system_monitor application
104 | environment:
105 |
106 | ```erlang
107 | application:set_env(system_monitor, node_status_fun, {?MODULE, node_status})
108 | ```
109 |
110 | More information about configurable options and the defaults is found
111 | [here](src/system_monitor.app.src).
112 |
113 | ### What are the preconfigured monitors
114 |
115 | * `check_process_count`
116 | Logs if the process_count passes a certain threshold
117 | * `suspect_procs`
118 | Logs if it detects processes with suspiciously high memory
119 |
120 | `system_monitor_pg` allows for Postgres being temporarily down by storing the stats in its own internal buffer.
121 | This buffer is built with a sliding window that will stop the state from growing too big whenever
122 | Postgres is down for too long. On top of this `system_monitor_pg` has a built-in load
123 | shedding mechanism that protects itself once the message queue length grows bigger than a certain level.
124 |
125 | ## Release History
126 |
127 | See our [changelog](CHANGELOG.md).
128 |
129 | ## License
130 |
131 | Copyright © 2020 Klarna Bank AB
132 | Copyright © 2021-2022 k32
133 |
--------------------------------------------------------------------------------
/doc/app_top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ieQu1/system_monitor/a0c537677fadb4dc8d8eedffb9394b9908dd3ae2/doc/app_top.png
--------------------------------------------------------------------------------
/doc/proc_history.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ieQu1/system_monitor/a0c537677fadb4dc8d8eedffb9394b9908dd3ae2/doc/proc_history.png
--------------------------------------------------------------------------------
/doc/proc_top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ieQu1/system_monitor/a0c537677fadb4dc8d8eedffb9394b9908dd3ae2/doc/proc_top.png
--------------------------------------------------------------------------------
/include/system_monitor.hrl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2022 k32
3 | %% Copyright 2020 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 | -ifndef(SYSTEM_MONITOR_HRL).
18 | -define(SYSTEM_MONITOR_HRL, true).
19 |
20 | -record(erl_top,
21 | { ts :: system_monitor_lib:ts()
22 | , pid :: string()
23 | , dreductions :: integer()
24 | , dmemory :: integer()
25 | , reductions :: integer()
26 | , memory :: integer() %% bytes
27 | , message_queue_len :: integer()
28 | , current_function :: mfa()
29 | , initial_call :: mfa()
30 | , registered_name :: atom() | []
31 | , stack_size :: integer()
32 | , heap_size :: integer() %% words
33 | , total_heap_size :: integer() %% words
34 | , current_stacktrace :: list()
35 | , group_leader :: string()
36 | }).
37 |
38 | -record(app_top,
39 | { app :: atom()
40 | , ts :: system_monitor_lib:ts()
41 | , red_abs :: integer()
42 | , red_rel :: float()
43 | , memory :: integer()
44 | , processes :: integer()
45 | }).
46 |
47 | -endif.
48 |
--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
1 | %% -*- mode:erlang -*-
2 | {erl_opts,
3 | [debug_info, warnings_as_errors]}.
4 |
5 | {deps,
6 | [ {supervisor3, "1.1.12"}
7 | , {epgsql, "4.7.1"}
8 | , {snabbkaffe, {git, "https://github.com/kafka4beam/snabbkaffe", {tag, "1.0.10"}}}
9 | ]}.
10 |
11 | {dialyzer, [{warnings, [unknown]}]}.
12 |
13 | {profiles,
14 | [ {test, [ {deps, [ {proper, "1.4.0"}
15 | ]}
16 | , {cover_enabled, true}
17 | ]}
18 | , {dev,
19 | [{plugins, [rebar3_hex]}]}
20 | ]}.
21 |
22 | {cover_enabled, true}.
23 | {cover_opts, [verbose]}.
24 | {cover_export_enabled, true}.
25 |
--------------------------------------------------------------------------------
/rebar.lock:
--------------------------------------------------------------------------------
1 | {"1.2.0",
2 | [{<<"epgsql">>,{pkg,<<"epgsql">>,<<"4.7.1">>},0},
3 | {<<"snabbkaffe">>,
4 | {git,"https://github.com/kafka4beam/snabbkaffe",
5 | {ref,"b59298334ed349556f63405d1353184c63c66534"}},
6 | 0},
7 | {<<"supervisor3">>,{pkg,<<"supervisor3">>,<<"1.1.12">>},0}]}.
8 | [
9 | {pkg_hash,[
10 | {<<"epgsql">>, <<"D4E47CAE46C18C8AFA88E34D59A9B4BAE16368D7CE1EB3DA24FA755EB28393EB">>},
11 | {<<"supervisor3">>, <<"2FAB1AF26BB9F8AE07692BB30EF79D5F1940E1587EFF9C14C6C8B04B16B400A8">>}]},
12 | {pkg_hash_ext,[
13 | {<<"epgsql">>, <<"B6D86B7DC42C8555B1D4E20880E5099D6D6D053148000E188E548F98E4E01836">>},
14 | {<<"supervisor3">>, <<"62BF29F802C8620B7F9609FE5D81212B1AA5A75A7D86876B61CEA73BE58BA2A6">>}]}
15 | ].
16 |
--------------------------------------------------------------------------------
/src/sysmon_int.hrl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------
2 | %% Copyright (c) 2022 k32. All Rights Reserved.
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------
16 | -ifndef(SYSMON_INT_HRL).
17 | -define(SYSMON_INT_HRL, true).
18 |
19 | -include("system_monitor.hrl").
20 | -include_lib("snabbkaffe/include/trace.hrl").
21 |
22 | -define(APP, system_monitor).
23 |
24 | -define(CFG(KEY), system_monitor_lib:cfg(KEY)).
25 |
26 | -define(TS_UNIT, microsecond).
27 |
28 | -endif.
29 |
--------------------------------------------------------------------------------
/src/system_monitor.app.src:
--------------------------------------------------------------------------------
1 | %% -*- mode: erlang -*-
2 | %%--------------------------------------------------------------------------------
3 | %% Copyright 2022 k32
4 | %% Copyright 2020 Klarna Bank AB
5 | %%
6 | %% Licensed under the Apache License, Version 2.0 (the "License");
7 | %% you may not use this file except in compliance with the License.
8 | %% You may obtain a copy of the License at
9 | %%
10 | %% http://www.apache.org/licenses/LICENSE-2.0
11 | %%
12 | %% Unless required by applicable law or agreed to in writing, software
13 | %% distributed under the License is distributed on an "AS IS" BASIS,
14 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | %% See the License for the specific language governing permissions and
16 | %% limitations under the License.
17 | %%--------------------------------------------------------------------------------
18 | {application, system_monitor,
19 | [ {description, "Monitoring app that exports Erlang VM introspection data to the external databases."}
20 | , {licenses, ["Apache 2.0"]}
21 | , {vsn, "git"}
22 | , {registered, []}
23 | , {modules, []}
24 | , {mod, {system_monitor_app, []}}
25 | , {applications, [kernel, stdlib, supervisor3, epgsql]}
26 | , {env,
27 | [ %% Specifies how many topmost processes should be reported per
28 | %% category (such as `top_memory', `top_reductions', etc.)
29 | {top_num_items, 10}
30 | %% Specifies how often process top should be collected (in ms):
31 | , {top_sample_interval, 2000}
32 | %% Specifies sample size for the approximate metrics, such as
33 | %% 'percentage of processes started by an app', and 'percentage
34 | %% of processes running a function':
35 | , {top_sample_size, 1000}
36 | %% Stop reporting exact process data when the number of
37 | %% processes is above this threshold, in order to avoid
38 | %% hammering the VM with introspection BIFs (this doesn't affect
39 | %% approximate monitors that rely on sampling):
40 | , {top_max_procs, 15000}
41 | %% Don't report values to `app_top' and `fun_top' below the
42 | %% threshold as insignificant:
43 | , {top_significance_threshold,
44 | #{ current_function => 0.01 % 1 percent of all processes
45 | , initial_call => 0.01 % 1 percent of all processes
46 | , reductions => 0.01 % 1 percent of total reductions
47 | , abs_reductions => 100 % Absolute number of reductions
48 | , memory => 0.01 % 1 percent of total memory
49 | , num_processes => 100 % absolute number of processes
50 | }}
51 |
52 | %% List of registered processes that should be always reported:
53 | , {vips, [mnesia_tm, mnesia_locker]}
54 |
55 | %% Data reporting callback. It is called whenever the data is collected.
56 | , {callback_mod, system_monitor_dummy}
57 | %% Postgres callback settings:
58 | , {db_hostname, "localhost"}
59 | , {db_port, 5432}
60 | , {db_username, "system_monitor"}
61 | , {db_password, "system_monitor_password"}
62 | , {db_name, "system_monitor"}
63 | , {db_connection_timeout, 5000}
64 |
65 | %% Specify node-specific healthcheck function as `{module(),
66 | %% function()}', for example: `{my_app, node_status}'. This
67 | %% function should return an HTML-formatted status report:
68 | , {node_status_fun, undefined}
69 | %% List of status check functions: The format is
70 | %%
71 | %% `{Module, FunctionName, RunAtTerminate, Interval(Ticks)}'
72 | , {status_checks, [ {system_monitor, check_process_count, false, 30}
73 | , {system_monitor, suspect_procs, false, 5}
74 | ]}
75 | , {tick_interval, 1000}
76 | %% BEAM event settings:
77 | , {beam_events,
78 | [ busy_port
79 | , busy_dist_port
80 | , {long_gc, 500}
81 | , {long_schedule, 500}
82 | ]}
83 | %% Suspect process settings:
84 | , {suspect_procs_max_memory, 524288000} %% 500 MB
85 | , {suspect_procs_max_message_queue_len, 5000}
86 | , {suspect_procs_max_total_heap_size, 524288000} %% 500 MB
87 | ]}
88 | ]}.
89 |
--------------------------------------------------------------------------------
/src/system_monitor.erl:
--------------------------------------------------------------------------------
1 | %% -*- mode: erlang -*-
2 | %%--------------------------------------------------------------------------------
3 | %% Copyright 2022 k32
4 | %% Copyright 2021 Klarna Bank AB
5 | %%
6 | %% Licensed under the Apache License, Version 2.0 (the "License");
7 | %% you may not use this file except in compliance with the License.
8 | %% You may obtain a copy of the License at
9 | %%
10 | %% http://www.apache.org/licenses/LICENSE-2.0
11 | %%
12 | %% Unless required by applicable law or agreed to in writing, software
13 | %% distributed under the License is distributed on an "AS IS" BASIS,
14 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | %% See the License for the specific language governing permissions and
16 | %% limitations under the License.
17 | %%--------------------------------------------------------------------------------
18 | %% @private
19 | -module(system_monitor).
20 |
21 | -behaviour(gen_server).
22 |
23 | %%--------------------------------------------------------------------
24 | %% Include files
25 | %%--------------------------------------------------------------------
26 |
27 | -include("sysmon_int.hrl").
28 |
29 | -include_lib("kernel/include/logger.hrl").
30 |
31 | %% API
32 | -export([ start_link/0
33 | , reset/0
34 |
35 | , get_app_top/0
36 | , get_abs_app_top/0
37 | , get_app_memory/0
38 | , get_app_processes/0
39 | , get_function_top/0
40 | , get_proc_top/0
41 | , get_proc_info/1
42 |
43 | , add_vip/1
44 | , remove_vip/1
45 | ]).
46 |
47 | %% Builtin checks
48 | -export([ check_process_count/0
49 | , suspect_procs/0
50 | ]).
51 |
52 | %% gen_server callbacks
53 | -export([ init/1
54 | , handle_continue/2
55 | , handle_call/3
56 | , handle_cast/2
57 | , handle_info/2
58 | , terminate/2
59 | ]).
60 |
61 | %% Internal exports
62 | -export([report_data/2]).
63 |
64 | -export_type([ function_top/0
65 | ]).
66 |
67 | -include_lib("kernel/include/logger.hrl").
68 |
69 | -define(SERVER, ?MODULE).
70 | -define(TABLE, system_monitor_data_tab).
71 |
72 | -type function_top() :: [{mfa(), number()}].
73 |
74 | -record(state, { monitors = []
75 | , timer_ref
76 | }).
77 |
78 | %%====================================================================
79 | %% API
80 | %%====================================================================
81 |
%% @doc Start the status-check server, registered locally as `?SERVER'.
-spec start_link() -> {ok, pid()} | ignore | {error, term()}.
start_link() ->
  ServerName = {local, ?SERVER},
  gen_server:start_link(ServerName, ?MODULE, [], []).
84 |
%% @doc Ask the server to rebuild its monitor list from the current
%% configuration, which also restarts every tick countdown. The request
%% is a cast, so `ok' is returned immediately.
-spec reset() -> ok.
reset() ->
  Request = reset,
  gen_server:cast(?SERVER, Request).
89 |
%% @doc Add one registered name, or a list of them, to the VIP list.
%% Thin wrapper around `system_monitor_collector:add_vip/1'.
-spec add_vip(atom() | [atom()]) -> ok.
add_vip(NameOrNames) ->
  system_monitor_collector:?FUNCTION_NAME(NameOrNames).
94 |
%% @doc Remove a VIP. Delegates to `system_monitor_collector:remove_vip/1'.
-spec remove_vip(atom()) -> ok.
remove_vip(RegName) ->
  system_monitor_collector:remove_vip(RegName).
99 |
%% @doc Return the process top stored by the latest data report, or
%% `[]' when nothing has been stored yet.
-spec get_proc_top() -> [#erl_top{}].
get_proc_top() ->
  TopKey = proc_top,
  lookup_top(TopKey).
104 |
%% @doc Look up a single process in the stored process top.
%%
%% Accepts a pid or a registered name. Returns `false' when the name is
%% not registered or when the process is not part of the stored top.
-spec get_proc_info(pid() | atom()) -> #erl_top{} | false.
get_proc_info(Pid) when not is_atom(Pid) ->
  Entries = lookup_top(proc_top),
  %% Entries keep the pid in its textual form, hence the conversion:
  PidStr = pid_to_list(Pid),
  lists:keyfind(PidStr, #erl_top.pid, Entries);
get_proc_info(Name) ->
  case whereis(Name) of
    undefined -> false;
    Pid       -> get_proc_info(Pid)
  end.
115 |
%% @doc Relative reduction utilization per application, largest first.
%% Entries not above the `reductions' significance threshold are omitted.
-spec get_app_top() -> [{atom(), float()}].
get_app_top() ->
  KeyPos = #app_top.app,
  ValPos = #app_top.red_rel,
  get_filtered_top(app_top, KeyPos, ValPos, reductions).
121 |
%% @doc Absolute reduction utilization per application, largest first.
%% Entries not above the `abs_reductions' significance threshold are
%% omitted.
-spec get_abs_app_top() -> [{atom(), integer()}].
get_abs_app_top() ->
  KeyPos = #app_top.app,
  ValPos = #app_top.red_abs,
  get_filtered_top(app_top, KeyPos, ValPos, abs_reductions).
127 |
%% @doc Memory utilization per application, largest first. Entries not
%% above the `memory' significance threshold are omitted.
-spec get_app_memory() -> [{atom(), integer()}].
get_app_memory() ->
  KeyPos = #app_top.app,
  ValPos = #app_top.memory,
  get_filtered_top(app_top, KeyPos, ValPos, memory).
132 |
%% @doc Number of processes attributed to each application, largest
%% first. Entries not above the `num_processes' significance threshold
%% are omitted.
-spec get_app_processes() -> [{atom(), integer()}].
get_app_processes() ->
  KeyPos = #app_top.app,
  ValPos = #app_top.processes,
  get_filtered_top(app_top, KeyPos, ValPos, num_processes).
137 |
%% @doc Get the approximate distribution of `initial_call' and
%% `current_function' across processes, each sorted by share, largest
%% first.
-spec get_function_top() -> #{ initial_call     := function_top()
                             , current_function := function_top()
                             }.
get_function_top() ->
  %% Stored rows are `{Function, Share}' pairs, hence positions 1 and 2.
  InitialCalls = get_filtered_top(init_call_top, 1, 2, initial_call),
  CurrentFuns  = get_filtered_top(current_fun_top, 1, 2, current_function),
  #{ initial_call     => InitialCalls
   , current_function => CurrentFuns
   }.
147 |
148 | %%====================================================================
149 | %% gen_server callbacks
150 | %%====================================================================
151 |
%% Set up the server: trap exits so terminate/2 is invoked on shutdown,
%% create the public table that holds the latest tops, start the
%% periodic tick and load the configured status checks. Starting the
%% callback backend is deferred to handle_continue/2.
init([]) ->
  process_flag(trap_exit, true),
  logger:update_process_metadata(#{domain => [system_monitor, status_check]}),
  TableOpts = [public, named_table, set, {keypos, 1}, {write_concurrency, false}],
  ets:new(?TABLE, TableOpts),
  Interval = ?CFG(tick_interval),
  %% The tick message carries our own pid so that handle_info/2 can
  %% tell it apart from unrelated messages:
  {ok, TRef} = timer:send_interval(Interval, {self(), tick}),
  Monitors = init_monitors(),
  {ok, #state{monitors = Monitors, timer_ref = TRef}, {continue, start_callback}}.
166 |
%% Deferred part of init/1: start the configured callback backend. The
%% server crashes with `badmatch' unless the backend returns `ok'.
handle_continue(start_callback, ServerState) ->
  ok = system_monitor_callback:start(),
  {noreply, ServerState}.
170 |
%% No synchronous requests are supported; every call is rejected.
handle_call(_Request, _From, State) ->
  Reply = {error, unknown_call},
  {reply, Reply, State}.
173 |
%% `report_data': store the fresh tops in the data table, one row per
%% top, then forward them to the callback backend.
%% `reset': rebuild the monitor list from the configuration.
%% Anything else is ignored.
handle_cast({report_data, SnapshotTS, ProcTop, AppTop, InitCallTop, CurrentFunTop}, State) ->
  Rows = [ {proc_top,        SnapshotTS, ProcTop}
         , {app_top,         SnapshotTS, AppTop}
         , {init_call_top,   SnapshotTS, InitCallTop}
         , {current_fun_top, SnapshotTS, CurrentFunTop}
         ],
  lists:foreach(fun(Row) -> ets:insert(?TABLE, Row) end, Rows),
  report_node_status(SnapshotTS, ProcTop, AppTop),
  ?tp(sysmon_report_data, #{ts => SnapshotTS}),
  {noreply, State};
handle_cast(reset, State) ->
  Monitors = init_monitors(),
  {noreply, State#state{monitors = Monitors}};
handle_cast(_Msg, State) ->
  {noreply, State}.
186 |
%% On every tick each monitor's countdown is decremented; a monitor
%% whose countdown reaches zero is executed and its countdown restarted.
%% All other messages are dropped.
handle_info({Self, tick}, State) when Self =:= self() ->
  Monitors = [tick_monitor(Monitor)
              || {_, _, _, _, _} = Monitor <- State#state.monitors],
  {noreply, State#state{monitors = Monitors}};
handle_info(_Info, State) ->
  {noreply, State}.

%% Advance one monitor by a single tick, running it when it is due.
%% A crashing check is logged at debug level and does not affect the
%% server or the other checks.
tick_monitor({Module, Function, RunOnTerminate, TicksReset, Ticks}) ->
  case Ticks - 1 of
    0 ->
      try
        apply(Module, Function, [])
      catch
        EC:Error:Stack ->
          logger:debug(
            "system_monitor ~p crashed:~n~p:~p~nStacktrace: ~p~n",
            [{Module, Function}, EC, Error, Stack])
      end,
      {Module, Function, RunOnTerminate, TicksReset, TicksReset};
    Remaining ->
      {Module, Function, RunOnTerminate, TicksReset, Remaining}
  end.
206 |
%% Run, one last time, every monitor whose `RunOnTerminate' flag is
%% `true'.
%%
%% Monitors are stored as
%% `{Module, Function, RunOnTerminate, TicksReset, Ticks}' (see
%% init_monitors/0). Matching on a 4-tuple, as this function used to do,
%% silently skipped every monitor, so no final check was ever executed.
-spec terminate(term(), #state{}) -> any().
terminate(_Reason, State) ->
  %% Possibly, one last check.
  [run_final_check(Module, Function)
   || {Module, Function, true, _TicksReset, _Ticks} <- State#state.monitors].

%% Execute a single check. Failures are logged the same way as in the
%% tick handler so that one broken check cannot prevent the remaining
%% ones from running.
run_final_check(Module, Function) ->
  try
    apply(Module, Function, [])
  catch
    EC:Error:Stack ->
      logger:debug(
        "system_monitor ~p crashed:~n~p:~p~nStacktrace: ~p~n",
        [{Module, Function}, EC, Error, Stack])
  end.
212 |
213 | %%================================================================================
214 | %% Builtin checks
215 | %%================================================================================
216 |
%% @doc Compare the number of live processes with one fifth of the
%% `top_max_procs' setting and emit an "Abnormal process count" warning
%% through `?tp' when the count is above that limit.
-spec check_process_count() -> ok.
check_process_count() ->
  {ok, MaxProcs} = application:get_env(?APP, top_max_procs),
  Count = erlang:system_info(process_count),
  if
    Count > MaxProcs div 5 ->
      ?tp(warning, "Abnormal process count", #{n_procs => Count});
    true ->
      ok
  end.
228 |
%% @doc Log every process of the current process top that reaches at
%% least one of the configured limits on memory, message queue length
%% or total heap size.
suspect_procs() ->
  ProcTop = get_proc_top(),
  Limits = { ?CFG(suspect_procs_max_memory)
           , ?CFG(suspect_procs_max_message_queue_len)
           , ?CFG(suspect_procs_max_total_heap_size)
           },
  Suspects = [Proc || Proc <- ProcTop, is_suspect_proc(Proc, Limits)],
  lists:foreach(fun log_suspect_proc/1, Suspects).
237 |
238 | %%====================================================================
239 | %% Internal exports
240 | %%====================================================================
241 |
%% Hand a freshly collected snapshot over to the server. The data is
%% sent as a cast, so the caller is not blocked.
report_data(SnapshotTS, {ProcTop, AppTop, InitCallTop, CurrentFunTop}) ->
  Msg = {report_data, SnapshotTS, ProcTop, AppTop, InitCallTop, CurrentFunTop},
  gen_server:cast(?SERVER, Msg).
244 |
245 | %%==============================================================================
246 | %% Internal functions
247 | %%==============================================================================
248 |
%% @doc Return the list of initiated monitors: every configured monitor
%% extended with a countdown that starts at its tick period. The result
%% has the shape `{Module, Function, RunOnTerminate, TicksReset, Ticks}'.
%%
%% The function name is an atom (it is passed to `apply/3'), so the
%% spec uses `atom()' rather than `function()', which denotes a fun.
-spec init_monitors() -> [{module(), atom(), boolean(), pos_integer(), pos_integer()}].
init_monitors() ->
  [{Module, Function, RunOnTerminate, Ticks, Ticks}
   || {Module, Function, RunOnTerminate, Ticks} <- monitors()].
254 |
%% @doc Returns the list of monitors, read from the `status_checks'
%% setting. The format is
%%
%% ```{Module, FunctionName, RunAtTerminate, NumberOfTicks}'''
%%
%% `RunAtTerminate' determines whether the monitor is to be run in the
%% terminate gen_server callback, and `NumberOfTicks' is the number of
%% ticks between invocations of the monitor in question. So, if
%% `NumberOfTicks' is 3600 and there is a tick every second, the
%% monitor is run once every hour.
%%
%% `FunctionName' is an atom, so the spec uses `atom()' rather than
%% `function()', which denotes a fun.
-spec monitors() -> [{module(), atom(), boolean(), pos_integer()}].
monitors() ->
  ?CFG(status_checks).
267 |
%% @doc Report node status: push the process top, the application top
%% and the function tops to the callback backend, followed by the node
%% status record.
report_node_status(TS, ProcTop, AppTop) ->
  system_monitor_callback:produce(proc_top, ProcTop),
  system_monitor_callback:produce(app_top, AppTop),
  produce_fun_top(TS),
  %% Node status report goes last, and it "seals" the report for this
  %% time interval:
  NodeReport = node_status_report(),
  system_monitor_callback:produce(node_status,
                                  [{node_status, node(), TS, NodeReport}]).

%% Build the node status text with the user-supplied `node_status_fun'.
%%
%% Returns an empty binary when the function is not configured or when
%% it fails. The conversion to binary happens inside the `try', so a
%% function that returns something other than iodata is treated as a
%% failure instead of crashing the server. Failures are logged rather
%% than dropped silently.
-spec node_status_report() -> binary().
node_status_report() ->
  case application:get_env(?APP, node_status_fun) of
    {ok, {Module, Function}} ->
      try
        iolist_to_binary(Module:Function())
      catch
        EC:Error:Stack ->
          ?LOG_WARNING("node_status_fun ~p failed:~n~p:~p~nStacktrace: ~p~n",
                       [{Module, Function}, EC, Error, Stack],
                       #{domain => [system_monitor]}),
          <<>>
      end;
    _ ->
      <<>>
  end.
289 |
%% Return `{Key, Value}' pairs of the given top, sorted by value in
%% descending order. Pairs whose value is not above the significance
%% threshold configured under `ThresholdKey' (default 0.0001) are
%% dropped.
%%
%% Keys are application names for `app_top' and MFAs for the function
%% tops, hence `atom() | mfa()' in the return type.
-spec get_filtered_top(proc_top | app_top | init_call_top | current_fun_top, byte(), byte(), atom()) ->
        [{atom() | mfa(), number()}].
get_filtered_top(Top, KeyField, ValueField, ThresholdKey) ->
  Threshold = maps:get(ThresholdKey, ?CFG(top_significance_threshold), 0.0001),
  lists:reverse(lists:keysort(2, lookup_top_kv(Top, KeyField, ValueField, Threshold))).

%% Extract the `KeyField' and `ValueField' elements from every entry of
%% the stored top, keeping only the entries whose value is strictly
%% greater than `Threshold'.
-spec lookup_top_kv(proc_top | app_top | init_call_top | current_fun_top, byte(), byte(), number()) ->
        [{atom() | mfa(), number()}].
lookup_top_kv(Top, KeyField, ValueField, Threshold) ->
  lists:filtermap( fun(Record) ->
                       Key = element(KeyField, Record),
                       Val = element(ValueField, Record),
                       case Val > Threshold of
                         true  -> {true, {Key, Val}};
                         false -> false
                       end
                   end
                 , lookup_top(Top)
                 ).
309 |
%% Fetch the values stored for the given top; `[]' when the table has
%% no row for it yet.
-spec lookup_top(proc_top | app_top | init_call_top | current_fun_top) -> list().
lookup_top(Key) ->
  case ets:lookup(?TABLE, Key) of
    [] ->
      [];
    [{Key, _Timestamp, Vals}] ->
      Vals
  end.
316 |
%% Tell whether a process top entry reaches at least one of the given
%% limits. The placeholder entry with pid "!!!" is never a suspect, and
%% a limit set to `undefined' is not checked.
is_suspect_proc(#erl_top{pid = "!!!"}, _) ->
  false;
is_suspect_proc(Proc, {MaxMemory, MaxMqLen, MaxTotalHeapSize}) ->
  #erl_top{ memory            = Memory
          , message_queue_len = MqLen
          , total_heap_size   = TotalHeapSize
          } = Proc,
  reaches_limit(MaxMemory, Memory)
    orelse reaches_limit(MaxMqLen, MqLen)
    orelse reaches_limit(MaxTotalHeapSize, TotalHeapSize).

%% `true' when the limit is defined and the value is at or above it.
reaches_limit(undefined, _Value) ->
  false;
reaches_limit(Limit, Value) ->
  Value >= Limit.
333 |
%% Log one suspect process as a warning in the `system_monitor' domain.
log_suspect_proc(Proc) ->
  %% Rendered before the macro on purpose: ?LOG_WARNING only evaluates
  %% its arguments when the log level is enabled.
  Rendered = system_monitor_lib:erl_top_to_str(Proc),
  Metadata = #{domain => [system_monitor]},
  ?LOG_WARNING("Suspect Proc~n~s", [Rendered], Metadata).
338 |
%% Push both function tops (current function and initial call) to the
%% callback backend, stamped with `TS'.
-spec produce_fun_top(system_monitor_lib:ts()) -> ok.
produce_fun_top(TS) ->
  FunctionTop = get_function_top(),
  produce_fun_top(current_fun_top, maps:get(current_function, FunctionTop), TS),
  produce_fun_top(initial_fun_top, maps:get(initial_call, FunctionTop), TS),
  ok.
347 |
%% Turn `{Function, Share}' pairs into `{Node, TS, Function, Share}'
%% events and hand them to the callback backend under `TopType'.
produce_fun_top(TopType, Values, TS) ->
  Node = node(),
  Events = [{Node, TS, Function, PercentProcesses}
            || {Function, PercentProcesses} <- Values],
  system_monitor_callback:produce(TopType, Events).
355 |
--------------------------------------------------------------------------------
/src/system_monitor_app.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2020 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 | -module(system_monitor_app).
17 |
18 | -behaviour(application).
19 |
20 | -export([start/2, stop/1]).
21 |
%% Application callback: start the top-level supervisor.
start(_StartType, _Args) ->
  system_monitor_sup:start_link().
24 |
%% Application callback: nothing to clean up.
stop(_AppState) ->
  ok.
27 |
--------------------------------------------------------------------------------
/src/system_monitor_callback.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2021 Klarna Bank AB
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------------------
16 |
17 | -module(system_monitor_callback).
18 |
19 | -export([ start/0
20 | , stop/0
21 | , produce/2
22 | , is_configured/0
23 | ]).
24 |
25 | -include("sysmon_int.hrl").
26 |
27 | -callback start() -> ok.
28 | -callback stop() -> ok.
29 | -callback produce(atom(), list()) -> ok.
30 |
%% Start the configured callback backend.
start() ->
  Mod = get_callback_mod(),
  Mod:start().
33 |
%% Stop the configured callback backend.
stop() ->
  Mod = get_callback_mod(),
  Mod:stop().
36 |
%% Pass a batch of events of the given type to the configured callback
%% backend.
produce(Type, Events) ->
  Mod = get_callback_mod(),
  Mod:produce(Type, Events).
39 |
%% Resolve the backend module from the `callback_mod' setting, falling
%% back to `system_monitor_dummy' when it is not set.
-compile({inline, [get_callback_mod/0]}).
get_callback_mod() ->
  Default = system_monitor_dummy,
  application:get_env(?APP, callback_mod, Default).
43 |
%% `true' when a backend other than the dummy one is in effect.
is_configured() ->
  Mod = get_callback_mod(),
  Mod =/= system_monitor_dummy.
46 |
--------------------------------------------------------------------------------
/src/system_monitor_collector.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2022 k32
3 | %% Copyright 2020 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 |
18 | %%% @doc Collect Erlang process statistics and push it to the
19 | %%% configured destination
20 | -module(system_monitor_collector).
21 |
22 | -behaviour(gen_server).
23 |
24 | -include("sysmon_int.hrl").
25 |
26 | %% API
27 | -export([start_link/0, add_vip/1, remove_vip/1]).
28 |
29 | %% gen_server callbacks
30 | -export([init/1, handle_call/3, handle_cast/2, handle_info/2]).
31 |
32 | -define(SERVER, ?MODULE).
33 |
34 | -define(TOP_APP_TAB, sysmon_top_app_tab).
35 | -define(TOP_INIT_CALL, sysmon_top_init_call).
36 | -define(TOP_CURR_FUN, sysmon_top_curr_fun).
37 | -define(TAB_OPTS, [private, named_table, set, {keypos, 1}]).
38 |
39 | -define(COUNT, diceroll_counter).
40 |
41 | %% Type and record definitions
42 |
43 | -define(HIST(PID, REDS, MEM), {PID, REDS, MEM}).
44 |
%% Collector server state.
-record(state,
        { timer               :: timer:tref()             %% Timer that delivers the next `collect_data' message
        , old_data = []       :: [hist()]                 %% Per-process history kept from the previous collection
        , last_ts             :: system_monitor_lib:ts()  %% Timestamp that collect_proc_top/4 subtracts from "now" to get Dt
        , time_to_collect = 0 :: non_neg_integer()        %% Duration of the last collection run, in milliseconds
        }).
51 |
%% Measurements of a single process taken during one collection run.
-record(delta,
        { pid      :: pid()
          %% `[]' is what erlang:process_info/2 reports for a process
          %% that has no registered name:
        , reg_name :: atom() | []
        , reds     :: non_neg_integer()  %% Total reductions
        , dreds    :: non_neg_integer()  %% Reductions gained since the previous sample, divided by Dt
        , memory   :: non_neg_integer()  %% Current memory
          %% Memory change since the previous sample, divided by Dt.
          %% Negative when the process has shrunk:
        , dmemory  :: integer()
        , mql      :: non_neg_integer()  %% Message queue length
        }).
61 |
%% Accumulator threaded through go/3 while the process list is scanned.
-record(top_acc,
        { is_vip        :: #{atom() => _}      %% Set of VIP names (see make_is_vip/0)
        , dt            :: non_neg_integer()   %% Time delta used as the divisor for dreds/dmemory
        , hist_data     :: [hist()]            %% History entries gathered during this scan
        , sample_modulo :: non_neg_integer()   %% Modulus passed to diceroll/1
          %% Tops
        , vips          :: [#delta{}]          %% Deltas of the processes registered under a VIP name
        , memory        :: system_monitor_top:top()  %% Top by #delta.memory
        , dmemory       :: system_monitor_top:top()  %% Top by #delta.dmemory
        , dreds         :: system_monitor_top:top()  %% Top by #delta.dreds
        , mql           :: system_monitor_top:top()  %% Top by #delta.mql
        }).
74 |
75 | -type hist() :: ?HIST(pid(), non_neg_integer(), non_neg_integer()).
76 |
77 | %%%===================================================================
78 | %%% API
79 | %%%===================================================================
80 |
%% @doc Add one registered name, or a list of them, to the VIP list
%% kept in the application environment. Synchronous call to the
%% collector server.
-spec add_vip(atom() | [atom()]) -> ok.
add_vip(RegNames) when is_list(RegNames) ->
  Request = {add_vip, RegNames},
  gen_server:call(?SERVER, Request);
add_vip(RegName) when is_atom(RegName) ->
  add_vip([RegName]).
87 |
%% @doc Remove a VIP. Synchronous call to the collector server.
-spec remove_vip(atom()) -> ok.
remove_vip(RegName) ->
  gen_server:call(?SERVER, {remove_vip, RegName}).
92 |
%% @doc Start the collector server, registered locally as `?SERVER'.
-spec start_link() -> {ok, pid()} | ignore | {error, term()}.
start_link() ->
  ServerName = {local, ?SERVER},
  gen_server:start_link(ServerName, ?MODULE, [], []).
96 |
97 | %%%===================================================================
98 | %%% gen_server callbacks
99 | %%%===================================================================
100 |
%% Reset the sampling counter kept in the process dictionary, schedule
%% the first collection and remember the start timestamp.
init([]) ->
  put(?COUNT, 0),
  {ok, TRef} = timer:send_after(sample_interval(), collect_data),
  StartedAt = system_monitor_lib:timestamp(),
  {ok, #state{timer = TRef, last_ts = StartedAt}}.
107 |
%% The VIP list lives in the application environment; both requests
%% rewrite it there. Unknown requests are rejected with `bad_call'.
handle_call({add_vip, RegNames}, _From, State) ->
  Vips = lists:usort(RegNames ++ ?CFG(vips)),
  application:set_env(?APP, vips, Vips),
  {reply, ok, State};
handle_call({remove_vip, RegName}, _From, State) ->
  Vips = lists:delete(RegName, ?CFG(vips)),
  application:set_env(?APP, vips, Vips),
  {reply, ok, State};
handle_call(_Msg, _From, State) ->
  Reply = {error, bad_call},
  {reply, Reply, State}.
116 |
%% No casts are expected; they are silently dropped.
handle_cast(_Cast, ServerState) ->
  {noreply, ServerState}.
119 |
%% One collection run: gather the process top and the aggregated tops,
%% report them to `system_monitor' and schedule the next run. When the
%% node has more processes than `top_max_procs', only VIP processes are
%% inspected.
handle_info(collect_data, State0) ->
  init_tables(),
  Started = system_monitor_lib:timestamp(),
  NumProcesses = erlang:system_info(process_count),
  TooManyPids = NumProcesses > ?CFG(top_max_procs),
  Pids = lists:sort(pids_to_scan(TooManyPids)),
  {ProcTop, State} = collect_proc_top(State0, Started, Pids, TooManyPids),
  {AppTop, InitCallTop, CurrFunTop} = finalize_aggr_top(Started, NumProcesses),
  %% Report the collected data:
  system_monitor:report_data(Started, {ProcTop, AppTop, InitCallTop, CurrFunTop}),
  %% Prepare for the next iteration:
  Finished = system_monitor_lib:timestamp(),
  LastRunTime = erlang:convert_time_unit(Finished - Started, ?TS_UNIT, millisecond),
  %% Never sleep less than 500 ms, however long the collection took:
  SleepTime = max(500, sample_interval() - LastRunTime),
  erlang:garbage_collect(self()),
  {ok, TRef} = timer:send_after(SleepTime, collect_data),
  {noreply, State#state{timer = TRef, time_to_collect = LastRunTime}};
handle_info(_Info, State) ->
  {noreply, State}.

%% Processes to inspect: all of them, or only the VIPs when there are
%% too many.
pids_to_scan(false) -> processes();
pids_to_scan(true)  -> get_vip_pids().
144 |
145 | %%%===================================================================
146 | %%% Internal functions
147 | %%%===================================================================
148 |
149 | %%--------------------------------------------------------------------
150 | %% Very important processes
151 | %%--------------------------------------------------------------------
152 |
%% Registered names currently configured as VIPs (the `vips' setting).
-spec vip_names() -> [atom()].
vip_names() ->
  Names = ?CFG(vips),
  Names.
156 |
%% Resolve the VIP names to pids, skipping names that are not
%% registered at the moment.
-spec get_vip_pids() -> [pid()].
get_vip_pids() ->
  Alive = [Pid || Name <- vip_names(),
                  Pid <- [whereis(Name)],
                  Pid =/= undefined],
  %% Same element order as prepending while walking the names:
  lists:reverse(Alive).
168 |
%% Build a map used as a set of VIP names: every name maps to `[]'.
-spec make_is_vip() -> #{atom() => []}.
make_is_vip() ->
  lists:foldl( fun(Name, Set) -> Set#{Name => []} end
             , #{}
             , vip_names()
             ).
172 |
173 | %%--------------------------------------------------------------------
174 | %% Proc top collection
175 | %%--------------------------------------------------------------------
176 |
%% Build the process top for the given sorted pid list.
%%
%% `Dt' is the time elapsed since the previous collection (at least 1,
%% to keep it usable as a divisor). `last_ts' is advanced to `Now' on
%% every run. It used to be set only in init/1, which made `Dt' grow
%% for as long as the server lived and skewed every per-interval delta.
%% NOTE(review): `Dt' is expressed in the unit of
%% system_monitor_lib:timestamp/0, which is not visible here.
%%
%% When `TooManyPids' is true the result is prefixed with the
%% placeholder entry made by make_fake_proc/1.
-spec collect_proc_top(#state{}, integer(), [pid()], boolean()) -> {[#erl_top{}], #state{}}.
collect_proc_top(State = #state{old_data = OldData, last_ts = LastTs}, Now, Pids, TooManyPids) ->
  Dt = max(1, Now - LastTs),
  {Deltas, NewData} = top_deltas(OldData, Pids, Dt),
  ProcTop = [make_fake_proc(Now) || TooManyPids] ++ [enrich(I, Now) || I <- Deltas],
  {ProcTop, State#state{old_data = NewData, last_ts = Now}}.

%% Compare the previous history with the current pid list and return
%% the top deltas together with the new history.
%%
%% go/3 walks both lists in ascending pid order, while update_acc/2
%% prepends to the history, leaving it in descending order. The history
%% is therefore reversed before it is handed back. Without the reversal
%% nearly every process was treated as new on the next run and its
%% deltas were computed against zero.
-spec top_deltas([hist()], [pid()], non_neg_integer()) -> {[#delta{}], [hist()]}.
top_deltas(OldData, Pids, Dt) ->
  Acc = go(OldData, Pids, empty_top(length(Pids), Dt)),
  {top_to_list(Acc), lists:reverse(Acc#top_acc.hist_data)}.
188 |
%% Walk the old history and the current pid list side by side and feed
%% every current pid into the accumulator. The comparisons assume both
%% lists are in ascending pid order.
-spec go([hist()], [pid()], #top_acc{}) -> #top_acc{}.
go(_OldL, [], Acc) ->
  %% No pids left. Whatever remains of the old data belongs to
  %% processes that have terminated, discard it:
  Acc;
go([], [Pid | PidL], Acc) ->
  %% Old data exhausted, every remaining pid is a new process:
  go([], PidL, update_acc(?HIST(Pid, 0, 0), Acc));
go(OldL = [Old = ?HIST(OldPid, _, _) | OldRest], PidL = [Pid | PidRest], Acc) ->
  if
    Pid > OldPid ->
      %% OldPid terminated, discard it:
      go(OldRest, PidL, Acc);
    Pid =:= OldPid ->
      %% This is a process that we've seen before:
      go(OldRest, PidRest, update_acc(Old, Acc));
    true ->
      %% This is a new process:
      go(OldL, PidRest, update_acc(?HIST(Pid, 0, 0), Acc))
  end.
206 |
%% Measure one process and fold the result into the accumulator: the
%% tops are updated, the aggregated tops are sampled and a new history
%% entry is recorded. A process that has died in the meantime leaves
%% the accumulator untouched.
-spec update_acc(hist(), #top_acc{}) -> #top_acc{}.
update_acc(?HIST(Pid, OldReds, OldMem), Acc0) ->
  #top_acc{dt = Dt, hist_data = Histories} = Acc0,
  case get_pid_info(Pid) of
    undefined ->
      Acc0;
    {RegName, Reds, Mem, MQL} ->
      Delta = #delta{ pid      = Pid
                    , reg_name = RegName
                    , reds     = Reds
                    , dreds    = (Reds - OldReds) div Dt
                    , memory   = Mem
                    , dmemory  = (Mem - OldMem) div Dt
                    , mql      = MQL
                    },
      {IsChanged, Acc} = maybe_push_to_top(Acc0, Delta),
      %% The dice is rolled for every process, whether or not the tops
      %% have changed:
      Sampled = diceroll(Acc#top_acc.sample_modulo),
      case Sampled orelse IsChanged of
        true  -> maybe_update_aggr_top(Delta);
        false -> ok
      end,
      Acc#top_acc{hist_data = [?HIST(Pid, Reds, Mem) | Histories]}
  end.
233 |
234 | %%--------------------------------------------------------------------
235 | %% Sample top stuff
236 | %%--------------------------------------------------------------------
237 |
%% Account one process in the aggregated tables: bump the counters of
%% its current function and initial call, and add its reductions delta
%% and memory to the application it belongs to. Nothing happens when
%% the process is already gone.
-spec maybe_update_aggr_top(#delta{}) -> ok.
maybe_update_aggr_top(#delta{pid = Pid, dreds = DReds, memory = Memory}) ->
  Items = [current_function, group_leader, initial_call, dictionary],
  case erlang:process_info(Pid, Items) of
    [{current_function, CurrFun}, {group_leader, Leader} | Rest] ->
      InitCall = initial_call(Rest),
      App = case application_controller:get_application(Leader) of
              undefined  -> undefined;
              {ok, Name} -> Name
            end,
      ets:update_counter(?TOP_CURR_FUN, CurrFun, {2, 1}, {CurrFun, 0}),
      ets:update_counter(?TOP_INIT_CALL, InitCall, {2, 1}, {InitCall, 0}),
      ets:update_counter(?TOP_APP_TAB, App, [{2, 1}, {3, DReds}, {4, Memory}], {App, 0, 0, 0}),
      ok;
    undefined ->
      ok
  end.
257 |
%% Create the three private, named tables that hold the aggregated tops
%% of one collection run. They are deleted again by finalize_aggr_top/2.
%%
%% Returns `ok' as the spec promises (the result of the last
%% `ets:new/2' used to leak out instead). Each creation is asserted
%% against the table name, which is what `ets:new/2' returns for a
%% named table.
-spec init_tables() -> ok.
init_tables() ->
  ?TOP_APP_TAB   = ets:new(?TOP_APP_TAB, ?TAB_OPTS),
  ?TOP_CURR_FUN  = ets:new(?TOP_CURR_FUN, ?TAB_OPTS),
  ?TOP_INIT_CALL = ets:new(?TOP_INIT_CALL, ?TAB_OPTS),
  ok.
263 |
%% Read the aggregated tops out of the temporary tables and delete the
%% tables afterwards.
-spec finalize_aggr_top(system_monitor_lib:ts(), non_neg_integer()) ->
        {[#app_top{}], system_monitor:function_top(), system_monitor:function_top()}.
finalize_aggr_top(TS, NProc) ->
  %% Collect data:
  SampleSize  = top_sample_size(),
  CurrFunTop  = filter_nproc_results(?TOP_CURR_FUN, NProc, SampleSize),
  InitCallTop = filter_nproc_results(?TOP_INIT_CALL, NProc, SampleSize),
  AppTop      = filter_app_top(TS),
  %% Cleanup:
  lists:foreach(fun ets:delete/1, [?TOP_APP_TAB, ?TOP_CURR_FUN, ?TOP_INIT_CALL]),
  {AppTop, InitCallTop, CurrFunTop}.
277 |
%% Convert the rows of the application table into `#app_top{}' records.
%% `red_rel' is the application's share of all reductions counted in
%% the table.
-spec filter_app_top(system_monitor_lib:ts()) -> [#app_top{}].
filter_app_top(TS) ->
  Rows = ets:tab2list(?TOP_APP_TAB),
  TotalReds = lists:sum([Reds || {_App, _Procs, Reds, _Mem} <- Rows]),
  %% max/2 guards against division by zero when nothing was counted:
  Factor = 1 / max(1, TotalReds),
  lists:map( fun({App, Procs, Reds, Mem}) ->
                 #app_top{ app       = App
                         , ts        = TS
                         , red_abs   = Reds
                         , red_rel   = Reds * Factor
                         , memory    = Mem
                         , processes = Procs
                         }
             end
           , Rows
           ).
296 |
%% Scale the counters of a function table by the number of processes
%% considered, i.e. the smaller of the process count and the sample
%% size.
filter_nproc_results(Tab, NProc, SampleSize) ->
  Factor = 1 / min(NProc, SampleSize),
  lists:map( fun({Key, Count}) -> {Key, Count * Factor} end
           , ets:tab2list(Tab)
           ).
300 |
301 | %%--------------------------------------------------------------------
302 | %% Top accumulator manipulation
303 | %%--------------------------------------------------------------------
304 |
%% Fresh accumulator for one scan over `NProc' processes. All four tops
%% start out empty with room for `top_num_items' entries.
-spec empty_top(non_neg_integer(), non_neg_integer()) -> #top_acc{}.
empty_top(NProc, Dt) ->
  Blank = system_monitor_top:empty(?CFG(top_num_items)),
  %% Roughly one process out of `Modulo' gets sampled by diceroll/1:
  Modulo = max(1, NProc div top_sample_size()),
  VipSet = make_is_vip(),
  #top_acc{ is_vip        = VipSet
          , dt            = Dt
          , hist_data     = []
          , sample_modulo = Modulo
          , vips          = []
          , memory        = Blank
          , dmemory       = Blank
          , dreds         = Blank
          , mql           = Blank
          }.
319 |
%% Offer a delta to each of the four tops and, for VIP processes, add
%% it to the VIP list. `IsChanged' is true for VIPs and whenever the
%% push into the memory, dreds or mql top reports a change. The dmemory
%% top does not influence it.
-spec maybe_push_to_top(#top_acc{}, #delta{}) -> {IsChanged, #top_acc{}}
          when IsChanged :: boolean().
maybe_push_to_top(Acc, Delta) ->
  #top_acc{is_vip = VipSet, vips = Vips0} = Acc,
  IsVip = maps:is_key(Delta#delta.reg_name, VipSet),
  {IsMem,   Mem}   = system_monitor_top:push(#delta.memory, Delta, Acc#top_acc.memory),
  {IsDReds, DReds} = system_monitor_top:push(#delta.dreds, Delta, Acc#top_acc.dreds),
  {IsMQL,   MQL}   = system_monitor_top:push(#delta.mql, Delta, Acc#top_acc.mql),
  {_,       DMem}  = system_monitor_top:push(#delta.dmemory, Delta, Acc#top_acc.dmemory),
  Vips = case IsVip of
           true  -> [Delta | Vips0];
           false -> Vips0
         end,
  IsChanged = IsVip orelse IsMem orelse IsDReds orelse IsMQL,
  NewAcc = Acc#top_acc{ vips    = Vips
                      , memory  = Mem
                      , dmemory = DMem
                      , dreds   = DReds
                      , mql     = MQL
                      },
  {IsChanged, NewAcc}.
343 |
%% Merge the VIP deltas and the contents of all four tops into a single
%% sorted list without duplicates.
-spec top_to_list(#top_acc{}) -> [#delta{}].
top_to_list(Acc) ->
  #top_acc{ vips    = VIPs
          , memory  = Mem
          , dreds   = DReds
          , dmemory = DMem
          , mql     = MQL
          } = Acc,
  TopLists = [system_monitor_top:to_list(Top) || Top <- [Mem, DReds, DMem, MQL]],
  lists:usort(lists:append([VIPs | TopLists])).
354 |
355 | %%--------------------------------------------------------------------
356 | %% Getting process info
357 | %%--------------------------------------------------------------------
358 |
%% Turn a delta into a full `#erl_top{}' entry by asking the process
%% for the remaining details. When the process has already exited,
%% placeholder values are used instead.
-spec enrich(#delta{}, system_monitor_lib:ts()) -> #erl_top{}.
enrich(Delta, Now) ->
  #delta{ pid      = Pid
        , reg_name = RegName
        , reds     = Reds
        , dreds    = DReds
        , memory   = Memory
        , dmemory  = DMem
        , mql      = MQL
        } = Delta,
  Info = process_info(Pid, [group_leader, initial_call, dictionary, stack_size,
                            heap_size, total_heap_size, current_function,
                            current_stacktrace]),
  {GL, InitialCall, StackSize, HeapSize, Total, CurrentFunction, CurrentStack} =
    case Info of
      undefined ->
        {".?.?>", {'?', '?', 0}, 0, 0, 0, undefined, []};
      [{group_leader, Leader}, {initial_call, _}, {dictionary, _},
       {stack_size, SSize}, {heap_size, HSize}, {total_heap_size, THSize},
       {current_function, CurrFun}, {current_stacktrace, Stack}] ->
        {Leader, initial_call(Info), SSize, HSize, THSize, CurrFun, Stack}
    end,
  #erl_top{ ts                 = Now
          , pid                = pid_to_list(Pid)
          , group_leader       = ensure_list(GL)
          , dreductions        = DReds
          , dmemory            = DMem
          , reductions         = Reds
          , memory             = Memory
          , message_queue_len  = MQL
          , initial_call       = InitialCall
          , registered_name    = RegName
          , stack_size         = StackSize
          , heap_size          = HeapSize
          , total_heap_size    = Total
          , current_stacktrace = CurrentStack
          , current_function   = CurrentFunction
          }.
401 |
%% Fetch the basic measurements of a process, or `undefined' when the
%% process no longer exists.
%%
%% `RegName' is `[]' for a process without a registered name: that is
%% what erlang:process_info/2 reports when `registered_name' is
%% requested as part of an item list.
-spec get_pid_info(pid()) -> {RegName, Reds, Mem, MQL} | undefined
          when RegName :: atom() | [],
               Reds    :: non_neg_integer(),
               Mem     :: non_neg_integer(),
               MQL     :: non_neg_integer().
get_pid_info(Pid) ->
  case erlang:process_info(Pid, [registered_name, reductions, memory, message_queue_len]) of
    [ {registered_name, RegName}
    , {reductions, Reds}
    , {memory, Mem}
    , {message_queue_len, MQL}
    ] ->
      {RegName, Reds, Mem, MQL};
    undefined ->
      undefined
  end.
418 |
%% Extract the initial call from a process info list. For processes
%% started through proc_lib the generic `{proc_lib, init_p, 5}' is
%% replaced by what `proc_lib:translate_initial_call/1' reports.
-spec initial_call(proplists:proplist()) -> mfa().
initial_call(Info) ->
  ICall = proplists:get_value(initial_call, Info),
  if
    ICall =:= {proc_lib, init_p, 5} ->
      proc_lib:translate_initial_call(Info);
    true ->
      ICall
  end.
427 |
428 | %%--------------------------------------------------------------------
429 | %% Misc
430 | %%--------------------------------------------------------------------
431 |
%% Placeholder entry put in front of the process top when the node has
%% too many processes to scan. Its pid is "!!!", its registered name is
%% `too_many_processes' and every numeric field holds a huge constant.
make_fake_proc(Now) ->
  Huge = 99999999999,
  NoFunction = {undefined, undefined, 0},
  #erl_top{ ts                 = Now
          , pid                = "!!!"
          , group_leader       = "!!!"
          , registered_name    = too_many_processes
          , initial_call       = NoFunction
          , current_function   = NoFunction
          , current_stacktrace = []
          , dreductions        = Huge
          , dmemory            = Huge
          , reductions         = Huge
          , memory             = Huge
          , message_queue_len  = Huge
          , stack_size         = Huge
          , heap_size          = Huge
          , total_heap_size    = Huge
          }.
450 |
%% Configured `top_sample_interval'. handle_info/2 compares it with a
%% duration in milliseconds.
sample_interval() ->
  Interval = ?CFG(top_sample_interval),
  Interval.
453 |
%% Configured `top_sample_size': the number of processes sampled for
%% the aggregated tops.
top_sample_size() ->
  Size = ?CFG(top_sample_size),
  Size.
456 |
%% Advance the counter kept in the process dictionary (modulo `Mod')
%% and report whether it was 0 before this call.
diceroll(Mod) ->
  Next = (get(?COUNT) + 1) rem Mod,
  %% put/2 returns the value that was stored previously:
  Previous = put(?COUNT, Next),
  Previous =:= 0.
460 |
%% Convert a pid to its textual form; anything else is returned as is.
ensure_list(Term) ->
  case is_pid(Term) of
    true  -> pid_to_list(Term);
    false -> Term
  end.
465 |
466 | %%%_* Emacs ============================================================
467 | %%% Local Variables:
468 | %%% allout-layout: t
469 | %%% erlang-indent-level: 2
470 | %%% End:
471 |
--------------------------------------------------------------------------------
/src/system_monitor_dummy.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------
2 | %% Copyright (c) k32. All Rights Reserved.
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------
-module(system_monitor_dummy).

%% Declaring the behaviour lets the compiler check that all callbacks
%% (start/0, stop/0, produce/2) are implemented.
-behaviour(system_monitor_callback).

%% API:
-export([start/0, stop/0, produce/2]).

-include("sysmon_int.hrl").

%%================================================================================
%% API functions
%%================================================================================

%% Nothing to start for the dummy backend.
start() ->
  ok.

%% Nothing to stop either. This callback used to be missing, so
%% system_monitor_callback:stop/0 failed with `undef' whenever the
%% default (dummy) backend was in use.
stop() ->
  ok.

%% Emit one `sysmon_produce' trace point per event and discard the data.
%% The underscore-prefixed names are kept as they were in the original
%% code.
produce(_Type, Events) ->
  [?tp(sysmon_produce, #{type => _Type, msg => _Msg, backend => dummy}) || _Msg <- Events],
  ok.
33 |
34 | %%================================================================================
35 | %% Internal functions
36 | %%================================================================================
37 |
--------------------------------------------------------------------------------
/src/system_monitor_events.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2022 k32
3 | %% Copyright 2020 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 | %%% @doc
18 | %%% Print BEAM VM events to the logs
19 | %%%
20 | %%% @end
21 | -module(system_monitor_events).
22 |
23 | -behaviour(gen_server).
24 |
25 | -include("sysmon_int.hrl").
26 |
27 | -export([start_link/0]).
28 |
29 | %% gen_server callbacks
30 | -export([ init/1
31 | , handle_call/3
32 | , handle_cast/2
33 | , handle_info/2
34 | ]).
35 |
%%--------------------------------------------------------------------
%% @doc
%% Start the event logger and register it locally under the module
%% name.
%% @end
%%--------------------------------------------------------------------
-spec start_link() -> {ok, pid()}.
start_link() ->
  ServerName = {local, ?MODULE},
  gen_server:start_link(ServerName, ?MODULE, [], []).
44 |
45 | %%====================================================================
46 | %% gen_server callbacks
47 | %%====================================================================
48 |
%% @private Tag this process's log events with the
%% `[system_monitor, events]' domain, subscribe to the BEAM system
%% monitor, and start with an empty tuple as state.
init([]) ->
  Metadata = #{domain => [system_monitor, events]},
  logger:update_process_metadata(Metadata),
  setup_system_monitor(),
  {ok, {}}.
53 |
%% @private No synchronous requests are supported: every call is
%% answered with `{error, unknown_call}' and the state is left as is.
handle_call(_Req, _Caller, St) ->
  {reply, {error, unknown_call}, St}.
56 |
%% @private Casts are not part of the protocol; they are dropped.
handle_cast(_Cast, St) ->
  {noreply, St}.
59 |
%% @private Handle `{monitor, PidOrPort, EventKind, Info}' messages from
%% the BEAM system monitor: emit an `info' trace point/log entry that
%% describes the event, then hand the raw event to the module named by
%% the `external_monitoring' application variable, if it is set.
%% Every other message is dropped.
handle_info({monitor, PidOrPort, EventKind, Info}, State) ->
  Reference = data_for_reference(PidOrPort),
  InfoText = format_system_event_info(Info),
  ?tp(info, "system monitor event",
      #{ type => EventKind
       , pid_or_port => PidOrPort
       , info => InfoText
       , reference => Reference
       }),
  notify_external_monitoring(application:get_env(?APP, external_monitoring),
                             EventKind, Info),
  {noreply, State};
handle_info(_Other, State) ->
  {noreply, State}.

%% Forward the event to the configured callback module; do nothing when
%% no module is configured.
notify_external_monitoring({ok, Mod}, EventKind, Info) ->
  Mod:system_monitor_event(EventKind, Info);
notify_external_monitoring(undefined, _EventKind, _Info) ->
  ok.
76 |
77 | %%==============================================================================
78 | %% Internal functions
79 | %%==============================================================================
80 |
%%--------------------------------------------------------------------
%% @doc Make the calling process the receiver of the BEAM system
%% monitor events listed in the `beam_events' application variable.
%% Crashes with `badmatch' when that variable is not set.
%%--------------------------------------------------------------------
-spec setup_system_monitor() -> ok.
setup_system_monitor() ->
  MonitorOpts = system_monitor_lib:cfg(beam_events),
  erlang:system_monitor(self(), MonitorOpts),
  ok.
90 |
%% Extra context for the log entry. For a pid this is the text form of
%% its proc-top record, or a fixed note when `system_monitor' has no
%% record for it. For anything else (ports) it is the empty string.
data_for_reference(Pid) when is_pid(Pid) ->
  describe_proc(system_monitor:get_proc_info(Pid));
data_for_reference(_Port) ->
  "".

describe_proc(false) ->
  "Proc not in top";
describe_proc(ProcErlTop) ->
  system_monitor_lib:erl_top_to_str(ProcErlTop).
98 |
%% @doc Render the `Info' part of a system monitor message as text.
%%
%% A list is rendered item by item, in the order given: `{Key, Value}'
%% pairs as `Key=Value ' and any other item as `Value ' (both with a
%% trailing space). A bare port, pid or other term is first wrapped as
%% `[{port, _}]', `[{pid_2, _}]' or `[{info, _}]' respectively.
-spec format_system_event_info(term()) -> io_lib:chars().
format_system_event_info(Info) when is_list(Info) ->
  %% Map instead of fold-and-prepend: the output keeps the order of the
  %% input list rather than coming out reversed.
  lists:map(fun format_info_item/1, Info);
format_system_event_info(Port) when is_port(Port) ->
  format_system_event_info([{port, Port}]);
format_system_event_info(Pid) when is_pid(Pid) ->
  format_system_event_info([{pid_2, Pid}]);
format_system_event_info(Term) ->
  format_system_event_info([{info, Term}]).

%% Render a single item of the info list.
format_info_item({Key, Value}) ->
  io_lib:format("~p=~p ", [Key, Value]);
format_info_item(Value) ->
  io_lib:format("~p ", [Value]).
115 |
116 | %%%_* Emacs ============================================================
117 | %%% Local Variables:
118 | %%% allout-layout: t
119 | %%% erlang-indent-level: 2
120 | %%% End:
121 |
--------------------------------------------------------------------------------
/src/system_monitor_lib.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------
2 | %% Copyright (c) 2022 k32, Ltd. All Rights Reserved.
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------
16 | -module(system_monitor_lib).
17 |
18 | %% @doc Utility functions
19 |
20 | %% API:
21 | -export([ cfg/1
22 | , fmt_mem/1
23 | , fmt_stack/1
24 | , fmt_mfa/1
25 | , find_first/3
26 | , erl_top_to_str/1
27 | , timestamp/0
28 | ]).
29 |
30 | -export_type([ts/0]).
31 |
32 | -include("sysmon_int.hrl").
33 |
34 | %%================================================================================
35 | %% Type declarations
36 | %%================================================================================
37 |
38 | -type ts() :: integer().
39 |
40 | %%================================================================================
41 | %% API functions
42 | %%================================================================================
43 |
%% @private
%% Read a mandatory setting from the application environment. A missing
%% key crashes the caller with `badmatch'.
-spec cfg(atom()) -> _.
cfg(Key) ->
  {ok, Value} = application:get_env(?APP, Key),
  Value.
49 |
%% @doc Format a byte count as text with one decimal and a unit label
%% (`Bytes', `KB', `MB', `GB' or `TB', in steps of 1024).
%%
%% The largest unit that keeps the number at 1.0 or above is used, so
%% exactly 1024 is rendered as `1.0 KB' (not `1024.0 Bytes'). Anything
%% from 1024 GB upwards is shown in `TB'. Returns `io_lib:chars()'.
fmt_mem(Mem) ->
  {Divisor, UnitStr} = mem_unit(Mem),
  io_lib:format("~.1f ~s", [Mem / Divisor, UnitStr]).

%% Pick the divisor and the label used to display `Mem' bytes.
mem_unit(Mem) when Mem < 1024 ->
  {1, "Bytes"};
mem_unit(Mem) when Mem < 1024 * 1024 ->
  {1024, "KB"};
mem_unit(Mem) when Mem < 1024 * 1024 * 1024 ->
  {1024 * 1024, "MB"};
mem_unit(Mem) when Mem < 1024 * 1024 * 1024 * 1024 ->
  {1024 * 1024 * 1024, "GB"};
mem_unit(_Mem) ->
  {1024 * 1024 * 1024 * 1024, "TB"}.
56 |
%% @doc Format a stack trace as an iolist: one frame per line, each
%% rendered by `fmt_mfa/1' and followed by a newline.
fmt_stack(CurrentStack) ->
  lists:map(fun(Frame) -> [fmt_mfa(Frame), "\n"] end, CurrentStack).
59 |
%% @doc Render a call site as `Mod:Fun/Arity'. A 4-tuple whose last
%% element is a property list containing `line' gets ` (Line N)'
%% appended. Terms of any other shape are printed with `~p'.
fmt_mfa({Mod, Fun, Arity, Props}) ->
  fmt_mfa_line({Mod, Fun, Arity}, proplists:get_value(line, Props));
fmt_mfa({Mod, Fun, Arity}) ->
  io_lib:format("~s:~s/~p", [Mod, Fun, Arity]);
fmt_mfa(Other) ->
  io_lib:format("~p", [Other]).

%% Second argument is the `line' property, or `undefined' when absent.
fmt_mfa_line(MFA, undefined) ->
  fmt_mfa(MFA);
fmt_mfa_line({Mod, Fun, Arity}, Line) ->
  io_lib:format("~s:~s/~p (Line ~p)", [Mod, Fun, Arity, Line]).
71 |
%% @doc Return the first element of `List' for which `Pred' returns
%% `true', or `Default' when no element qualifies.
-spec find_first(fun((any()) -> boolean()), [T], Default) -> T | Default.
find_first(Pred, List, Default) ->
  first_match_or(lists:search(Pred, List), Default).

first_match_or({value, Elem}, _Default) -> Elem;
first_match_or(false, Default)          -> Default.
78 |
%% @doc Render "the interesting parts" of an `#erl_top{}' record as
%% multi-line `key=value' text that ends with the formatted stack trace.
%%
%% `memory' is shown raw and formatted by `fmt_mem/1'; `heap_size' and
%% `total_heap_size' are shown raw and, multiplied by the emulator word
%% size, formatted by `fmt_mem/1'.
erl_top_to_str(#erl_top{ registered_name    = RegisteredName
                       , pid                = Pid
                       , initial_call       = InitialCall
                       , memory             = Memory
                       , message_queue_len  = MessageQueueLength
                       , stack_size         = StackSize
                       , heap_size          = HeapSize
                       , total_heap_size    = TotalHeapSize
                       , current_function   = CurrentFunction
                       , current_stacktrace = CurrentStack
                       }) ->
  BytesPerWord = erlang:system_info(wordsize),
  io_lib:format(
    "registered_name=~p~n"
    "offending_pid=~s~n"
    "initial_call=~s~n"
    "memory=~p (~s)~n"
    "message_queue_len=~p~n"
    "stack_size=~p~n"
    "heap_size=~p (~s)~n"
    "total_heap_size=~p (~s)~n"
    "current_function=~s~n"
    "current_stack:~n~s",
    [ RegisteredName
    , Pid
    , ?MODULE:fmt_mfa(InitialCall)
    , Memory, ?MODULE:fmt_mem(Memory)
    , MessageQueueLength
    , StackSize
    , HeapSize, ?MODULE:fmt_mem(BytesPerWord * HeapSize)
    , TotalHeapSize, ?MODULE:fmt_mem(BytesPerWord * TotalHeapSize)
    , ?MODULE:fmt_mfa(CurrentFunction)
    , ?MODULE:fmt_stack(CurrentStack)
    ]).
116 |
%% @doc Current OS system time expressed in `?TS_UNIT' units.
-spec timestamp() -> ts().
timestamp() ->
  Unit = ?TS_UNIT,
  erlang:system_time(Unit).
120 |
121 | %%================================================================================
122 | %% Internal functions
123 | %%================================================================================
124 |
--------------------------------------------------------------------------------
/src/system_monitor_pg.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2022 ieQu1
3 | %% Copyright 2021 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 | -module(system_monitor_pg).
18 |
19 | -behaviour(gen_server).
20 | -export([ start_link/0
21 | , init/1
22 | , handle_continue/2
23 | , handle_call/3
24 | , handle_info/2
25 | , handle_cast/2
26 | , terminate/2
27 |
28 | , connect_options/0
29 | ]).
30 |
31 | -behaviour(system_monitor_callback).
32 | -export([ start/0, stop/0, produce/2 ]).
33 |
34 | -include("sysmon_int.hrl").
35 | -include_lib("kernel/include/logger.hrl").
36 |
37 | -define(SERVER, ?MODULE).
38 | -define(FIVE_SECONDS, 5000).
39 | -define(ONE_HOUR, 60*60*1000).
40 |
41 | %%%_* API =================================================================
%% @doc `system_monitor_callback' entry point: start this module as a
%% worker via `system_monitor_sup:start_child/1'. Crashes with
%% `badmatch' unless the child is started.
start() ->
  {ok, _Pid} = system_monitor_sup:start_child(?MODULE),
  ok.
45 |
%% @doc Stop the writer process with reason `normal', waiting for it to
%% terminate.
stop() ->
  gen_server:stop(?SERVER, normal, infinity).
48 |
%% @doc Hand a batch of events of the given type to the writer process.
%% Asynchronous: returns as soon as the cast is sent.
produce(Type, Events) ->
  Request = {produce, Type, Events},
  gen_server:cast(?SERVER, Request).
51 |
52 | %%%_* Callbacks =================================================================
%% @private Start the writer and register it locally as `?SERVER'.
start_link() ->
  ServerName = {local, ?SERVER},
  gen_server:start_link(ServerName, ?MODULE, [], []).
55 |
%% @private Trap exits, so that exits of linked processes arrive as
%% `{'EXIT', Pid, Reason}' messages, and tag log events with the
%% `[system_monitor, pg]' domain. The database work is deferred to
%% `handle_continue(start_pg, _)'.
init(_Args) ->
  _WasTrapping = erlang:process_flag(trap_exit, true),
  Domain = [system_monitor, pg],
  logger:update_process_metadata(#{domain => Domain}),
  {ok, #{}, {continue, start_pg}}.
60 |
%% @private Deferred part of init: try to connect and create the
%% partitions. When that fails (`undefined'), a `reinitialize' message
%% is scheduled in five seconds. The hourly `mk_partitions' timer is
%% started either way, and the connection (or `undefined') is stored
%% under the `connection' key.
handle_continue(start_pg, State) ->
  Conn = initialize(),
  retry_unless_connected(Conn),
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State#{connection => Conn}}.

retry_unless_connected(undefined) ->
  timer:send_after(?FIVE_SECONDS, reinitialize);
retry_unless_connected(_Conn) ->
  ok.
71 |
%% @private There is no synchronous API; any call is answered `ok'.
handle_call(_Request, _Caller, St) ->
  {reply, ok, St}.
74 |
%% @private Handle exits of linked processes (exits are trapped, see
%% init/1) and the two timer messages.
%%
%% `{'EXIT', Pid, Reason}':
%%   - from the current connection: drop it from the state and schedule
%%     a reconnect in five seconds;
%%   - while there is no connection: schedule a reconnect in five
%%     seconds;
%%   - `normal' exit of any other process: ignored.
%% `mk_partitions': (re)create and prune partitions when connected, and
%%   re-arm the hourly timer.
%% `reinitialize': reconnect, but only when there is no connection.
%% Anything else is logged and dropped.
handle_info({'EXIT', Conn, _Reason}, #{connection := Conn} = State) ->
  %% Forget the dead pid immediately. Otherwise a `produce' cast that
  %% arrives before the reconnect would call into a dead process and
  %% crash this server.
  timer:send_after(?FIVE_SECONDS, reinitialize),
  {noreply, State#{connection := undefined}};
handle_info({'EXIT', _Conn, _Reason}, #{connection := undefined} = State) ->
  timer:send_after(?FIVE_SECONDS, reinitialize),
  {noreply, State};
handle_info({'EXIT', _Conn, normal}, State) ->
  {noreply, State};
handle_info(mk_partitions, #{connection := undefined} = State) ->
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(mk_partitions, #{connection := Conn} = State) ->
  mk_partitions(Conn),
  timer:send_after(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(reinitialize, #{connection := undefined} = State) ->
  {noreply, State#{connection := initialize()}};
handle_info(reinitialize, State) ->
  %% More than one `reinitialize' can be pending at the same time. If
  %% an earlier one already reconnected, opening another connection
  %% here would leak the one currently in use.
  {noreply, State};
handle_info(Unexpected, State) ->
  %% Drain anything else instead of crashing with function_clause.
  ?LOG_WARNING("Unexpected message: ~p", [Unexpected]),
  {noreply, State}.
92 |
%% @private Write a batch of events to the database.
%%
%% Batches are dropped while there is no connection. They are also
%% dropped when this process's own message queue is longer than the
%% `max_message_queue_len' application variable (default 1000).
handle_cast({produce, _Type, _Events}, #{connection := undefined} = State) ->
  {noreply, State};
handle_cast({produce, Type, Events}, #{connection := Conn} = State) ->
  Limit = application:get_env(?APP, max_message_queue_len, 1000),
  {message_queue_len, Backlog} = process_info(self(), message_queue_len),
  case Backlog =< Limit of
    true  -> run_query(Conn, Type, Events);
    false -> ignore
  end,
  {noreply, State}.
104 |
%% @private Close the database connection, if one is open; otherwise
%% just return `ok'.
terminate(_Reason, State) ->
  %% The `connection' key is only added by handle_continue/2, so it is
  %% still missing when the server stops before that step completes.
  case maps:get(connection, State, undefined) of
    undefined -> ok;
    Conn      -> epgsql:close(Conn)
  end.
109 |
110 | %%%_* Internal functions ======================================================
111 |
%% Insert all `Events' of one type as a single batch: parse the INSERT
%% statement for `Type', pair it with the parameters of every event,
%% execute the batch and emit a trace point per result. Crashes with
%% `badmatch' if the statement cannot be parsed.
run_query(Conn, Type, Events) ->
  Sql = query(Type),
  {ok, Stmt} = epgsql:parse(Conn, Sql),
  Batch = lists:map(fun(Event) -> {Stmt, params(Type, Event)} end, Events),
  emit_traces(Type, Events, epgsql:execute_batch(Conn, Batch)).
117 |
%% Walk the events and the batch results in lock step and emit one
%% trace point per result: `system_monitor_pg_query_error' (debug) for
%% an `{error, _}' result, `sysmon_produce' for anything else.
%%
%% The result list may be shorter than the event list: per the epgsql
%% documentation a failing batch item ends the batch, so later items
%% have no result. Events without a result are skipped rather than
%% crashing the server with `function_clause'.
%%
%% `_Evt' keeps its underscore prefix because it is only referenced
%% inside the `?tp' macro.
emit_traces(_Type, _Evts, []) ->
  ok;
emit_traces(Type, [_Evt|Evts], [Result|Results]) ->
  case Result of
    {error, Err} ->
      ?tp(debug, system_monitor_pg_query_error,
          #{ query => Type
           , error => Err
           });
    _Ok ->
      ?tp(sysmon_produce, #{ type => Type
                           , msg => _Evt
                           , backend => pg
                           })
  end,
  emit_traces(Type, Evts, Results).
134 |
%% Open a connection and, on success, bring the partitions up to date.
%% Returns the connection, or `undefined' when connecting failed.
initialize() ->
  with_partitions(connect()).

with_partitions(undefined) ->
  undefined;
with_partitions(Conn) ->
  mk_partitions(Conn),
  Conn.
143 |
%% Try to open a database connection with `connect_options/0'. Any
%% result other than `{ok, Conn}' is logged as a warning and reported
%% to the caller as `undefined'.
connect() ->
  Opts = connect_options(),
  case epgsql:connect(Opts) of
    {ok, Conn} ->
      Conn;
    Failure ->
      ?LOG_WARNING("Failed to open connection to the DB: ~p", [Failure]),
      undefined
  end.
152 |
%% @doc Connection options handed to `epgsql:connect/1'. Every value is
%% obtained through the `?CFG' macro (presumably the application
%% environment -- see sysmon_int.hrl); `codecs' is always empty.
connect_options() ->
  Host     = ?CFG(db_hostname),
  Port     = ?CFG(db_port),
  Username = ?CFG(db_username),
  Password = ?CFG(db_password),
  Database = ?CFG(db_name),
  Timeout  = ?CFG(db_connection_timeout),
  #{ host     => Host
   , port     => Port
   , username => Username
   , password => Password
   , database => Database
   , timeout  => Timeout
   , codecs   => []
   }.
162 |
%% Create the daily partitions for the days ahead and drop old ones.
%%
%% `partition_days_ahead' and `partition_days_behind' are read from the
%% `system_monitor' application environment (both default to 10).
mk_partitions(Conn) ->
  DaysAhead = application:get_env(system_monitor, partition_days_ahead, 10),
  DaysBehind = application:get_env(system_monitor, partition_days_behind, 10),
  %% date() uses local time while event data is in UTC, so start one
  %% day early to make sure there is a partition for current UTC
  %% timestamps.
  FirstDay = calendar:date_to_gregorian_days(date()) - 1,
  Cutoff = FirstDay - DaysBehind,
  Create = fun(Day) -> create_partition_tables(Conn, Day) end,
  Drop = fun(Day) -> delete_partition_tables(Conn, Day) end,
  lists:foreach(Create, lists:seq(FirstDay, FirstDay + DaysAhead)),
  %% Only a nine-day window beyond the cutoff is dropped on each run:
  %% Cutoff - 10 up to and including Cutoff - 2.
  lists:foreach(Drop, lists:seq(Cutoff - 10, Cutoff - 2)).
174 |
%% Create the partition (and its `ts' index) covering the Gregorian day
%% `Day' for every partitioned table. The partition holds rows from
%% that date up to the following date. Crashes via `check_result/1' on
%% an unexpected query result.
create_partition_tables(Conn, Day) ->
  From = to_postgres_date(Day),
  To = to_postgres_date(Day + 1),
  CreateFor =
    fun(Table) ->
        Sql = create_partition_query(Table, Day, From, To),
        check_result(epgsql:squery(Conn, Sql))
    end,
  lists:foreach(CreateFor,
                [ <<"prc">>
                , <<"app_top">>
                , <<"initial_fun_top">>
                , <<"current_fun_top">>
                , <<"node_status">>
                ]).
184 |
%% Validate the result of a partition-creating query.
%%
%% Accepted: a list in which every element is `{ok, [], []}', or an
%% error tuple whose fourth element is `duplicate_table'. Anything else
%% raises `{failed_to_create_partition, What}', where `What' is the
%% offending term or, for a list, the part starting at the first
%% unexpected element.
check_result({error, {error, error, _Code, duplicate_table, _Msg, _Extra}}) ->
  ok;
check_result(Results) when is_list(Results) ->
  IsPlainOk = fun(Result) -> Result =:= {ok, [], []} end,
  case lists:dropwhile(IsPlainOk, Results) of
    []         -> ok;
    Unexpected -> error({failed_to_create_partition, Unexpected})
  end;
check_result(Other) ->
  error({failed_to_create_partition, Other}).
193 |
%% Drop the partition for the Gregorian day `Day' of every partitioned
%% table. Crashes with `badmatch' unless each DROP returns
%% `{ok, [], []}'.
delete_partition_tables(Conn, Day) ->
  %% Must cover every table that create_partition_tables/2 creates
  %% partitions for; otherwise the `initial_fun_top_*' and
  %% `current_fun_top_*' partitions are never removed.
  %% NOTE(review): `fun_top' gets no partitions in this module. It is
  %% kept (harmless with DROP ... IF EXISTS) in case partitions of that
  %% name still exist somewhere -- confirm whether it can go.
  Tables = [ <<"prc">>
           , <<"app_top">>
           , <<"initial_fun_top">>
           , <<"current_fun_top">>
           , <<"node_status">>
           , <<"fun_top">>
           ],
  lists:foreach(fun(Table) ->
                    Query = delete_partition_query(Table, Day),
                    {ok, [], []} = epgsql:squery(Conn, Query)
                end,
                Tables).
201 |
%% Build the SQL (two statements, as one binary) that creates partition
%% `Table_Day' of `Table' for the value range `From' .. `To' and an
%% index named `Table_Day_ts_idx' on its `ts' column. Both statements
%% use IF NOT EXISTS.
create_partition_query(Table, Day, From, To) ->
  Partition = [Table, $_, integer_to_binary(Day)],
  iolist_to_binary(
    [ "CREATE TABLE IF NOT EXISTS ", Partition
    , " PARTITION OF ", Table
    , " FOR VALUES FROM ('", From, "') TO ('", To, "');"
    , "CREATE INDEX IF NOT EXISTS ", Partition, "_ts_idx "
    , "ON ", Partition, "(ts);"
    ]).
210 |
%% Build the SQL that drops partition `Table_Day', if it exists.
delete_partition_query(Table, Day) ->
  iolist_to_binary(["DROP TABLE IF EXISTS ", Table, $_, integer_to_binary(Day), $;]).
213 |
%% Convert a Gregorian day number to a flat `YYYY-MM-DD' string (month
%% and day are zero-padded to two digits).
to_postgres_date(GDays) ->
  Date = calendar:gregorian_days_to_date(GDays),
  lists:flatten(io_lib:format("~w-~2..0w-~2..0w", tuple_to_list(Date))).
217 |
%% Map an event type to its INSERT statement. Unknown types crash with
%% `function_clause'.
query(proc_top) ->
  prc_query();
query(app_top) ->
  app_top_query();
query(node_status) ->
  node_status_query();
query(initial_fun_top) ->
  fun_top_query("initial");
query(current_fun_top) ->
  fun_top_query("current").
228 |
%% INSERT statement for the `prc' table. The column order must match
%% the parameter list built by `params(proc_top, _)'; one `$N'
%% placeholder is generated per column.
prc_query() ->
  Columns = [ "node", "ts", "pid", "dreductions", "dmemory", "reductions"
            , "memory", "message_queue_len", "current_function", "initial_call"
            , "registered_name", "stack_size", "heap_size", "total_heap_size"
            , "current_stacktrace", "group_leader"
            ],
  Placeholders = [[$$ | integer_to_list(N)] || N <- lists:seq(1, length(Columns))],
  iolist_to_binary(
    [ "insert into prc (", lists:join(", ", Columns), ") "
    , "VALUES (", lists:join(", ", Placeholders), ");"
    ]).
234 |
%% INSERT statement for the `app_top' table (seven parameters, in the
%% order produced by `params(app_top, _)').
app_top_query() ->
  <<"insert into app_top "
    "(node, ts, application, red_abs, red_rel, memory, num_processes) "
    "VALUES ($1, $2, $3, $4, $5, $6, $7);">>.
238 |
%% INSERT statement for the `<Top>_fun_top' table, where `Top' is the
%% table-name prefix as iodata (`"initial"' or `"current"' in this
%% module).
fun_top_query(Top) ->
  Prefix = iolist_to_binary(Top),
  <<"insert into ", Prefix/binary,
    "_fun_top(node, ts, fun, percent_processes) VALUES ($1, $2, $3, $4);">>.
244 |
%% INSERT statement for the `node_status' table (three parameters, in
%% the order produced by `params(node_status, _)').
node_status_query() ->
  <<"insert into node_status "
    "(node, ts, data) "
    "VALUES ($1, $2, $3);">>.
247 |
%% Build the positional parameter list for the INSERT statement of the
%% given event type. The order of the elements has to match the column
%% order of the corresponding `*_query' function.
%%
%% For `app_top' and `proc_top' the node column is the local `node()';
%% the other event types carry the node name in the event itself.
params(Top, {Node, TS, Function, PercentProcesses})
  when Top =:= initial_fun_top; Top =:= current_fun_top ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , system_monitor_lib:fmt_mfa(Function)
  , PercentProcesses
  ];
params(app_top, #app_top{} = App) ->
  [ atom_to_binary(node(), latin1)
  , ts_to_timestamp(App#app_top.ts)
  , atom_to_binary(App#app_top.app, latin1)
  , App#app_top.red_abs
  , App#app_top.red_rel
  , App#app_top.memory
  , App#app_top.processes
  ];
params(node_status, {node_status, Node, TS, Data}) ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , Data
  ];
params(proc_top, #erl_top{} = Proc) ->
  [ atom_to_binary(node(), latin1)
  , ts_to_timestamp(Proc#erl_top.ts)
  , Proc#erl_top.pid
  , Proc#erl_top.dreductions
  , Proc#erl_top.dmemory
  , Proc#erl_top.reductions
  , Proc#erl_top.memory
  , Proc#erl_top.message_queue_len
  , system_monitor_lib:fmt_mfa(Proc#erl_top.current_function)
  , system_monitor_lib:fmt_mfa(Proc#erl_top.initial_call)
  , name_to_list(Proc#erl_top.registered_name)
  , Proc#erl_top.stack_size
  , Proc#erl_top.heap_size
  , Proc#erl_top.total_heap_size
  , system_monitor_lib:fmt_stack(Proc#erl_top.current_stacktrace)
  , Proc#erl_top.group_leader
  ].
304 |
%% Convert a system-time value in `?TS_UNIT' units to a UTC
%% `{Date, Time}' tuple.
ts_to_timestamp(TS) ->
  Unit = ?TS_UNIT,
  calendar:system_time_to_universal_time(TS, Unit).
307 |
%% Turn a registered-name value into a string: printable Latin-1
%% strings (including `[]') are returned unchanged, every other term is
%% printed with `~p'.
name_to_list(Term) ->
  printable_or_formatted(io_lib:printable_latin1_list(Term), Term).

printable_or_formatted(true, Str) ->
  Str;
printable_or_formatted(false, Term) ->
  lists:flatten(io_lib:format("~p", [Term])).
315 |
--------------------------------------------------------------------------------
/src/system_monitor_sup.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------------------
2 | %% Copyright 2022 k32
3 | %% Copyright 2020 Klarna Bank AB
4 | %%
5 | %% Licensed under the Apache License, Version 2.0 (the "License");
6 | %% you may not use this file except in compliance with the License.
7 | %% You may obtain a copy of the License at
8 | %%
9 | %% http://www.apache.org/licenses/LICENSE-2.0
10 | %%
11 | %% Unless required by applicable law or agreed to in writing, software
12 | %% distributed under the License is distributed on an "AS IS" BASIS,
13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | %% See the License for the specific language governing permissions and
15 | %% limitations under the License.
16 | %%--------------------------------------------------------------------------------
17 | -module(system_monitor_sup).
18 |
19 | %% TODO: Dialyzer doesn't like this one:
20 | %-behaviour(supervisor3).
21 |
22 | %% External exports
23 | -export([start_link/0, start_child/1]).
24 |
25 | %% supervisor callbacks
26 | -export([init/1, post_init/1]).
27 |
28 | %%--------------------------------------------------------------------
29 | %% Macros
30 | %%--------------------------------------------------------------------
31 | -define(SERVER, ?MODULE).
32 | -define(SUP2, system_monitor2_sup).
33 |
34 | %%%----------------------------------------------------------------------
35 | %%% API
36 | %%%----------------------------------------------------------------------
%% Start the top-level supervisor, registered locally as `?SERVER'.
%% `?SERVER' is also the init argument, which selects the top-level
%% clause of init/1.
start_link() ->
  RegName = {local, ?SERVER},
  supervisor3:start_link(RegName, ?MODULE, ?SERVER).
39 |
%% Start module `Name' as an additional worker under the second-level
%% supervisor (`?SUP2').
start_child(Name) ->
  ChildSpec = worker(Name),
  supervisor3:start_child(?SUP2, ChildSpec).
42 |
43 | %%%----------------------------------------------------------------------
44 | %%% Callback functions from supervisor
45 | %%%----------------------------------------------------------------------
46 |
%% Child specification helpers.
%%
%% server/3 builds the child spec tuple for module `Name': the child id
%% is the module name and the child is started via `Name:start_link()'.
%% The restart element is `{permanent, 15}', a supervisor3-specific
%% form (presumably "permanent, restart after a 15 s delay" -- TODO
%% confirm against supervisor3).
%% server/2 uses a shutdown timeout of 2000 ms; worker/1 additionally
%% fixes the child type to `worker'.
server(Name, Type) ->
  server(Name, Type, 2000).

server(Name, Type, Shutdown) ->
  StartMFA = {Name, start_link, []},
  Restart = {permanent, 15},
  {Name, StartMFA, Restart, Shutdown, Type, [Name]}.

worker(Name) ->
  server(Name, worker).
54 |
%% supervisor3 callback; this supervisor has nothing to do here and
%% returns `ignore'.
post_init(_Args) ->
  ignore.
57 |
%% supervisor3 callback shared by both supervision levels; the init
%% argument selects the level.
init(?SERVER) ->
  %% The top level supervisor *does not allow restarts* (intensity 0).
  %% According to the original author's note, a crash of a component
  %% directly under it is expected to bring the whole node down and
  %% restart it, so only components that must never be unavailable
  %% belong here. Its only child is the second-level supervisor.
  SecondSupStart = {supervisor3, start_link, [{local, ?SUP2}, ?MODULE, ?SUP2]},
  SecondSup = {?SUP2, SecondSupStart, permanent, 2000, supervisor, [?MODULE]},
  NoRestarts = {one_for_one, 0, 1},
  {ok, {NoRestarts, [SecondSup]}};
init(?SUP2) ->
  %% The second-level supervisor allows some restarts (intensity 10,
  %% period 20). This is where the normal services live.
  RestartPolicy = {one_for_one, 10, 20},
  Services = [ worker(system_monitor_collector)
             , worker(system_monitor_events)
             , worker(system_monitor)
             ],
  {ok, {RestartPolicy, Services}}.
81 |
--------------------------------------------------------------------------------
/src/system_monitor_top.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------
2 | %% Copyright (c) 2022 k32. All Rights Reserved.
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------
16 | -module(system_monitor_top).
17 |
18 | %% API:
19 | -export([empty/1, push/3, to_list/1]).
20 |
21 | -export_type([top/0]).
22 |
23 | -ifdef(TEST).
24 | -include_lib("proper/include/proper.hrl").
25 | -include_lib("eunit/include/eunit.hrl").
26 | -endif. % TEST
27 |
28 | %%================================================================================
29 | %% Type declarations
30 | %%================================================================================
31 |
32 | -record(top,
33 | { minimum :: non_neg_integer()
34 | , size :: non_neg_integer()
35 | , max_size :: non_neg_integer()
36 | , data :: gb_trees:tree(non_neg_integer(), [tuple()])
37 | }).
38 |
39 | -opaque top() :: #top{}.
40 |
41 | %%================================================================================
42 | %% API functions
43 | %%================================================================================
44 |
%% @doc Create an empty top list that keeps at most `MaxItems' entries.
-spec empty(non_neg_integer()) -> top().
empty(MaxItems) ->
  #top{ data     = gb_trees:empty()
      , max_size = MaxItems
      , size     = 0
      , minimum  = 0
      }.
52 |
%% @doc Return the stored tuples as one flat list, in ascending key
%% order (the order of `gb_trees:values/1').
-spec to_list(top()) -> [tuple()].
to_list(#top{data = Tree}) ->
  [Item || Bucket <- gb_trees:values(Tree), Item <- Bucket].
56 |
%% @doc Offer tuple `Val' to the top list; the key is element `FieldID'
%% of `Val'. Returns `{Changed, NewTop}'.
%%
%% - A top list with `max_size = 0' never accepts anything.
%% - While the list is not full, the tuple is always added.
%% - Once it is full, the tuple is added only if its key is greater
%%   than the current minimum key, in which case one entry stored under
%%   the smallest key is evicted to make room.
-spec push(integer(), tuple(), top()) -> {Changed, top()}
          when Changed :: boolean().
push(_FieldID, _Val, #top{max_size = 0} = Top) ->
  {false, Top};
push(FieldID, Val, #top{size = Size, max_size = MaxSize, data = Tree0} = Top)
  when Size < MaxSize ->
  Tree = gb_insert(element(FieldID, Val), Val, Tree0),
  {Smallest, _} = gb_trees:smallest(Tree),
  {true, Top#top{ size    = Size + 1
                , minimum = Smallest
                , data    = Tree
                }};
push(FieldID, Val, #top{minimum = OldMin, max_size = MaxSize, data = Tree0} = Top) ->
  case element(FieldID, Val) of
    Key when OldMin < Key ->
      Tree = gb_insert(Key, Val, evict_smallest(Tree0)),
      {Smallest, _} = gb_trees:smallest(Tree),
      {true, Top#top{ minimum = Smallest
                    , size    = MaxSize
                    , data    = Tree
                    }};
    _NotBigEnough ->
      {false, Top}
  end.

%% Remove one entry stored under the smallest key: the head of that
%% key's list. The key itself disappears when this was its only entry.
evict_smallest(Tree0) ->
  case gb_trees:take_smallest(Tree0) of
    {_Key, [_], Tree} ->
      Tree;
    {Key, [_ | Rest], Tree} ->
      gb_trees:enter(Key, Rest, Tree)
  end.
97 |
98 | %%================================================================================
99 | %% Internal functions
100 | %%================================================================================
101 |
%% Add `Val' to the list stored under `Key', creating the list when the
%% key is new. New values go to the front of the list.
gb_insert(Key, Val, Tree) ->
  Existing =
    case gb_trees:lookup(Key, Tree) of
      none         -> [];
      {value, Old} -> Old
    end,
  gb_trees:enter(Key, [Val | Existing], Tree).
109 |
110 | %%%===================================================================
111 | %%% Tests
112 | %%%===================================================================
113 |
114 | -ifdef(TEST).
115 |
%% PropEr generator: a list of 1-tuples, each holding a non-negative
%% integer. Used as the input data of the property below.
tuples() ->
  list({non_neg_integer()}).
118 |
%% Pushing every element of a list through push/3 is just an optimized
%% version of sorting the list and then keeping its last (largest) N
%% elements.
%%
%% Check that it is indeed true: for a random size limit (0..10) and a
%% random list that has at least that many elements, the content of the
%% top list must equal the tail of the sorted list.
maybe_push_to_top_same_as_sort_prop() ->
  ?FORALL({NItems, L}, {range(0, 10), tuples()},
          ?IMPLIES(
             length(L) >= NItems,
             begin
               %% Expected result: the NItems largest elements, ascending.
               Reference = lists:nthtail(length(L) - NItems, lists:sort(L)),
               %% Actual result: push every element, keyed on field 1.
               Top = lists:foldl( fun(I, Acc0) ->
                                      {_, Acc} = push(1, I, Acc0),
                                      Acc
                                  end
                                , empty(NItems)
                                , L
                                ),
               ?assertEqual(Reference, to_list(Top)),
               true
             end)).
139 |
%% EUnit entry point: run the property above with 1000 generated cases.
maybe_push_to_top_test() ->
  Property = proper:numtests(1000, maybe_push_to_top_same_as_sort_prop()),
  ?assertEqual(true, proper:quickcheck(Property)).
146 |
147 | -endif.
148 |
--------------------------------------------------------------------------------
/test/sysmon_SUITE.erl:
--------------------------------------------------------------------------------
1 | %%--------------------------------------------------------------------
2 | %% Copyright 2022 k32
3 | %%
4 | %% Licensed under the Apache License, Version 2.0 (the "License");
5 | %% you may not use this file except in compliance with the License.
6 | %% You may obtain a copy of the License at
7 | %%
8 | %% http://www.apache.org/licenses/LICENSE-2.0
9 | %%
10 | %% Unless required by applicable law or agreed to in writing, software
11 | %% distributed under the License is distributed on an "AS IS" BASIS,
12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | %% See the License for the specific language governing permissions and
14 | %% limitations under the License.
15 | %%--------------------------------------------------------------------
16 | -module(sysmon_SUITE).
17 |
18 | -compile(export_all).
19 | -compile(nowarn_export_all).
20 |
21 | -include("sysmon_int.hrl").
22 | -include_lib("snabbkaffe/include/snabbkaffe.hrl").
23 | -include_lib("stdlib/include/assert.hrl").
24 |
25 | %%================================================================================
26 | %% behavior callbacks
27 | %%================================================================================
28 |
%% Common Test callback: every exported function of arity 1 whose name
%% starts with `t_' is a test case.
all() ->
  Exports = ?MODULE:module_info(exports),
  [Name || {Name, Arity} <- Exports,
           Arity =:= 1,
           lists:prefix("t_", atom_to_list(Name))].
31 |
%% Suite setup: load the application, override a few settings for the
%% tests, (re)start the docker environment, and remember the resulting
%% application environment under `old_conf' so that
%% end_per_testcase/2 can restore it after each test case.
init_per_suite(Config) ->
  snabbkaffe:fix_ct_logging(),
  application:load(?APP),
  Thresholds = #{ current_function => 0
                , initial_call     => 0
                , reductions       => 0
                , abs_reductions   => 0
                , memory           => 0
                , num_processes    => 1
                },
  SuiteEnv = [ {vips, [some_random_name|vips()]}
             , {top_sample_interval, 1000}
             , {tick_interval, 100}
             , {top_significance_threshold, Thresholds}
             ],
  lists:foreach(fun({Key, Val}) -> application:set_env(?APP, Key, Val) end,
                SuiteEnv),
  docker_cleanup(),
  ?assertMatch(0, docker_startup()),
  [{old_conf, application:get_all_env(?APP)} | Config].
50 |
%% Suite teardown: clean up the docker environment.
end_per_suite(_Cfg) ->
  docker_cleanup(),
  ok.
54 |
%% Log a banner at the start of every test case.
init_per_testcase(TestCase, Config) ->
  Banner = asciiart:visible($%, "Starting ~p", [TestCase]),
  logger:notice(Banner),
  Config.
58 |
%% Log a banner, stop snabbkaffe, and write the application environment
%% captured by init_per_suite/1 (`old_conf') back, so that overrides
%% made by one test case do not carry over to the next.
end_per_testcase(TestCase, Config) ->
  Banner = asciiart:visible($%, "Complete ~p", [TestCase]),
  logger:notice(Banner),
  snabbkaffe:stop(),
  OldConf = proplists:get_value(old_conf, Config),
  lists:foreach(fun({Key, Val}) -> application:set_env(?APP, Key, Val) end,
                OldConf),
  Config.
64 |
65 | %%================================================================================
66 | %% Tests
67 | %%================================================================================
68 |
%% Smoke test: start the application, generate some load, wait for ten
%% `sysmon_report_data' trace events and check the shape of what the
%% query API returns. The application is stopped in any case.
%% Afterwards the collected trace is checked by check_produce_seal/1
%% and check_produce_vips/1.
t_start(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:ensure_all_started(?APP),
       spawn_procs(100, 1000, 10000),
       %% Wait several events:
       [?block_until(#{?snk_kind := sysmon_report_data}, infinity, 0) || _ <- lists:seq(1, 10)],
       %% Each per-application top must start with an {App, Number} pair:
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_app_top()),
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_abs_app_top()),
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_app_memory()),
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_app_processes()),
       %% The function top must contain {{M, F, A}, Number} entries for
       %% both initial calls and current functions:
       ?assertMatch( #{ initial_call := [{{M1, F1, A1}, V1}|_]
                      , current_function := [{{M2, F2, A2}, V2}|_]
                      } when is_atom(M1) andalso is_atom(M2) andalso is_atom(F1) andalso is_atom(F2) andalso
                             is_number(A1) andalso is_number(A2) andalso is_number(V1) andalso is_number(V2)
                   , system_monitor:get_function_top()
                   )
     after
       application:stop(?APP)
     end,
     [ fun ?MODULE:check_produce_seal/1
     , fun ?MODULE:check_produce_vips/1
     ]).
93 |
%% With `top_max_procs' set to 1, the process top is expected to hold a
%% placeholder entry (pid "!!!", registered name `too_many_processes'),
%% while the VIP processes can still be looked up individually.
%% Afterwards the collected trace is checked by check_produce_seal/1
%% and check_produce_vips/1.
t_too_many_procs(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:set_env(?APP, top_max_procs, 1),
       application:ensure_all_started(?APP),
       ?block_until(#{?snk_kind := sysmon_report_data}),
       Top = system_monitor:get_proc_top(),
       %% Check that "warning" process is there:
       ?assertMatch( #erl_top{pid = "!!!", group_leader = "!!!", registered_name = too_many_processes}
                   , lists:keyfind("!!!", #erl_top.pid, Top)
                   ),
       %% Check that the VIPs are still there:
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(system_monitor_collector)),
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(system_monitor)),
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(application_controller)),
       %% Misc checks: `some_random_name' was added to the VIP list in
       %% init_per_suite/1, yet no info is expected for it.
       ?assertMatch(false, system_monitor:get_proc_info(some_random_name))
     after
       application:stop(?APP)
     end,
     [ fun ?MODULE:check_produce_seal/1
     , fun ?MODULE:check_produce_vips/1
     ]).
118 |
%% Verifies that the VIP list can be changed at run time: after adding
%% `global_name_server' and removing `system_monitor', the next report
%% must contain info about the former but not about the latter.
t_add_remove_vips(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:set_env(?APP, top_max_procs, 1),
       application:ensure_all_started(?APP),
       %% Change the VIP list and wait for a report event:
       ?wait_async_action( begin
                             system_monitor:add_vip(global_name_server),
                             system_monitor:remove_vip(system_monitor)
                           end
                         , #{?snk_kind := sysmon_report_data}
                         ),
       ProcTop = system_monitor:get_proc_top(),
       %% The "warning" pseudo-process must be part of the top:
       Warning = lists:keyfind("!!!", #erl_top.pid, ProcTop),
       ?assertMatch( #erl_top{pid = "!!!", group_leader = "!!!", registered_name = too_many_processes}
                   , Warning
                   ),
       %% The removed VIP is gone...
       ?assertMatch(false, system_monitor:get_proc_info(system_monitor)),
       %% ...while the remaining and the newly added ones are reported:
       lists:foreach( fun(VIP) ->
                          ?assertMatch(#erl_top{}, system_monitor:get_proc_info(VIP))
                      end
                    , [application_controller, global_name_server]
                    )
     after
       application:stop(?APP)
     end,
     []).
144 |
%% Runs the application with the `system_monitor_pg' callback module
%% and waits until `proc_top' records for both the "too_many_processes"
%% warning entry and the `system_monitor' process have been produced by
%% the pg backend. The trace specs then check that no query failed and
%% that every kind of record was produced at least once.
t_postgres(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:set_env(?APP, top_max_procs, 1),
       application:set_env(?APP, db_name, "postgres"),
       application:set_env(?APP, callback_mod, system_monitor_pg),
       application:ensure_all_started(?APP),
       link(whereis(system_monitor_pg)), % if it crashes we will know
       {ok, _} = ?block_until(#{?snk_kind := sysmon_produce, backend := pg, type := proc_top,
                                msg := Msg} when Msg#erl_top.registered_name =:= too_many_processes),
       {ok, _} = ?block_until(#{?snk_kind := sysmon_produce, backend := pg, type := proc_top,
                                msg := Msg} when Msg#erl_top.registered_name =:= system_monitor)
     after
       %% The process may be unregistered at this point (it never started
       %% or it has already terminated). `unlink(undefined)' would raise
       %% `badarg' here, which would replace the original failure and
       %% prevent the application from being stopped.
       case whereis(system_monitor_pg) of
         undefined -> ok;
         PgPid     -> unlink(PgPid)
       end,
       application:stop(?APP)
     end,
     [ fun ?MODULE:no_pg_query_failures/1
     , fun ?MODULE:success_proc_top_queries/1
     , fun ?MODULE:success_app_top_queries/1
     , fun ?MODULE:success_fun_top_queries/1
     , fun ?MODULE:success_node_status_queries/1
     ]).
168 |
%% Checks that the status checks keep producing the
%% "Abnormal process count" event, also after a deliberately crashing
%% check (failing_check/0) has been put in front of the check list.
t_builtin_checks(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       ProcCount = erlang:system_info(process_count),
       InitialEnv = [ {suspect_procs_max_memory, 1}
                    , {top_max_procs, ProcCount * 2}
                    , {node_status_fun, {?MODULE, node_status}}
                    ],
       [application:set_env(?APP, Key, Val) || {Key, Val} <- InitialEnv],
       application:ensure_all_started(?APP),
       ?block_until(#{?snk_kind := "Abnormal process count"}),
       %% Prepend a check that always crashes to make sure that it does
       %% not prevent the other checks from running:
       FailingCheck = {?MODULE, failing_check, false, 1},
       OldChecks = ?CFG(status_checks),
       application:set_env(?APP, status_checks, [FailingCheck|OldChecks]),
       system_monitor:reset(),
       ?block_until(#{?snk_kind := sysmon_failing_check_run}, infinity, 0),
       ?block_until(#{?snk_kind := "Abnormal process count"}, infinity, 0)
     after
       application:stop(?APP)
     end,
     []).
190 |
%% Sends hand-crafted `monitor' messages (two `long_gc' ones, for a
%% live and for a made-up pid, and a `long_schedule' one for a port)
%% to the `system_monitor_events' process and waits until each of them
%% shows up in the trace as a "system monitor event" of the same type.
t_events(_) ->
  ?check_trace(
     %% Same timetrap as in the other test cases:
     #{timetrap => 30000},
     try
       application:ensure_all_started(?APP),
       ?block_until(#{?snk_kind := sysmon_report_data}),
       %% Every element is a well-formed {Tag, Value} pair, like in the
       %% messages sent by the runtime system:
       GCInfo = [ {timeout, 100}
                , {heap_size, 42}
                , {heap_block_size, 42}
                , {stack_size, 42}
                , {mbuf_size, 42}
                , {old_heap_size, 42}
                , {old_heap_block_size, 42}
                ],
       ?wait_async_action( system_monitor_events ! {monitor, whereis(system_monitor), long_gc, GCInfo}
                         , #{?snk_kind := "system monitor event", type := long_gc}
                         ),
       ?wait_async_action( system_monitor_events ! {monitor, list_to_pid("<0.42.42>"), long_gc, GCInfo}
                         , #{?snk_kind := "system monitor event", type := long_gc}
                         ),
       PortInfo = [{timeout, 42}, {port_op, timeout}],
       ?wait_async_action( system_monitor_events ! {monitor, hd(erlang:ports()), long_schedule, PortInfo}
                         , #{?snk_kind := "system monitor event", type := long_schedule}
                         )
     after
       application:stop(?APP)
     end,
     []).
212 |
213 | %%================================================================================
214 | %% Trace specs
215 | %%================================================================================
216 |
%% Trace spec: fails unless the trace is free of
%% `system_monitor_pg_query_error' events.
no_pg_query_failures(Trace) ->
  QueryErrors = ?of_kind(system_monitor_pg_query_error, Trace),
  ?assertMatch([], QueryErrors).
219 |
%% Trace spec: `true' if the pg backend produced at least one
%% `proc_top' record, `false' otherwise.
success_proc_top_queries(Events) ->
  contains_type(proc_top, Events).
222 |
%% Trace spec: `true' if the pg backend produced at least one
%% `app_top' record, `false' otherwise.
success_app_top_queries(Events) ->
  contains_type(app_top, Events).
225 |
%% Trace spec: `true' if the pg backend produced both an
%% `initial_fun_top' and a `current_fun_top' record, `false' otherwise.
success_fun_top_queries(Trace) ->
  lists:all( fun(Type) -> contains_type(Type, Trace) end
           , [initial_fun_top, current_fun_top]
           ).
228 |
%% Trace spec: `true' if the pg backend produced at least one
%% `node_status' record, `false' otherwise.
success_node_status_queries(Events) ->
  contains_type(node_status, Events).
231 |
%% Returns `true' if `Trace' contains a `sysmon_produce' event emitted
%% by the pg backend whose `type' field equals `Type', and `false'
%% otherwise.
contains_type(Type, Trace) ->
  IsWanted = ?match_event(#{?snk_kind := sysmon_produce, backend := pg, type := T}
                          when T =:= Type),
  lists:any(IsWanted, Trace).
237 |
%% Trace spec: asserts strict causality between the `node_status'
%% records (`sysmon_produce' events) and the `sysmon_report_data'
%% events found in the trace.
check_produce_seal(Trace) ->
  IsCausal = ?strict_causality( #{?snk_kind := sysmon_produce, type := node_status}
                              , #{?snk_kind := sysmon_report_data}
                              , Trace
                              ),
  ?assert(IsCausal).
244 |
%% Trace spec: for every process listed by vips/0, asserts strict
%% causality between the `proc_top' records produced for that process
%% and the `sysmon_report_data' events. Returns `ok'.
check_produce_vips(Trace) ->
  lists:foreach( fun(VIP) ->
                     ?assert(
                        ?strict_causality( #{?snk_kind := sysmon_produce, type := proc_top, msg := Msg}
                                             when Msg#erl_top.registered_name =:= VIP
                                         , #{?snk_kind := sysmon_report_data}
                                         , Trace
                                         ))
                 end
               , vips()
               ).
253 |
254 | %%================================================================================
255 | %% Internal functions
256 | %%================================================================================
257 |
%% Status check that always crashes with `deliberate'. It emits the
%% `sysmon_failing_check_run' trace point first, so that a test can
%% tell that the check has actually been invoked.
failing_check() ->
  ?tp(sysmon_failing_check_run, #{}),
  erlang:error(deliberate).
261 |
%% Spawns `N' unlinked processes running idle_loop/3 with the calling
%% process as `Parent' and the given sleep bounds. Returns `ok'.
spawn_procs(N, MinSleep, MaxSleep) ->
  LoopArgs = [self(), MinSleep, MaxSleep],
  _ = [erlang:spawn(?MODULE, idle_loop, LoopArgs) || _ <- lists:seq(1, N)],
  ok.
269 |
%% Body of the helper processes started by spawn_procs/3: sleeps for
%% `MinSleep' plus a random 1..(MaxSleep - MinSleep) milliseconds and
%% then hands over to a freshly spawned process running the same
%% function with the same arguments. `Parent' is only passed along to
%% the successor; it is not used otherwise.
idle_loop(Parent, MinSleep, MaxSleep) ->
  Jitter = rand:uniform(MaxSleep - MinSleep),
  ok = timer:sleep(MinSleep + Jitter),
  erlang:spawn(?MODULE, idle_loop, [Parent, MinSleep, MaxSleep]).
273 |
%% Registered names of the processes that check_produce_vips/1 expects
%% to find in the produced process top.
vips() ->
  [ system_monitor
  , system_monitor_collector
  , application_controller
  ].
276 |
%% Callback that t_builtin_checks/1 configures as `node_status_fun';
%% returns a fixed status string.
node_status() ->
  Status = "this is my status",
  Status.
279 |
%% Starts the `sysmondb' container in the background via exec/1 and
%% returns the exit status of the `docker run' command.
%%
%% The command is assembled from an argument list into a single line,
%% so it does not depend on a shell interpreting backslash-newline
%% continuations embedded in a string literal.
docker_startup() ->
  Args = [ "docker", "run", "-d"
         , "--name", "sysmondb"
         , "-p", "5432:5432"
         , "-e", "SYSMON_PASS=system_monitor_password"
         , "-e", "GRAFANA_PASS=system_monitor_password"
         , "-e", "POSTGRES_PASSWORD=system_monitor_password"
         , "ghcr.io/k32/sysmon-postgres:1.0.0"
         ],
  %% open_port/2 needs a flat string, hence lists:append/1:
  exec(lists:append(lists:join(" ", Args))).
286 |
%% Kills the `sysmondb' container and then force-removes it. The exit
%% status of the `kill' command is ignored; the one of the `rm'
%% command is returned.
docker_cleanup() ->
  _KillStatus = exec("docker kill sysmondb"),
  exec("docker rm -f sysmondb").
290 |
%% Runs `CMD' as an external program through a port, with stderr
%% redirected to stdout and the output delivered line by line. Blocks
%% until the program terminates and returns its exit status; the
%% output is printed by collect_port_output/1.
-spec exec(file:filename()) -> integer().
exec(CMD) ->
  PortOpts = [exit_status, binary, stderr_to_stdout, {line, 300}],
  collect_port_output(open_port({spawn, CMD}, PortOpts)).
301 |
%% Consumes the messages of `Port' (expected to be opened in line mode
%% with the `exit_status' option): every chunk of output is printed to
%% `user' with a "docker: " prefix, and the loop ends by returning the
%% exit status once the `exit_status' message has been received.
-spec collect_port_output(port()) -> integer().
collect_port_output(Port) ->
  receive
    {Port, {exit_status, Status}} ->
      Status;
    {Port, {data, {_Eol, Chunk}}} ->
      io:format(user, "docker: ~s~n", [Chunk]),
      collect_port_output(Port)
  end.
311 |
--------------------------------------------------------------------------------