├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── doc ├── app_top.png ├── proc_history.png └── proc_top.png ├── include └── system_monitor.hrl ├── rebar.config ├── rebar.lock ├── src ├── sysmon_int.hrl ├── system_monitor.app.src ├── system_monitor.erl ├── system_monitor_app.erl ├── system_monitor_callback.erl ├── system_monitor_collector.erl ├── system_monitor_dummy.erl ├── system_monitor_events.erl ├── system_monitor_lib.erl ├── system_monitor_pg.erl ├── system_monitor_sup.erl └── system_monitor_top.erl └── test └── sysmon_SUITE.erl /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | erlang: 9 | - otp: "24" 10 | rebar3: "3.20" 11 | - otp: "25" 12 | rebar3: "3.22" 13 | - otp: "26" 14 | rebar3: "3.22" 15 | - otp: "27" 16 | rebar3: "3.24" 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Install Erlang/OTP 22 | uses: erlef/setup-beam@v1 23 | with: 24 | otp-version: ${{ matrix.erlang.otp }} 25 | rebar3-version: ${{ matrix.erlang.rebar3 }} 26 | 27 | - name: Run tests 28 | run: make 29 | 30 | - name: Archive common test results 31 | if: ${{ always() }} 32 | uses: actions/upload-artifact@v4 33 | with: 34 | name: CT results 35 | path: _build/test/logs/** 36 | retention-days: 1 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.beam 3 | ebin/ 4 | .idea 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 
4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [3.0.2] - 2022-02-08 9 | 10 | Drop support for postgres replay queue. Batch insert operations in 11 | postgres backend. 12 | 13 | ## [3.0.0] - 2022-02-06 14 | 15 | Optimized top collection for systems with millions of processes. 16 | Added "very important processes" feature: some registered processes 17 | are always collected to the top. Added CI and improved test suite. 18 | Major refactoring. Hard fork from the [Klarna version](https://github.com/klarna-incubator/system_monitor). 19 | 20 | Warning: the table schema has changed! See: [example schema](https://github.com/k32/grafana-dashboards/blob/master/postgres/20-schema.sql) 21 | 22 | ## [2.2.0] - 2021-11-05 23 | 24 | Added support for configuring a module to use to send system_monitor events to 25 | an external destination. 26 | 27 | ## [2.1.0] - 2021-10-20 28 | 29 | Data format of system\_monitor\_top is changed to keep static data between 30 | ticks. Since this gen server is started by a supervisor that allows for some 31 | restarts, you can either let the server crash or stop+start this application. 32 | 33 | ## [2.0.0] - 2021-04-07 34 | 35 | Replace Kafka backend with a configurable one that defaults into Postgres 36 | 37 | ## [1.0.0] - 2020-09-02 38 | 39 | Initial version 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 
12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: 3 | rebar3 do compile, dialyzer, eunit, ct --readable=false, cover 4 | 5 | .PHONY: clean 6 | clean: 7 | rm -rf _build 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # system_monitor 2 | > Erlang telemetry collector 3 | 4 | `system_monitor` is a BEAM VM monitoring and introspection application 5 | that helps with troubleshooting live systems. It collects various 6 | information about Erlang and Elixir processes and applications. 7 | 8 | Unlike `observer`, `system_monitor` does not require connecting to 9 | the monitored system via Erlang distribution protocol, and can be used 10 | to monitor systems with very tight access restrictions. It can happily 11 | monitor systems with millions of processes. 12 | 13 | By default the data is stored in a Postgres database, and visualized 14 | using Grafana. 
Ready to use docker images of 15 | [Postgres](https://github.com/k32/grafana-dashboards/pkgs/container/sysmon-postgres) 16 | with the necessary schema and 17 | [Grafana](https://github.com/k32/grafana-dashboards/pkgs/container/sysmon-grafana) 18 | with the dashboards are provided. See 19 | [documentation](https://github.com/k32/grafana-dashboards). 20 | 21 | ## Features 22 | 23 | ### Process top 24 | 25 | Information about the top N Erlang processes consuming the most resources 26 | (such as reductions or memory), or having the longest message queues, is 27 | presented on the process top dashboard: 28 | 29 | ![Process top](doc/proc_top.png) 30 | 31 | Historical data can be accessed via standard Grafana time 32 | picker. `status` panel can display important information about the 33 | node state. Pids of the processes on that dashboard are clickable 34 | links that lead to the process history dashboard. 35 | 36 | ### Process history 37 | ![Process history](doc/proc_history.png) 38 | 39 | Process history dashboard displays time series data about a certain 40 | Erlang process. Note that some data points can be missing if the 41 | process didn't consume enough resources to appear in the process top. 42 | 43 | ### Application top 44 | ![Application top](doc/app_top.png) 45 | 46 | Application top dashboard contains various information aggregated per 47 | OTP application. 48 | 49 | ## Usage example 50 | 51 | In order to integrate `system_monitor` into your system, simply add it 52 | to the release apps. Add the following lines to `rebar.config`: 53 | 54 | ```erlang 55 | {deps, 56 | [ {system_monitor, {git, "https://github.com/k32/system_monitor", {tag, "3.0.2"}}} 57 | ]}. 58 | 59 | {relx, 60 | [ {release, {my_release, "1.0.0"}, 61 | [kernel, sasl, ..., system_monitor]} 62 | ]}. 
63 | ``` 64 | 65 | Or to `mix.exs` for Elixir: 66 | 67 | ```elixir 68 | defp deps() do 69 | [ 70 | {:system_monitor, github: "k32/system_monitor", tag: "3.0.2"} 71 | ] 72 | end 73 | ``` 74 | 75 | To enable export to Postgres: 76 | 77 | ```erlang 78 | application:load(system_monitor), 79 | application:set_env(system_monitor, callback_mod, system_monitor_pg) 80 | ``` 81 | 82 | ### Custom node status 83 | 84 | `system_monitor` can export arbitrary node status information that is 85 | deemed important for the operator. This is done by defining a callback 86 | function that returns an HTML-formatted string (or iolist): 87 | 88 | ```erlang 89 | -module(foo). 90 | 91 | -export([node_status/0]). 92 | 93 | node_status() -> 94 | ["my node type<br/>", 95 | case healthy() of 96 | true -> "UP<br/>"; 97 | false -> "DEGRADED<br/>" 98 | end, 99 | io_lib:format("very important value=~p", [very_important_value()]) 100 | ]. 101 | ``` 102 | 103 | This callback then needs to be added to the system_monitor application 104 | environment: 105 | 106 | ```erlang 107 | application:set_env(system_monitor, node_status_fun, {?MODULE, node_status}) 108 | ``` 109 | 110 | More information about configurable options and the defaults is found 111 | [here](src/system_monitor.app.src). 112 | 113 | ### What are the preconfigured monitors 114 | 115 | * `check_process_count` 116 | Logs if the process_count passes a certain threshold 117 | * `suspect_procs` 118 | Logs if it detects processes with suspiciously high memory 119 | 120 | `system_monitor_pg` allows for Postgres being temporarily down by storing the stats in its own internal buffer. 121 | This buffer is built with a sliding window that will stop the state from growing too big whenever 122 | Postgres is down for too long. On top of this `system_monitor_pg` has a built-in load 123 | shedding mechanism that protects itself once the message queue length grows bigger than a certain level. 124 | 125 | ## Release History 126 | 127 | See our [changelog](CHANGELOG.md). 
128 | 129 | ## License 130 | 131 | Copyright © 2020 Klarna Bank AB 132 | Copyright © 2021-2022 k32 133 | -------------------------------------------------------------------------------- /doc/app_top.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ieQu1/system_monitor/a0c537677fadb4dc8d8eedffb9394b9908dd3ae2/doc/app_top.png -------------------------------------------------------------------------------- /doc/proc_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ieQu1/system_monitor/a0c537677fadb4dc8d8eedffb9394b9908dd3ae2/doc/proc_history.png -------------------------------------------------------------------------------- /doc/proc_top.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ieQu1/system_monitor/a0c537677fadb4dc8d8eedffb9394b9908dd3ae2/doc/proc_top.png -------------------------------------------------------------------------------- /include/system_monitor.hrl: -------------------------------------------------------------------------------- 1 | %%-------------------------------------------------------------------------------- 2 | %% Copyright 2022 k32 3 | %% Copyright 2020 Klarna Bank AB 4 | %% 5 | %% Licensed under the Apache License, Version 2.0 (the "License"); 6 | %% you may not use this file except in compliance with the License. 7 | %% You may obtain a copy of the License at 8 | %% 9 | %% http://www.apache.org/licenses/LICENSE-2.0 10 | %% 11 | %% Unless required by applicable law or agreed to in writing, software 12 | %% distributed under the License is distributed on an "AS IS" BASIS, 13 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | %% See the License for the specific language governing permissions and 15 | %% limitations under the License. 
16 | %%-------------------------------------------------------------------------------- 17 | -ifndef(SYSTEM_MONITOR_HRL). 18 | -define(SYSTEM_MONITOR_HRL, true). 19 | 20 | -record(erl_top, 21 | { ts :: system_monitor_lib:ts() 22 | , pid :: string() 23 | , dreductions :: integer() 24 | , dmemory :: integer() 25 | , reductions :: integer() 26 | , memory :: integer() %% bytes 27 | , message_queue_len :: integer() 28 | , current_function :: mfa() 29 | , initial_call :: mfa() 30 | , registered_name :: atom() | [] 31 | , stack_size :: integer() 32 | , heap_size :: integer() %% words 33 | , total_heap_size :: integer() %% words 34 | , current_stacktrace :: list() 35 | , group_leader :: string() 36 | }). 37 | 38 | -record(app_top, 39 | { app :: atom() 40 | , ts :: system_monitor_lib:ts() 41 | , red_abs :: integer() 42 | , red_rel :: float() 43 | , memory :: integer() 44 | , processes :: integer() 45 | }). 46 | 47 | -endif. 48 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | %% -*- mode:erlang -*- 2 | {erl_opts, 3 | [debug_info, warnings_as_errors]}. 4 | 5 | {deps, 6 | [ {supervisor3, "1.1.12"} 7 | , {epgsql, "4.7.1"} 8 | , {snabbkaffe, {git, "https://github.com/kafka4beam/snabbkaffe", {tag, "1.0.10"}}} 9 | ]}. 10 | 11 | {dialyzer, [{warnings, [unknown]}]}. 12 | 13 | {profiles, 14 | [ {test, [ {deps, [ {proper, "1.4.0"} 15 | ]} 16 | , {cover_enabled, true} 17 | ]} 18 | , {dev, 19 | [{plugins, [rebar3_hex]}]} 20 | ]}. 21 | 22 | {cover_enabled, true}. 23 | {cover_opts, [verbose]}. 24 | {cover_export_enabled, true}. 
25 | -------------------------------------------------------------------------------- /rebar.lock: -------------------------------------------------------------------------------- 1 | {"1.2.0", 2 | [{<<"epgsql">>,{pkg,<<"epgsql">>,<<"4.7.1">>},0}, 3 | {<<"snabbkaffe">>, 4 | {git,"https://github.com/kafka4beam/snabbkaffe", 5 | {ref,"b59298334ed349556f63405d1353184c63c66534"}}, 6 | 0}, 7 | {<<"supervisor3">>,{pkg,<<"supervisor3">>,<<"1.1.12">>},0}]}. 8 | [ 9 | {pkg_hash,[ 10 | {<<"epgsql">>, <<"D4E47CAE46C18C8AFA88E34D59A9B4BAE16368D7CE1EB3DA24FA755EB28393EB">>}, 11 | {<<"supervisor3">>, <<"2FAB1AF26BB9F8AE07692BB30EF79D5F1940E1587EFF9C14C6C8B04B16B400A8">>}]}, 12 | {pkg_hash_ext,[ 13 | {<<"epgsql">>, <<"B6D86B7DC42C8555B1D4E20880E5099D6D6D053148000E188E548F98E4E01836">>}, 14 | {<<"supervisor3">>, <<"62BF29F802C8620B7F9609FE5D81212B1AA5A75A7D86876B61CEA73BE58BA2A6">>}]} 15 | ]. 16 | -------------------------------------------------------------------------------- /src/sysmon_int.hrl: -------------------------------------------------------------------------------- 1 | %%-------------------------------------------------------------------- 2 | %% Copyright (c) 2022 k32. All Rights Reserved. 3 | %% 4 | %% Licensed under the Apache License, Version 2.0 (the "License"); 5 | %% you may not use this file except in compliance with the License. 6 | %% You may obtain a copy of the License at 7 | %% 8 | %% http://www.apache.org/licenses/LICENSE-2.0 9 | %% 10 | %% Unless required by applicable law or agreed to in writing, software 11 | %% distributed under the License is distributed on an "AS IS" BASIS, 12 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | %% See the License for the specific language governing permissions and 14 | %% limitations under the License. 15 | %%-------------------------------------------------------------------- 16 | -ifndef(SYSMON_INT_HRL). 17 | -define(SYSMON_INT_HRL, true). 18 | 19 | -include("system_monitor.hrl"). 
20 | -include_lib("snabbkaffe/include/trace.hrl"). 21 | 22 | -define(APP, system_monitor). 23 | 24 | -define(CFG(KEY), system_monitor_lib:cfg(KEY)). 25 | 26 | -define(TS_UNIT, microsecond). 27 | 28 | -endif. 29 | -------------------------------------------------------------------------------- /src/system_monitor.app.src: -------------------------------------------------------------------------------- 1 | %% -*- mode: erlang -*- 2 | %%-------------------------------------------------------------------------------- 3 | %% Copyright 2022 k32 4 | %% Copyright 2020 Klarna Bank AB 5 | %% 6 | %% Licensed under the Apache License, Version 2.0 (the "License"); 7 | %% you may not use this file except in compliance with the License. 8 | %% You may obtain a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, software 13 | %% distributed under the License is distributed on an "AS IS" BASIS, 14 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | %% See the License for the specific language governing permissions and 16 | %% limitations under the License. 17 | %%-------------------------------------------------------------------------------- 18 | {application, system_monitor, 19 | [ {description, "Monitoring app that exports Erlang VM introspection data to the external databases."} 20 | , {licenses, ["Apache 2.0"]} 21 | , {vsn, "git"} 22 | , {registered, []} 23 | , {modules, []} 24 | , {mod, {system_monitor_app, []}} 25 | , {applications, [kernel, stdlib, supervisor3, epgsql]} 26 | , {env, 27 | [ %% Specifies how many topmost processes should be reported per 28 | %% category (such as `top_memory', `top_reductions', etc.) 
29 | {top_num_items, 10} 30 | %% Specifies how often process top should be collected (in ms): 31 | , {top_sample_interval, 2000} 32 | %% Specifies sample size for the approximate metrics, such as 33 | %% 'percentage of processes started by an app', and 'percentage 34 | %% of processes running a function': 35 | , {top_sample_size, 1000} 36 | %% Stop reporting exact process data when the number of 37 | %% processes is above this threshold, in order to avoid 38 | %% hammering the VM with introspection BIFs (this doesn't affect 39 | %% approximate monitors that rely on sampling): 40 | , {top_max_procs, 15000} 41 | %% Don't report values to `app_top' and `fun_top' below the 42 | %% threshold as insignificant: 43 | , {top_significance_threshold, 44 | #{ current_function => 0.01 % 1 percent of all processes 45 | , initial_call => 0.01 % 1 percent of all processes 46 | , reductions => 0.01 % 1 percent of total reductions 47 | , abs_reductions => 100 % Absolute number of reductions 48 | , memory => 0.01 % 1 percent of total memory 49 | , num_processes => 100 % absolute number of processes 50 | }} 51 | 52 | %% List of registered processes that should be always reported: 53 | , {vips, [mnesia_tm, mnesia_locker]} 54 | 55 | %% Data reporting callback. It is called whenever the data is collected. 56 | , {callback_mod, system_monitor_dummy} 57 | %% Postgres callback settings: 58 | , {db_hostname, "localhost"} 59 | , {db_port, 5432} 60 | , {db_username, "system_monitor"} 61 | , {db_password, "system_monitor_password"} 62 | , {db_name, "system_monitor"} 63 | , {db_connection_timeout, 5000} 64 | 65 | %% Specify node-specific healthcheck function as `{module(), 66 | %% function()}', for example: `{my_app, node_status}'. 
This 67 | %% function should return an HTML-formatted status report: 68 | , {node_status_fun, undefined} 69 | %% List of status check functions: The format is 70 | %% 71 | %% `{Module, FunctionName, RunAtTerminate, Interval(Ticks)}' 72 | , {status_checks, [ {system_monitor, check_process_count, false, 30} 73 | , {system_monitor, suspect_procs, false, 5} 74 | ]} 75 | , {tick_interval, 1000} 76 | %% BEAM event settings: 77 | , {beam_events, 78 | [ busy_port 79 | , busy_dist_port 80 | , {long_gc, 500} 81 | , {long_schedule, 500} 82 | ]} 83 | %% Suspect process settings: 84 | , {suspect_procs_max_memory, 524288000} %% 500 MB 85 | , {suspect_procs_max_message_queue_len, 5000} 86 | , {suspect_procs_max_total_heap_size, 524288000} %% 500 MB 87 | ]} 88 | ]}. 89 | -------------------------------------------------------------------------------- /src/system_monitor.erl: -------------------------------------------------------------------------------- 1 | %% -*- mode: erlang -*- 2 | %%-------------------------------------------------------------------------------- 3 | %% Copyright 2022 k32 4 | %% Copyright 2021 Klarna Bank AB 5 | %% 6 | %% Licensed under the Apache License, Version 2.0 (the "License"); 7 | %% you may not use this file except in compliance with the License. 8 | %% You may obtain a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, software 13 | %% distributed under the License is distributed on an "AS IS" BASIS, 14 | %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | %% See the License for the specific language governing permissions and 16 | %% limitations under the License. 17 | %%-------------------------------------------------------------------------------- 18 | %% @private 19 | -module(system_monitor). 20 | 21 | -behaviour(gen_server). 

%%--------------------------------------------------------------------
%% Include files
%%--------------------------------------------------------------------

-include("sysmon_int.hrl").

-include_lib("kernel/include/logger.hrl").

%% API
-export([ start_link/0
        , reset/0

        , get_app_top/0
        , get_abs_app_top/0
        , get_app_memory/0
        , get_app_processes/0
        , get_function_top/0
        , get_proc_top/0
        , get_proc_info/1

        , add_vip/1
        , remove_vip/1
        ]).

%% Builtin checks
-export([ check_process_count/0
        , suspect_procs/0
        ]).

%% gen_server callbacks
-export([ init/1
        , handle_continue/2
        , handle_call/3
        , handle_cast/2
        , handle_info/2
        , terminate/2
        ]).

%% Internal exports
-export([report_data/2]).

-export_type([ function_top/0
             ]).

%% Name under which the server process is registered locally.
-define(SERVER, ?MODULE).
%% Named ETS table that holds the most recently reported tops.
-define(TABLE, system_monitor_data_tab).

-type function_top() :: [{mfa(), number()}].

-record(state, { monitors = []
               , timer_ref
               }).

%%====================================================================
%% API
%%====================================================================

%% @doc Start the server and register it locally as `system_monitor'.
-spec start_link() -> {ok, pid()} | ignore | {error, term()}.
start_link() -> gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).

%% @doc Reset monitors. The request is an asynchronous cast, so `ok'
%% is returned before the server has re-initialized its monitors.
-spec reset() -> ok.
reset() ->
  gen_server:cast(?SERVER, reset).

%% @doc Add a VIP (a single registered name or a list of names).
%% The call is delegated to `system_monitor_collector'.
-spec add_vip(atom() | [atom()]) -> ok.
add_vip(NameOrNames) ->
  system_monitor_collector:add_vip(NameOrNames).

%% @doc Remove a VIP. The call is delegated to
%% `system_monitor_collector'.
-spec remove_vip(atom()) -> ok.
remove_vip(RegName) ->
  system_monitor_collector:remove_vip(RegName).
%% @doc Return the most recently reported Erlang process top
-spec get_proc_top() -> [#erl_top{}].
get_proc_top() ->
  lookup_top(proc_top).

%% @doc Return the process top entry of a single process, given either
%% its pid or its registered name. Returns `false' when the name is not
%% registered or the process is not present in the top.
-spec get_proc_info(pid() | atom()) -> #erl_top{} | false.
get_proc_info(RegName) when is_atom(RegName) ->
  Resolved = whereis(RegName),
  Resolved =/= undefined andalso get_proc_info(Resolved);
get_proc_info(Pid) ->
  %% Pids are stored in the top in their textual form:
  lists:keyfind(pid_to_list(Pid), #erl_top.pid, get_proc_top()).

%% @doc Relative share of reductions per application, largest first
-spec get_app_top() -> [{atom(), float()}].
get_app_top() ->
  get_filtered_top(app_top, #app_top.app, #app_top.red_rel, reductions).

%% @doc Absolute reductions per application, largest first
-spec get_abs_app_top() -> [{atom(), integer()}].
get_abs_app_top() ->
  get_filtered_top(app_top, #app_top.app, #app_top.red_abs, abs_reductions).

%% @doc Memory used per application, largest first
-spec get_app_memory() -> [{atom(), integer()}].
get_app_memory() ->
  get_filtered_top(app_top, #app_top.app, #app_top.memory, memory).

%% @doc Number of processes belonging to each application, largest first
-spec get_app_processes() -> [{atom(), integer()}].
get_app_processes() ->
  get_filtered_top(app_top, #app_top.app, #app_top.processes, num_processes).

%% @doc Approximate distribution of initial_call and current_function
%% over the processes of the node
-spec get_function_top() -> #{ initial_call     := function_top()
                             , current_function := function_top()
                             }.
get_function_top() ->
  %% Both tops are stored as `{MFA, Share}' pairs, hence key 1 / value 2:
  InitialCallTop = get_filtered_top(init_call_top, 1, 2, initial_call),
  CurrentFunTop = get_filtered_top(current_fun_top, 1, 2, current_function),
  #{ initial_call     => InitialCallTop
   , current_function => CurrentFunTop
   }.
%%====================================================================
%% gen_server callbacks
%%====================================================================

%% Create the public data table, arm the periodic tick timer and load
%% the configured status checks. Starting the callback backend is
%% deferred to handle_continue/2.
init([]) ->
  process_flag(trap_exit, true),
  logger:update_process_metadata(#{domain => [system_monitor, status_check]}),
  TableOpts = [ public
              , named_table
              , set
              , {keypos, 1}
              , {write_concurrency, false}
              ],
  ets:new(?TABLE, TableOpts),
  %% The tick message carries our own pid, see handle_info/2:
  {ok, TickTimer} = timer:send_interval(?CFG(tick_interval), {self(), tick}),
  InitialState = #state{ monitors  = init_monitors()
                       , timer_ref = TickTimer
                       },
  {ok, InitialState, {continue, start_callback}}.

%% Start the configured callback module once init/1 has returned.
handle_continue(start_callback, State) ->
  ok = system_monitor_callback:start(),
  {noreply, State}.

%% No synchronous requests are supported.
handle_call(_Request, _From, State) ->
  {reply, {error, unknown_call}, State}.

%% `report_data': store the freshly collected tops in the data table
%% and forward them to the callback module.
%% `reset': re-read the list of status checks.
handle_cast({report_data, SnapshotTS, ProcTop, AppTop, InitCallTop, CurrentFunTop}, State) ->
  Tops = [ {proc_top,        ProcTop}
         , {app_top,         AppTop}
         , {init_call_top,   InitCallTop}
         , {current_fun_top, CurrentFunTop}
         ],
  lists:foreach( fun({Key, Top}) ->
                     ets:insert(?TABLE, {Key, SnapshotTS, Top})
                 end
               , Tops
               ),
  report_node_status(SnapshotTS, ProcTop, AppTop),
  ?tp(sysmon_report_data, #{ts => SnapshotTS}),
  {noreply, State};
handle_cast(reset, State) ->
  {noreply, State#state{monitors = init_monitors()}};
handle_cast(_Msg, State) ->
  {noreply, State}.
%% Periodic tick: count every status check down by one and run those
%% whose counter has expired.
handle_info({Self, tick}, State) when Self =:= self() ->
  Monitors = [run_monitor_tick(Monitor)
              || {_, _, _, _, _} = Monitor <- State#state.monitors],
  {noreply, State#state{monitors = Monitors}};
handle_info(_Info, State) ->
  {noreply, State}.

%% Advance a single status check by one tick. When its counter reaches
%% zero the check is executed and the counter is reset to `TicksReset'.
%% A crashing check is logged and does not affect the other checks.
run_monitor_tick({Module, Function, RunOnTerminate, TicksReset, 1}) ->
  try
    apply(Module, Function, [])
  catch
    EC:Error:Stack ->
      logger:debug(
        "system_monitor ~p crashed:~n~p:~p~nStacktrace: ~p~n",
        [{Module, Function}, EC, Error, Stack])
  end,
  {Module, Function, RunOnTerminate, TicksReset, TicksReset};
run_monitor_tick({Module, Function, RunOnTerminate, TicksReset, Ticks}) ->
  {Module, Function, RunOnTerminate, TicksReset, Ticks - 1}.

%% Run the status checks that are flagged to run at termination one
%% last time.
%%
%% Entries of `State#state.monitors' are 5-tuples
%% `{Module, Function, RunOnTerminate, TicksReset, Ticks}' (see
%% init_monitors/0), so the generator pattern must have the same arity,
%% and the check must be called in the module it was configured with.
-spec terminate(term(), #state{}) -> any().
terminate(_Reason, State) ->
  %% Possibly, one last check.
  [apply(Module, Function, []) ||
    {Module, Function, true, _TicksReset, _Ticks} <- State#state.monitors].

%%================================================================================
%% Builtin checks
%%================================================================================

%% @doc Check the number of processes and emit a warning when it
%% exceeds one fifth of the configured `top_max_procs'.
-spec check_process_count() -> ok.
check_process_count() ->
  {ok, MaxProcs} = application:get_env(?APP, top_max_procs),
  case erlang:system_info(process_count) of
    Count when Count > MaxProcs div 5 ->
      ?tp(warning, "Abnormal process count", #{n_procs => Count});
    _ ->
      ok
  end.

%% @doc Log a warning for every process of the current process top
%% that reaches one of the configured `suspect_procs_max_*' limits.
-spec suspect_procs() -> ok.
suspect_procs() ->
  ProcTop = get_proc_top(),
  Conf = { ?CFG(suspect_procs_max_memory)
         , ?CFG(suspect_procs_max_message_queue_len)
         , ?CFG(suspect_procs_max_total_heap_size)
         },
  SuspectProcs = lists:filter(fun(Proc) -> is_suspect_proc(Proc, Conf) end, ProcTop),
  lists:foreach(fun log_suspect_proc/1, SuspectProcs).
%%====================================================================
%% Internal exports
%%====================================================================

%% @doc Hand the tops collected at `SnapshotTS' over to the server.
%% The call is asynchronous (gen_server cast).
-spec report_data(system_monitor_lib:ts(),
                  {[#erl_top{}], [#app_top{}], function_top(), function_top()}) -> ok.
report_data(SnapshotTS, {ProcTop, AppTop, InitCallTop, CurrentFunTop}) ->
  gen_server:cast(?SERVER, {report_data, SnapshotTS, ProcTop, AppTop, InitCallTop, CurrentFunTop}).

%%==============================================================================
%% Internal functions
%%==============================================================================

%% @doc Return the list of initiated monitors: every configured check
%% is extended with a countdown that starts at its tick interval.
%%
%% Note: the function name is an atom, hence `atom()' and not
%% `function()' (which denotes a fun) in the type.
-spec init_monitors() -> [{module(), atom(), boolean(), pos_integer(), pos_integer()}].
init_monitors() ->
  [{Module, Function, RunOnTerminate, Ticks, Ticks}
   || {Module, Function, RunOnTerminate, Ticks} <- monitors()].

%% @doc Returns the list of monitors. The format is
%%
%% ```{Module, FunctionName, RunAtTerminate, NumberOfTicks}'''
%%
%% `RunAtTerminate' determines whether the monitor is to be run in the
%% terminate gen_server callback, and `NumberOfTicks' is the number of
%% ticks between invocations of the monitor in question. So, if
%% `NumberOfTicks' is 3600 and there is a tick every second, the
%% monitor is run once every hour.
-spec monitors() -> [{module(), atom(), boolean(), pos_integer()}].
monitors() ->
  ?CFG(status_checks).
%% @doc Push the collected tops to the callback module, followed by
%% the node status record.
report_node_status(TS, ProcTop, AppTop) ->
  system_monitor_callback:produce(proc_top, ProcTop),
  system_monitor_callback:produce(app_top, AppTop),
  produce_fun_top(TS),
  %% Node status report goes last, and it "seals" the report for this
  %% time interval:
  StatusRecord = {node_status, node(), TS, iolist_to_binary(node_status_report())},
  system_monitor_callback:produce(node_status, [StatusRecord]).

%% Obtain the node status text from the configured `node_status_fun'.
%% An empty binary is used when the setting is not a `{Module, Function}'
%% pair or when the function raises.
node_status_report() ->
  case application:get_env(?APP, node_status_fun) of
    {ok, {Module, Function}} ->
      try
        Module:Function()
      catch
        _:_ ->
          <<>>
      end;
    _ ->
      <<>>
  end.

%% Return `{Key, Value}' pairs of the given top whose value is above the
%% significance threshold configured for `ThresholdKey', largest value
%% first.
-spec get_filtered_top(proc_top | app_top | init_call_top | current_fun_top, byte(), byte(), atom()) ->
        [{atom(), number()}].
get_filtered_top(Top, KeyField, ValueField, ThresholdKey) ->
  Thresholds = ?CFG(top_significance_threshold),
  Threshold = maps:get(ThresholdKey, Thresholds, 0.0001),
  Ascending = lists:keysort(2, lookup_top_kv(Top, KeyField, ValueField, Threshold)),
  lists:reverse(Ascending).

%% Project the entries of a top onto `{Key, Value}' pairs, keeping only
%% the entries whose value is strictly above `Threshold'.
-spec lookup_top_kv(proc_top | app_top | init_call_top | current_fun_top, byte(), byte(), number()) ->
        [{atom(), number()}].
lookup_top_kv(Top, KeyField, ValueField, Threshold) ->
  [{element(KeyField, Entry), element(ValueField, Entry)}
   || Entry <- lookup_top(Top),
      element(ValueField, Entry) > Threshold].

%% Read the latest stored top of the given kind; an empty list is
%% returned when nothing has been reported yet.
-spec lookup_top(proc_top | app_top | init_call_top | current_fun_top) -> list().
lookup_top(Key) ->
  case ets:lookup(?TABLE, Key) of
    [] ->
      [];
    [{Key, _Timestamp, Entries}] ->
      Entries
  end.
%% Decide whether a process top entry reaches any of the configured
%% limits `{MaxMemory, MaxMqLen, MaxTotalHeapSize}'. The placeholder
%% entry with the pid "!!!" is never considered suspect.
is_suspect_proc(#erl_top{pid = "!!!"}, _Limits) ->
  false;
is_suspect_proc(#erl_top{ memory            = Memory
                        , message_queue_len = MessageQueueLen
                        , total_heap_size   = TotalHeapSize
                        },
                {MaxMemory, MaxMqLen, MaxTotalHeapSize}) ->
  exceeds_limit(MaxMemory, Memory)
    orelse exceeds_limit(MaxMqLen, MessageQueueLen)
    orelse exceeds_limit(MaxTotalHeapSize, TotalHeapSize).

%% A limit set to `undefined' disables the corresponding check.
exceeds_limit(undefined, _Value) ->
  false;
exceeds_limit(Limit, Value) ->
  Value >= Limit.

%% Log a warning that contains the textual form of a suspect process.
log_suspect_proc(Proc) ->
  ?LOG_WARNING( "Suspect Proc~n~s"
              , [system_monitor_lib:erl_top_to_str(Proc)]
              , #{domain => [system_monitor]}
              ).

%% Send the current function tops to the callback module.
-spec produce_fun_top(system_monitor_lib:ts()) -> ok.
produce_fun_top(TS) ->
  FunctionTops = get_function_top(),
  produce_fun_top(current_fun_top, maps:get(current_function, FunctionTops), TS),
  produce_fun_top(initial_fun_top, maps:get(initial_call, FunctionTops), TS),
  ok.

%% Tag every `{Function, Share}' pair with the node name and the
%% timestamp, and produce the result under `TopType'.
produce_fun_top(TopType, Values, TS) ->
  Node = node(),
  Rows = [{Node, TS, Function, PercentProcesses}
          || {Function, PercentProcesses} <- Values],
  system_monitor_callback:produce(TopType, Rows).

--------------------------------------------------------------------------------
/src/system_monitor_app.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
-module(system_monitor_app).

-behaviour(application).

-export([start/2, stop/1]).

%% Application start callback: start the top-level supervisor. The
%% start type and start arguments are not used.
start(_Type, _StartArgs) ->
  system_monitor_sup:start_link().

%% Application stop callback: nothing to clean up here.
stop(_State) ->
  ok.

--------------------------------------------------------------------------------
/src/system_monitor_callback.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2021 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------

%% Behaviour definition and dispatcher for the module that receives
%% the data produced by system_monitor.
-module(system_monitor_callback).

-export([ start/0
        , stop/0
        , produce/2
        , is_configured/0
        ]).

-include("sysmon_int.hrl").

%% Callbacks that a callback module has to implement:
-callback start() -> ok.
-callback stop() -> ok.
-callback produce(atom(), list()) -> ok.

%% Start the configured callback module.
start() ->
  Mod = get_callback_mod(),
  Mod:start().

%% Stop the configured callback module.
stop() ->
  Mod = get_callback_mod(),
  Mod:stop().

%% Hand a list of events of the given type over to the configured
%% callback module.
produce(Type, Events) ->
  Mod = get_callback_mod(),
  Mod:produce(Type, Events).

%% The callback module is read from the application environment on
%% every call; `system_monitor_dummy' is the default.
-compile({inline, [get_callback_mod/0]}).
get_callback_mod() ->
  application:get_env(?APP, callback_mod, system_monitor_dummy).

%% `true' when a callback module other than the dummy one is in use.
is_configured() ->
  system_monitor_dummy =/= get_callback_mod().

--------------------------------------------------------------------------------
/src/system_monitor_collector.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2022 k32
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------

%%% @doc Collect Erlang process statistics and push it to the
%%% configured destination
-module(system_monitor_collector).

-behaviour(gen_server).

-include("sysmon_int.hrl").

%% API
-export([start_link/0, add_vip/1, remove_vip/1]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

-define(SERVER, ?MODULE).
%% Names of the temporary ETS tables used to aggregate one sample:
-define(TOP_APP_TAB, sysmon_top_app_tab).
-define(TOP_INIT_CALL, sysmon_top_init_call).
-define(TOP_CURR_FUN, sysmon_top_curr_fun).
-define(TAB_OPTS, [private, named_table, set, {keypos, 1}]).

%% Process dictionary key of the sampling counter, see diceroll/1:
-define(COUNT, diceroll_counter).

%% Type and record definitions

%% Per-process history entry kept between two samples:
-define(HIST(PID, REDS, MEM), {PID, REDS, MEM}).

-record(state,
        { timer               :: timer:tref()
        , old_data = []       :: [hist()]
        , last_ts             :: system_monitor_lib:ts()
        , time_to_collect = 0 :: non_neg_integer()
        }).

%% Measurements of one process. `dreds' and `dmemory' are the changes of
%% reductions and memory since the previous sample divided by the
%% elapsed time; memory can shrink, so `dmemory' may be negative.
-record(delta,
        { pid      :: pid()
        , reg_name :: atom()
        , reds     :: non_neg_integer()
        , dreds    :: non_neg_integer()
        , memory   :: non_neg_integer()
        , dmemory  :: integer()
        , mql      :: non_neg_integer()
        }).

-record(top_acc,
        { is_vip        :: #{atom() => _}
        , dt            :: non_neg_integer()
        , hist_data     :: [hist()]
        , sample_modulo :: non_neg_integer()
          %% Tops
        , vips          :: [#delta{}]
        , memory        :: system_monitor_top:top()
        , dmemory       :: system_monitor_top:top()
        , dreds         :: system_monitor_top:top()
        , mql           :: system_monitor_top:top()
        }).

-type hist() :: ?HIST(pid(), non_neg_integer(), non_neg_integer()).

%%%===================================================================
%%% API
%%%===================================================================

%% @doc Add one or several registered names to the list of VIPs
-spec add_vip(atom() | [atom()]) -> ok.
add_vip(RegName) when is_atom(RegName) ->
  add_vip([RegName]);
add_vip(RegNames) when is_list(RegNames) ->
  gen_server:call(?SERVER, {add_vip, RegNames}).

%% @doc Remove a registered name from the list of VIPs
-spec remove_vip(atom()) -> ok.
remove_vip(RegName) ->
  gen_server:call(?SERVER, {remove_vip, RegName}).

%% @doc Start the collector and register it locally under the module name
-spec start_link() -> {ok, pid()} | ignore | {error, term()}.
start_link() ->
  gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
%%%===================================================================
%%% gen_server callbacks
%%%===================================================================

%% Reset the sampling counter and schedule the first collection.
init([]) ->
  put(?COUNT, 0),
  {ok, TRef} = timer:send_after(sample_interval(), collect_data),
  {ok, #state{ timer   = TRef
             , last_ts = system_monitor_lib:timestamp()
             }}.

%% The list of VIPs lives in the application environment; additions
%% are merged and deduplicated with lists:usort/1.
handle_call({add_vip, RegNames}, _From, State) ->
  application:set_env(?APP, vips, lists:usort(RegNames ++ ?CFG(vips))),
  {reply, ok, State};
handle_call({remove_vip, RegName}, _From, State) ->
  application:set_env(?APP, vips, lists:delete(RegName, ?CFG(vips))),
  {reply, ok, State};
handle_call(_Msg, _From, State) ->
  {reply, {error, bad_call}, State}.

handle_cast(_Msg, State) ->
  {noreply, State}.

%% Collect one sample, report it and schedule the next collection.
handle_info(collect_data, State0) ->
  init_tables(),
  T1 = system_monitor_lib:timestamp(),
  NumProcesses = erlang:system_info(process_count),
  TooManyPids = NumProcesses > ?CFG(top_max_procs),
  %% With too many processes only the VIPs are inspected:
  Pids = case TooManyPids of
           false -> lists:sort(processes());
           true  -> lists:sort(get_vip_pids())
         end,
  {ProcTop, State} = collect_proc_top(State0, T1, Pids, TooManyPids),
  {AppTop, InitCallTop, CurrFunTop} = finalize_aggr_top(T1, NumProcesses),
  %% Report the collected data:
  system_monitor:report_data(T1, {ProcTop, AppTop, InitCallTop, CurrFunTop}),
  %% Prepare for the next iteration:
  T2 = system_monitor_lib:timestamp(),
  LastRunTime = erlang:convert_time_unit(T2 - T1, ?TS_UNIT, millisecond),
  %% Time spent collecting is subtracted from the pause, which is never
  %% shorter than 500 ms:
  SleepTime = max(500, sample_interval() - LastRunTime),
  erlang:garbage_collect(self()),
  {ok, TRef} = timer:send_after(SleepTime, collect_data),
  %% `last_ts' must advance with every sample: the next run divides the
  %% reduction/memory differences by the time elapsed since this
  %% timestamp. Leaving it at its initial value would make the divisor
  %% grow forever and the reported rates shrink towards zero.
  {noreply, State#state{ timer           = TRef
                       , last_ts         = T1
                       , time_to_collect = LastRunTime
                       }};
handle_info(_Info, State) ->
  {noreply, State}.
%%%===================================================================
%%% Internal functions
%%%===================================================================

%%--------------------------------------------------------------------
%% Very important processes
%%--------------------------------------------------------------------

%% Registered names of the processes that are always included in the top.
-spec vip_names() -> [atom()].
vip_names() ->
  ?CFG(vips).

%% Resolve the VIP names that are currently registered. The result is
%% not ordered; the caller sorts it.
-spec get_vip_pids() -> [pid()].
get_vip_pids() ->
  lists:foldl( fun(I, Acc) ->
                   case whereis(I) of
                     undefined -> Acc;
                     Pid       -> [Pid|Acc]
                   end
               end
             , []
             , vip_names()
             ).

%% Build a map with the VIP names as keys, used for membership tests.
-spec make_is_vip() -> #{atom() => []}.
make_is_vip() ->
  maps:from_list([{I, []} || I <- vip_names()]).

%%--------------------------------------------------------------------
%% Proc top collection
%%--------------------------------------------------------------------

%% Compute the process top for the sample taken at `Now'. `Pids' must
%% be sorted in ascending order. When `TooManyPids' is set, a
%% placeholder entry is put in front of the result.
-spec collect_proc_top(#state{}, integer(), [pid()], boolean()) -> {[#erl_top{}], #state{}}.
collect_proc_top(State = #state{old_data = OldData, last_ts = LastTs}, Now, Pids, TooManyPids) ->
  %% Never divide by zero:
  Dt = max(1, Now - LastTs),
  {Deltas, NewData} = top_deltas(OldData, Pids, Dt),
  ProcTop = [make_fake_proc(Now) || TooManyPids] ++ [enrich(I, Now) || I <- Deltas],
  {ProcTop, State#state{old_data = NewData}}.

%% Merge the history of the previous sample with the current list of
%% pids. Returns the top entries and the history for the next sample.
-spec top_deltas([hist()], [pid()], non_neg_integer()) -> {[#delta{}], [hist()]}.
top_deltas(OldData, Pids, Dt) ->
  Acc = go(OldData, Pids, empty_top(length(Pids), Dt)),
  %% go/3 walks both lists in ascending pid order, but the history is
  %% accumulated by prepending, i.e. in descending order. It has to be
  %% reversed before it is used as `OldData' of the next sample;
  %% otherwise known processes are not recognised and their deltas are
  %% computed against zero.
  {top_to_list(Acc), lists:reverse(Acc#top_acc.hist_data)}.

-spec go([hist()], [pid()], #top_acc{}) -> #top_acc{}.
go(_Known, [], Acc) ->
  %% Every live process has been visited. Whatever is left of the old
  %% history belongs to terminated processes and is dropped:
  Acc;
go([?HIST(Gone, _, _) | Known], [Pid | _] = Live, Acc) when Gone < Pid ->
  %% The head of the old history sorts before the current pid, so that
  %% process has terminated; skip its history entry:
  go(Known, Live, Acc);
go([?HIST(Pid, _, _) = Seen | Known], [Pid | Live], Acc) ->
  %% A process that was present in the previous sample:
  go(Known, Live, update_acc(Seen, Acc));
go(Known, [Pid | Live], Acc) ->
  %% A process without history; its deltas are computed against zero:
  go(Known, Live, update_acc(?HIST(Pid, 0, 0), Acc)).

%% Measure one process and fold the measurement into the accumulator:
%% the tops, the sampled aggregate tables and the history for the next
%% sample. A process that has died in the meantime is ignored.
-spec update_acc(hist(), #top_acc{}) -> #top_acc{}.
update_acc( ?HIST(Pid, PrevReds, PrevMem)
          , Acc0 = #top_acc{dt = Dt, hist_data = History}
          ) ->
  case get_pid_info(Pid) of
    undefined ->
      Acc0;
    {RegName, Reds, Mem, MQL} ->
      Delta = #delta{ pid      = Pid
                    , reg_name = RegName
                    , reds     = Reds
                    , dreds    = (Reds - PrevReds) div Dt
                    , memory   = Mem
                    , dmemory  = (Mem - PrevMem) div Dt
                    , mql      = MQL
                    },
      {IsChanged, Acc} = maybe_push_to_top(Acc0, Delta),
      %% The dice is rolled for every process (it advances a counter);
      %% processes that made it to a top are always sampled:
      case diceroll(Acc#top_acc.sample_modulo) orelse IsChanged of
        true  -> maybe_update_aggr_top(Delta);
        false -> ok
      end,
      Acc#top_acc{hist_data = [?HIST(Pid, Reds, Mem) | History]}
  end.

%%--------------------------------------------------------------------
%% Sample top stuff
%%--------------------------------------------------------------------

-spec maybe_update_aggr_top(#delta{}) -> ok.
%% Account a sampled process in the aggregate tables: one hit for its
%% current function and its initial call, and one process plus its
%% reductions delta and memory for its application. Nothing is recorded
%% when the process is already dead.
maybe_update_aggr_top(#delta{ pid    = Pid
                            , dreds  = DReds
                            , memory = Memory
                            }) ->
  case erlang:process_info(Pid, [current_function, group_leader, initial_call, dictionary]) of
    undefined ->
      ok;
    [{current_function, CurrentFunction}, {group_leader, GL}|L] ->
      InitialCall = initial_call(L),
      %% Processes that do not belong to any application are counted
      %% under `undefined':
      App = case application_controller:get_application(GL) of
              {ok, A}   -> A;
              undefined -> undefined
            end,
      ets:update_counter(?TOP_CURR_FUN, CurrentFunction, {2, 1}, {CurrentFunction, 0}),
      ets:update_counter(?TOP_INIT_CALL, InitialCall, {2, 1}, {InitialCall, 0}),
      ets:update_counter(?TOP_APP_TAB, App, [{2, 1}, {3, DReds}, {4, Memory}], {App, 0, 0, 0}),
      ok
  end.

%% Create the temporary aggregate tables; they are deleted again in
%% finalize_aggr_top/2. Returns `ok' as promised by the spec (the last
%% ets:new/2 call would otherwise leak the table name as the result).
-spec init_tables() -> ok.
init_tables() ->
  ets:new(?TOP_APP_TAB, ?TAB_OPTS),
  ets:new(?TOP_CURR_FUN, ?TAB_OPTS),
  ets:new(?TOP_INIT_CALL, ?TAB_OPTS),
  ok.

%% Turn the contents of the aggregate tables into the final tops and
%% delete the tables.
-spec finalize_aggr_top(system_monitor_lib:ts(), non_neg_integer()) ->
        {[#app_top{}], system_monitor:function_top(), system_monitor:function_top()}.
finalize_aggr_top(TS, NProc) ->
  %% Collect data:
  SampleSize = top_sample_size(),
  CurrFunTop = filter_nproc_results(?TOP_CURR_FUN, NProc, SampleSize),
  InitCallTop = filter_nproc_results(?TOP_INIT_CALL, NProc, SampleSize),
  AppTop = filter_app_top(TS),
  %% Cleanup:
  ets:delete(?TOP_APP_TAB),
  ets:delete(?TOP_CURR_FUN),
  ets:delete(?TOP_INIT_CALL),
  {AppTop, InitCallTop, CurrFunTop}.

-spec filter_app_top(system_monitor_lib:ts()) -> [#app_top{}].
%% Build the application top from the aggregate table. `red_rel' is the
%% application's share of the reductions recorded in the table.
filter_app_top(TS) ->
  Rows = ets:tab2list(?TOP_APP_TAB),
  TotalReds = lists:sum([Reds || {_App, _Procs, Reds, _Mem} <- Rows]),
  %% Guard against division by zero when nothing was recorded:
  Factor = 1 / max(1, TotalReds),
  lists:map( fun({App, Procs, Reds, Mem}) ->
                 #app_top{ app       = App
                         , ts        = TS
                         , red_abs   = Reds
                         , red_rel   = Reds * Factor
                         , memory    = Mem
                         , processes = Procs
                         }
             end
           , Rows
           ).

%% Scale the hit counters of a function table by the smaller of the
%% number of processes and the sample size.
filter_nproc_results(Tab, NProc, SampleSize) ->
  Factor = 1 / min(NProc, SampleSize),
  lists:map( fun({Key, Hits}) -> {Key, Hits * Factor} end
           , ets:tab2list(Tab)
           ).

%%--------------------------------------------------------------------
%% Top accumulator manipulation
%%--------------------------------------------------------------------

%% Create an accumulator with empty tops for a sample of `NProc'
%% processes taken `Dt' time units after the previous one.
-spec empty_top(non_neg_integer(), non_neg_integer()) -> #top_acc{}.
empty_top(NProc, Dt) ->
  EmptyTop = system_monitor_top:empty(?CFG(top_num_items)),
  #top_acc{ is_vip        = make_is_vip()
          , dt            = Dt
          , hist_data     = []
          , sample_modulo = max(1, NProc div top_sample_size())
          , vips          = []
          , memory        = EmptyTop
          , dmemory       = EmptyTop
          , dreds         = EmptyTop
          , mql           = EmptyTop
          }.

-spec maybe_push_to_top(#top_acc{}, #delta{}) -> {IsChanged, #top_acc{}}
          when IsChanged :: boolean().
%% Offer a measurement to every top. The returned flag is set when the
%% process is a VIP or when the push into the memory, dreds or mql top
%% reported `true'; the result of the dmemory push is not considered.
maybe_push_to_top(#top_acc{ is_vip  = VipNames
                          , vips    = Vips0
                          , memory  = MemTop0
                          , dreds   = DRedsTop0
                          , dmemory = DMemTop0
                          , mql     = MqlTop0
                          } = Acc,
                  Delta) ->
  IsVip = maps:is_key(Delta#delta.reg_name, VipNames),
  {InMem, MemTop} = system_monitor_top:push(#delta.memory, Delta, MemTop0),
  {InDReds, DRedsTop} = system_monitor_top:push(#delta.dreds, Delta, DRedsTop0),
  {InMql, MqlTop} = system_monitor_top:push(#delta.mql, Delta, MqlTop0),
  {_InDMem, DMemTop} = system_monitor_top:push(#delta.dmemory, Delta, DMemTop0),
  Vips = case IsVip of
           true  -> [Delta | Vips0];
           false -> Vips0
         end,
  { IsVip orelse InMem orelse InDReds orelse InMql
  , Acc#top_acc{ vips    = Vips
               , memory  = MemTop
               , dmemory = DMemTop
               , dreds   = DRedsTop
               , mql     = MqlTop
               }
  }.

%% Flatten the accumulator into a sorted list without duplicates: the
%% VIPs plus the members of all tops.
-spec top_to_list(#top_acc{}) -> [#delta{}].
top_to_list(#top_acc{ vips    = Vips
                    , memory  = MemTop
                    , dreds   = DRedsTop
                    , dmemory = DMemTop
                    , mql     = MqlTop
                    }) ->
  TopMembers = [system_monitor_top:to_list(Top)
                || Top <- [MemTop, DRedsTop, DMemTop, MqlTop]],
  lists:usort(lists:append([Vips | TopMembers])).

%%--------------------------------------------------------------------
%% Getting process info
%%--------------------------------------------------------------------

-spec enrich(#delta{}, system_monitor_lib:ts()) -> #erl_top{}.
%% Turn a measurement into a complete process top entry by querying the
%% remaining process attributes. Placeholder values are used when the
%% process has died since it was measured.
enrich(#delta{ pid      = Pid
             , reg_name = RegName
             , reds     = Reds
             , dreds    = DReds
             , memory   = Memory
             , dmemory  = DMem
             , mql      = MQL
             }, Now) ->
  Items = [group_leader, initial_call, dictionary, stack_size,
           heap_size, total_heap_size, current_function,
           current_stacktrace],
  {GroupLeader, InitialCall, StackSize, HeapSize, TotalHeapSize, CurrentFunction, Stacktrace} =
    case process_info(Pid, Items) of
      [{group_leader, GL}, {initial_call, _}, {dictionary, _},
       {stack_size, SS}, {heap_size, HS}, {total_heap_size, THS},
       {current_function, CF}, {current_stacktrace, ST}] = Info ->
        {GL, initial_call(Info), SS, HS, THS, CF, ST};
      undefined ->
        {"", {'?', '?', 0}, 0, 0, 0, undefined, []}
    end,
  #erl_top{ ts                 = Now
          , pid                = pid_to_list(Pid)
          , group_leader       = ensure_list(GroupLeader)
          , dreductions        = DReds
          , dmemory            = DMem
          , reductions         = Reds
          , memory             = Memory
          , message_queue_len  = MQL
          , initial_call       = InitialCall
          , registered_name    = RegName
          , stack_size         = StackSize
          , heap_size          = HeapSize
          , total_heap_size    = TotalHeapSize
          , current_stacktrace = Stacktrace
          , current_function   = CurrentFunction
          }.

%% Query the attributes needed to rank a process; `undefined' is
%% returned for a dead process.
-spec get_pid_info(pid()) -> {RegName, Reds, Mem, MQL} | undefined
          when RegName :: atom(),
               Reds :: non_neg_integer(),
               Mem :: non_neg_integer(),
               MQL :: non_neg_integer().
get_pid_info(Pid) ->
  Items = [registered_name, reductions, memory, message_queue_len],
  case erlang:process_info(Pid, Items) of
    undefined ->
      undefined;
    [ {registered_name, RegName}
    , {reductions, Reds}
    , {memory, Mem}
    , {message_queue_len, MQL}
    ] ->
      {RegName, Reds, Mem, MQL}
  end.

-spec initial_call(proplists:proplist()) -> mfa().
%% Extract the initial call from a process_info list. For processes
%% started via proc_lib the generic `{proc_lib, init_p, 5}' is
%% translated with proc_lib:translate_initial_call/1.
initial_call(Info) ->
  resolve_initial_call(proplists:get_value(initial_call, Info), Info).

resolve_initial_call({proc_lib, init_p, 5}, Info) ->
  proc_lib:translate_initial_call(Info);
resolve_initial_call(ICall, _Info) ->
  ICall.

%%--------------------------------------------------------------------
%% Misc
%%--------------------------------------------------------------------

%% Placeholder entry that is put into the process top when there are
%% too many processes; all of its numeric fields carry a huge value.
make_fake_proc(Now) ->
  Huge = 99999999999,
  Marker = "!!!",
  NoMFA = {undefined, undefined, 0},
  #erl_top{ ts                 = Now
          , pid                = Marker
          , group_leader       = Marker
          , dreductions        = Huge
          , dmemory            = Huge
          , reductions         = Huge
          , memory             = Huge
          , message_queue_len  = Huge
          , initial_call       = NoMFA
          , registered_name    = too_many_processes
          , stack_size         = Huge
          , heap_size          = Huge
          , total_heap_size    = Huge
          , current_stacktrace = []
          , current_function   = NoMFA
          }.

%% Configured `top_sample_interval'.
sample_interval() ->
  ?CFG(top_sample_interval).

%% Configured `top_sample_size'.
top_sample_size() ->
  ?CFG(top_sample_size).

%% Advance the counter kept in the process dictionary modulo `Mod' and
%% return `true' when its previous value was zero.
diceroll(Mod) ->
  Previous = put(?COUNT, (get(?COUNT) + 1) rem Mod),
  Previous =:= 0.

%% Convert a pid to its textual form; anything else is passed through.
ensure_list(Pid) when is_pid(Pid) ->
  pid_to_list(Pid);
ensure_list(Other) ->
  Other.

%%%_* Emacs ============================================================
%%% Local Variables:
%%% allout-layout: t
%%% erlang-indent-level: 2
%%% End:

--------------------------------------------------------------------------------
/src/system_monitor_dummy.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------
%% Copyright (c) k32. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------

%% Default callback module: it does not forward the produced data
%% anywhere and only emits a trace point per event.
-module(system_monitor_dummy).

-behaviour(system_monitor_callback).

%% API:
-export([start/0, stop/0, produce/2]).

-include("sysmon_int.hrl").

%%================================================================================
%% API functions
%%================================================================================

%% Nothing to start.
start() ->
  ok.

%% Nothing to stop. The function is required by the
%% system_monitor_callback behaviour; without it
%% system_monitor_callback:stop/0 fails with `undef' whenever this
%% default module is the configured callback.
stop() ->
  ok.

%% Emit one `sysmon_produce' trace point per event.
produce(_Type, Events) ->
  [?tp(sysmon_produce, #{type => _Type, msg => _Msg, backend => dummy}) || _Msg <- Events],
  ok.

%%================================================================================
%% Internal functions
%%================================================================================

--------------------------------------------------------------------------------
/src/system_monitor_events.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2022 k32
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
%%% @doc
%%% Print BEAM VM events to the logs
%%%
%%% @end
-module(system_monitor_events).

-behaviour(gen_server).

-include("sysmon_int.hrl").

-export([start_link/0]).

%% gen_server callbacks
-export([ init/1
        , handle_call/3
        , handle_cast/2
        , handle_info/2
        ]).

%%--------------------------------------------------------------------
%% @doc
%% Starts the server and registers it locally under the module name
%% @end
%%--------------------------------------------------------------------
-spec start_link() -> {ok, pid()}.
start_link() ->
  gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

%%====================================================================
%% gen_server callbacks
%%====================================================================

%% Tag the log messages of this process with its own logger domain and
%% subscribe the process to the BEAM system events. The server keeps no
%% state, hence the empty tuple.
init([]) ->
  logger:update_process_metadata(#{domain => [system_monitor, events]}),
  setup_system_monitor(),
  {ok, {}}.

%% No synchronous requests are supported.
handle_call(_Request, _From, State) ->
  {reply, {error, unknown_call}, State}.

%% Casts are ignored.
handle_cast(_Msg, State) ->
  {noreply, State}.

%% Handles `{monitor, PidOrPort, EventKind, Info}' messages: emits a
%% "system monitor event" trace point and, when the `external_monitoring'
%% application variable is set, forwards the raw event to that module.
%% Any other message is dropped.
handle_info({monitor, PidOrPort, EventKind, Info}, State) ->
  ReferenceData = data_for_reference(PidOrPort),
  InfoTxt = format_system_event_info(Info),
  ?tp(info, "system monitor event",
      #{ type => EventKind
       , pid_or_port => PidOrPort
       , info => InfoTxt
       , reference => ReferenceData
       }),
  forward_to_external_monitoring(EventKind, Info),
  {noreply, State};
handle_info(_Info, State) ->
  {noreply, State}.

%%==============================================================================
%% Internal functions
%%==============================================================================

%% Pass the event to the module configured as `external_monitoring',
%% if there is one.
forward_to_external_monitoring(EventKind, Info) ->
  case application:get_env(?APP, external_monitoring) of
    undefined -> ok;
    {ok, Mod} -> Mod:system_monitor_event(EventKind, Info)
  end.

%%--------------------------------------------------------------------
%% @doc: Set the current process as the receiver of the BEAM system
%% events. The monitoring options are read from the `beam_events'
%% application variable, which must be set.
%%--------------------------------------------------------------------
-spec setup_system_monitor() -> ok.
setup_system_monitor() ->
  Opts = system_monitor_lib:cfg(beam_events),
  erlang:system_monitor(self(), Opts),
  ok.

%% Extra context for the log entry: the formatted top entry of the
%% process, a fixed note when the process is not in the top, or an empty
%% string for anything that is not a pid (e.g. a port).
data_for_reference(Pid) when is_pid(Pid) ->
  describe_proc(system_monitor:get_proc_info(Pid));
data_for_reference(_Port) ->
  "".

describe_proc(false) ->
  "Proc not in top";
describe_proc(ProcErlTop) ->
  system_monitor_lib:erl_top_to_str(ProcErlTop).

-spec format_system_event_info(term()) -> io_lib:chars().
%% Render the `Info' part of a system monitor event as text.
%%
%% A list is rendered element by element, in the order given: `{Key, Value}'
%% pairs become "Key=Value " and any other element is printed with ~p
%% followed by a space. A bare port, pid or other term is wrapped in a
%% one-element list under the key `port', `pid_2' or `info' respectively.
%%
%% The fragments used to be accumulated by prepending inside a fold, which
%% emitted them in reverse input order; they are now emitted in input order.
format_system_event_info(Info) when is_list(Info) ->
  [format_info_item(Item) || Item <- Info];
format_system_event_info(Port) when is_port(Port) ->
  format_system_event_info([{port, Port}]);
format_system_event_info(Pid) when is_pid(Pid) ->
  format_system_event_info([{pid_2, Pid}]);
format_system_event_info(Term) ->
  format_system_event_info([{info, Term}]).

%% Format a single element of the info list.
format_info_item({Key, Value}) ->
  io_lib:format("~p=~p ", [Key, Value]);
format_info_item(Value) ->
  io_lib:format("~p ", [Value]).

%%%_* Emacs ============================================================
%%% Local Variables:
%%% allout-layout: t
%%% erlang-indent-level: 2
%%% End:
--------------------------------------------------------------------------------
/src/system_monitor_lib.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------
%% Copyright (c) 2022 k32, Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(system_monitor_lib).

%% @doc Utility functions

%% API:
-export([ cfg/1
        , fmt_mem/1
        , fmt_stack/1
        , fmt_mfa/1
        , find_first/3
        , erl_top_to_str/1
        , timestamp/0
        ]).

-export_type([ts/0]).

-include("sysmon_int.hrl").

%%================================================================================
%% Type declarations
%%================================================================================

-type ts() :: integer().

%%================================================================================
%% API functions
%%================================================================================

%% @private
%% Read a mandatory application variable; fails with a badmatch when the
%% variable is not set.
-spec cfg(atom()) -> _.
cfg(Key) ->
  {ok, Val} = application:get_env(?APP, Key),
  Val.

%% @doc Format a number of bytes with one decimal and a unit
%% (Bytes, KB, MB, GB or TB), using 1024 as the step between units.
%%
%% A unit is chosen when the value is strictly below 1024 of that unit, so
%% exactly 1024 bytes is shown as "1.0 KB" (previously "1024.0 Bytes").
%% Values of 1024 GB and above are expressed in TB. Returns chardata.
fmt_mem(Mem) ->
  Units = [{1, "Bytes"}, {1024, "KB"}, {1024 * 1024, "MB"}, {1024 * 1024 * 1024, "GB"}],
  FitsUnit = fun({Scale, _Name}) -> Mem < Scale * 1024 end,
  {Divisor, UnitStr} =
    find_first(FitsUnit, Units, {1024 * 1024 * 1024 * 1024, "TB"}),
  io_lib:format("~.1f ~s", [Mem / Divisor, UnitStr]).

%% @doc Format a stack trace: one formatted entry per line.
fmt_stack(CurrentStack) ->
  [[fmt_mfa(MFA), "\n"] || MFA <- CurrentStack].

%% @doc Format a `{Mod, Fun, Arity}' triple as "Mod:Fun/Arity". A 4-tuple
%% carrying a property list with a `line' entry gets " (Line N)" appended;
%% any other term is printed with ~p.
fmt_mfa({Mod, Fun, Arity, Prop}) ->
  case proplists:get_value(line, Prop, undefined) of
    undefined ->
      fmt_mfa({Mod, Fun, Arity});
    Line ->
      io_lib:format("~s:~s/~p (Line ~p)", [Mod, Fun, Arity, Line])
  end;
fmt_mfa({Mod, Fun, Arity}) ->
  io_lib:format("~s:~s/~p", [Mod, Fun, Arity]);
fmt_mfa(L) ->
  io_lib:format("~p", [L]).

%% @doc Return the first element of `List' satisfying `Pred', or `Default'
%% when there is none.
-spec find_first(fun((any()) -> boolean()), [T], Default) -> T | Default.
find_first(Pred, List, Default) ->
  case lists:search(Pred, List) of
    {value, Elem} -> Elem;
    false -> Default
  end.

%% @doc Render "the interesting parts" of an `#erl_top{}' record as
%% multi-line text. Heap sizes are multiplied by the VM word size before
%% being formatted as memory amounts.
erl_top_to_str(Proc) ->
  #erl_top{registered_name = RegisteredName,
           pid = Pid,
           initial_call = InitialCall,
           memory = Memory,
           message_queue_len = MessageQueueLength,
           stack_size = StackSize,
           heap_size = HeapSize,
           total_heap_size = TotalHeapSize,
           current_function = CurrentFunction,
           current_stacktrace = CurrentStack} =
    Proc,
  BytesPerWord = erlang:system_info(wordsize),
  FormatLines =
    [ "registered_name=~p~n"
    , "offending_pid=~s~n"
    , "initial_call=~s~n"
    , "memory=~p (~s)~n"
    , "message_queue_len=~p~n"
    , "stack_size=~p~n"
    , "heap_size=~p (~s)~n"
    , "total_heap_size=~p (~s)~n"
    , "current_function=~s~n"
    , "current_stack:~n~s"
    ],
  Args =
    [ RegisteredName
    , Pid
    , fmt_mfa(InitialCall)
    , Memory, fmt_mem(Memory)
    , MessageQueueLength
    , StackSize
    , HeapSize, fmt_mem(BytesPerWord * HeapSize)
    , TotalHeapSize, fmt_mem(BytesPerWord * TotalHeapSize)
    , fmt_mfa(CurrentFunction)
    , fmt_stack(CurrentStack)
    ],
  io_lib:format(lists:append(FormatLines), Args).

%% @doc Current system time expressed in `?TS_UNIT'.
-spec timestamp() -> ts().
timestamp() ->
  erlang:system_time(?TS_UNIT).

%%================================================================================
%% Internal functions
%%================================================================================

--------------------------------------------------------------------------------
/src/system_monitor_pg.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2022 ieQu1
%% Copyright 2021 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
-module(system_monitor_pg).

-behaviour(gen_server).
-export([ start_link/0
        , init/1
        , handle_continue/2
        , handle_call/3
        , handle_info/2
        , handle_cast/2
        , terminate/2

        , connect_options/0
        ]).

-behaviour(system_monitor_callback).
-export([ start/0, stop/0, produce/2 ]).

-include("sysmon_int.hrl").
-include_lib("kernel/include/logger.hrl").

-define(SERVER, ?MODULE).
%% Intervals in milliseconds.
-define(FIVE_SECONDS, 5000).
-define(ONE_HOUR, (60*60*1000)).

%%%_* API =================================================================

%% Start this backend as a child of the system_monitor supervisor.
%% Fails with a badmatch when the child cannot be started.
start() ->
  {ok, _Pid} = system_monitor_sup:start_child(?MODULE),
  ok.

%% Stop the backend process.
stop() ->
  gen_server:stop(?SERVER).

%% Hand a batch of events of the given type over to the backend process.
%% This is a cast, so the caller never waits for the database.
produce(Type, Events) ->
  gen_server:cast(?SERVER, {produce, Type, Events}).

%%%_* Callbacks =================================================================
start_link() ->
  gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).

%% Exits are trapped so that the termination of a linked process arrives
%% as an 'EXIT' message (see handle_info/2). The database connection is
%% opened in handle_continue/2 to keep init/1 fast.
init(_Args) ->
  erlang:process_flag(trap_exit, true),
  logger:update_process_metadata(#{domain => [system_monitor, pg]}),
  {ok, #{}, {continue, start_pg}}.

%% First connection attempt. When it fails, a retry is scheduled. The
%% hourly partition maintenance timer is started in either case.
handle_continue(start_pg, State) ->
  Conn = initialize(),
  case Conn of
    undefined -> schedule_msg(?FIVE_SECONDS, reinitialize);
    _ -> ok
  end,
  schedule_msg(?ONE_HOUR, mk_partitions),
  {noreply, State#{connection => Conn}}.

handle_call(_Msg, _From, State) ->
  {reply, ok, State}.

%% The current connection died: forget it and schedule a reconnect.
%% The pid used to stay in the state, so every produce request arriving
%% before the reconnect was sent to a dead process.
handle_info({'EXIT', Conn, _Reason}, #{connection := Conn} = State) ->
  schedule_msg(?FIVE_SECONDS, reinitialize),
  {noreply, State#{connection := undefined}};
%% A linked process exited while there is no connection: try again later.
handle_info({'EXIT', _Pid, _Reason}, #{connection := undefined} = State) ->
  schedule_msg(?FIVE_SECONDS, reinitialize),
  {noreply, State};
%% A normal exit of a process that is not the current connection.
handle_info({'EXIT', _Pid, normal}, State) ->
  {noreply, State};
handle_info(mk_partitions, #{connection := undefined} = State) ->
  schedule_msg(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(mk_partitions, #{connection := Conn} = State) ->
  mk_partitions(Conn),
  schedule_msg(?ONE_HOUR, mk_partitions),
  {noreply, State};
handle_info(reinitialize, #{connection := undefined} = State) ->
  {noreply, State#{connection := initialize()}};
%% More than one `reinitialize' can be pending (handle_continue/2 and the
%% 'EXIT' clauses schedule them independently). Once a connection exists
%% the extra ones are ignored; otherwise a live connection would be
%% replaced without being closed.
handle_info(reinitialize, State) ->
  {noreply, State}.

%% Send `Msg' to this process after `After' milliseconds.
schedule_msg(After, Msg) ->
  erlang:send_after(After, self(), Msg),
  ok.

%% Events are dropped when there is no database connection, and also when
%% this process' message queue is longer than `max_message_queue_len'
%% (default 1000).
handle_cast({produce, _Type, _Events}, #{connection := undefined} = State) ->
  {noreply, State};
handle_cast({produce, Type, Events}, #{connection := Conn} = State) ->
  MaxMsgQueueSize = application:get_env(?APP, max_message_queue_len, 1000),
  case process_info(self(), message_queue_len) of
    {_, N} when N > MaxMsgQueueSize ->
      ignore;
    _ ->
      run_query(Conn, Type, Events)
  end,
  {noreply, State}.

%% Close the connection, if there is one.
terminate(_Reason, #{connection := undefined}) ->
  ok;
terminate(_Reason, #{connection := Conn}) ->
  epgsql:close(Conn).

%%%_* Internal functions ======================================================

%% Insert all events of one type as a single batch of the prepared
%% statement for that type, then report the outcome of every item.
run_query(Conn, Type, Events) ->
  {ok, Statement} = epgsql:parse(Conn, query(Type)),
  Batch = [{Statement, params(Type, I)} || I <- Events],
  Results = epgsql:execute_batch(Conn, Batch),
  emit_traces(Type, Events, Results).

%% Walk events and results pairwise: a failed item produces a
%% `system_monitor_pg_query_error' trace, a successful one a
%% `sysmon_produce' trace.
%%
%% The result list may be shorter than the event list: per the epgsql
%% documentation no more results are produced after the first failing
%% batch item. Events without a result are skipped; previously that case
%% had no matching clause and crashed the server.
emit_traces(_Type, _Evts, []) ->
  ok;
emit_traces(Type, [_Evt|Evts], [Result|Results]) ->
  case Result of
    {error, Err} ->
      ?tp(debug, system_monitor_pg_query_error,
          #{ query => Type
           , error => Err
           });
    _Ok ->
      ?tp(sysmon_produce, #{ type => Type
                           , msg => _Evt
                           , backend => pg
                           })
  end,
  emit_traces(Type, Evts, Results).

%% Connect and make sure the partitions exist. Returns the connection,
%% or `undefined' when connecting failed.
initialize() ->
  case connect() of
    undefined ->
      undefined;
    Conn ->
      mk_partitions(Conn),
      Conn
  end.

%% Open a database connection; a failure is logged and mapped to `undefined'.
connect() ->
  case epgsql:connect(connect_options()) of
    {ok, Conn} ->
      Conn;
    Err ->
      ?LOG_WARNING("Failed to open connection to the DB: ~p", [Err]),
      undefined
  end.

%% Connection options handed to epgsql:connect/1, read from the
%% application configuration.
connect_options() ->
  #{host => ?CFG(db_hostname),
    port => ?CFG(db_port),
    username => ?CFG(db_username),
    password => ?CFG(db_password),
    database => ?CFG(db_name),
    timeout => ?CFG(db_connection_timeout),
    codecs => []
   }.

%% Create the daily partitions from the base day up to
%% `partition_days_ahead' days later, and drop a window of old ones.
mk_partitions(Conn) ->
  DaysAhead = application:get_env(system_monitor, partition_days_ahead, 10),
  DaysBehind = application:get_env(system_monitor, partition_days_behind, 10),
  %% date() uses local time while event data is in UTC
  %% so we need to subtract 1 day to make sure there is partition for current UTC timestamps
  BaseDay = calendar:date_to_gregorian_days(date()) - 1,
  ToCreate = lists:seq(BaseDay, BaseDay + DaysAhead),
  %% Drop the partitions of the days lying between
  %% (partition_days_behind + 10) and (partition_days_behind + 2) days
  %% before the base day.
  OldestKept = BaseDay - DaysBehind,
  ToDelete = lists:seq(OldestKept - 10, OldestKept - 2),
  [create_partition_tables(Conn, Day) || Day <- ToCreate],
  [delete_partition_tables(Conn, Day) || Day <- ToDelete],
  ok.

%% Create the partition (and its index) holding the given day for every
%% partitioned table.
create_partition_tables(Conn, Day) ->
  From = to_postgres_date(Day),
  To = to_postgres_date(Day + 1),
  CreateOne =
    fun(Table) ->
        Query = create_partition_query(Table, Day, From, To),
        check_result(epgsql:squery(Conn, Query))
    end,
  lists:foreach(CreateOne,
                [<<"prc">>, <<"app_top">>, <<"initial_fun_top">>,
                 <<"current_fun_top">>, <<"node_status">>]).

%% Accept a list made only of `{ok, [], []}' results, or a
%% `duplicate_table' error; anything else raises
%% `{failed_to_create_partition, Result}'.
check_result([{ok, [], []} | Rest]) ->
  check_result(Rest);
check_result([]) ->
  ok;
check_result({error, {error, error, _, duplicate_table, _, _}}) ->
  ok;
check_result(Err) ->
  error({failed_to_create_partition, Err}).

%% Drop the partitions of the given day.
%%
%% The list has to cover every table that gets daily partitions, i.e.
%% `initial_fun_top' and `current_fun_top' as well; they used to be
%% missing here, so their old partitions were never removed.
%% NOTE(review): `fun_top' is kept because it was in the original list;
%% presumably it is a table name from an older schema. It is harmless
%% since the query uses IF EXISTS.
delete_partition_tables(Conn, Day) ->
  Tables = [<<"prc">>, <<"app_top">>, <<"initial_fun_top">>, <<"current_fun_top">>,
            <<"fun_top">>, <<"node_status">>],
  lists:foreach(fun(Table) ->
                    Query = delete_partition_query(Table, Day),
                    {ok, [], []} = epgsql:squery(Conn, Query)
                end,
                Tables).

%% SQL creating the partition `Table_Day' for the range [From, To) together
%% with an index on its `ts' column. `From' and `To' are date strings.
create_partition_query(Table, Day, From, To) ->
  Partition = <<Table/binary, "_", (integer_to_binary(Day))/binary>>,
  <<"CREATE TABLE IF NOT EXISTS ", Partition/binary, " "
    "PARTITION OF ", Table/binary, " "
    "FOR VALUES "
    "FROM ('", (list_to_binary(From))/binary, "') TO ('", (list_to_binary(To))/binary, "');"
    "CREATE INDEX IF NOT EXISTS ", Partition/binary, "_ts_idx "
    "ON ", Partition/binary, "(ts);">>.

%% SQL dropping the partition `Table_Day' if it exists.
delete_partition_query(Table, Day) ->
  <<"DROP TABLE IF EXISTS ", Table/binary, "_", (integer_to_binary(Day))/binary, ";">>.

%% Convert a gregorian day number into a "YYYY-MM-DD" string.
to_postgres_date(GDays) ->
  {YY, MM, DD} = calendar:gregorian_days_to_date(GDays),
  lists:flatten(io_lib:format("~w-~2..0w-~2..0w", [YY, MM, DD])).

%% The insert statement used for each event type.
query(initial_fun_top) ->
  fun_top_query("initial");
query(current_fun_top) ->
  fun_top_query("current");
query(app_top) ->
  app_top_query();
query(node_status) ->
  node_status_query();
query(proc_top) ->
  prc_query().

prc_query() ->
  <<"insert into prc (node, ts, pid, dreductions, dmemory, reductions, "
    "memory, message_queue_len, current_function, initial_call, "
    "registered_name, stack_size, heap_size, total_heap_size, current_stacktrace, group_leader) "
    "VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16);">>.

app_top_query() ->
  <<"insert into app_top (node, ts, application, red_abs, red_rel, memory, num_processes)"
    " VALUES ($1, $2, $3, $4, $5, $6, $7);">>.

%% Insert statement for the `initial_fun_top' / `current_fun_top' tables;
%% `Top' is the table name prefix ("initial" or "current").
fun_top_query(Top) ->
  iolist_to_binary(
    [ "insert into "
    , Top
    , "_fun_top(node, ts, fun, percent_processes) VALUES ($1, $2, $3, $4);"
    ]).

node_status_query() ->
  <<"insert into node_status (node, ts, data) VALUES ($1, $2, $3);">>.

%% Turn one event into the parameter list of the insert statement for
%% its type. The order of the elements follows the column order of the
%% corresponding statement.
params(Top, {Node, TS, Function, PercentProcesses})
  when Top =:= initial_fun_top orelse Top =:= current_fun_top ->
  [ atom_to_list(Node)
  , ts_to_timestamp(TS)
  , system_monitor_lib:fmt_mfa(Function)
  , PercentProcesses
  ];
params(app_top, #app_top{ app = App
                        , ts = TS
                        , red_abs = RedAbs
                        , red_rel = RedRel
                        , memory = Mem
                        , processes = NumProcesses
                        }) ->
  [ atom_to_binary(node(), latin1)
  , ts_to_timestamp(TS)
  , atom_to_binary(App, latin1)
  , RedAbs
  , RedRel
  , Mem
  , NumProcesses
  ];
params(node_status, {node_status, Node, TS, Bin}) ->
  [atom_to_list(Node), ts_to_timestamp(TS), Bin];
params(proc_top, #erl_top{ ts = TS
                         , pid = Pid
                         , dreductions = DR
                         , dmemory = DM
                         , reductions = R
                         , memory = M
                         , message_queue_len = MQL
                         , current_function = CF
                         , initial_call = IC
                         , registered_name = RN
                         , stack_size = SS
                         , heap_size = HS
                         , total_heap_size = THS
                         , current_stacktrace = CS
                         , group_leader = GL
                         }) ->
  [ atom_to_binary(node(), latin1)
  , ts_to_timestamp(TS)
  , Pid
  , DR
  , DM
  , R
  , M
  , MQL
  , system_monitor_lib:fmt_mfa(CF)
  , system_monitor_lib:fmt_mfa(IC)
  , name_to_list(RN)
  , SS
  , HS
  , THS
  , system_monitor_lib:fmt_stack(CS)
  , GL
  ].

%% Convert a timestamp in `?TS_UNIT' into a universal-time datetime tuple.
ts_to_timestamp(TS) ->
  calendar:system_time_to_universal_time(TS, ?TS_UNIT).

%% A printable latin1 string is returned unchanged; any other term is
%% rendered with ~p.
name_to_list(Term) ->
  name_to_list(io_lib:printable_latin1_list(Term), Term).

name_to_list(true, Str) ->
  Str;
name_to_list(false, Term) ->
  lists:flatten(io_lib:format("~p", [Term])).
--------------------------------------------------------------------------------
/src/system_monitor_sup.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------------------
%% Copyright 2022 k32
%% Copyright 2020 Klarna Bank AB
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------------------
-module(system_monitor_sup).

%% TODO: Dialyzer doesn't like this one:
%-behaviour(supervisor3).

%% External exports
-export([start_link/0, start_child/1]).

%% supervisor callbacks
-export([init/1, post_init/1]).

%%--------------------------------------------------------------------
%% Macros
%%--------------------------------------------------------------------
-define(SERVER, ?MODULE).
-define(SUP2, system_monitor2_sup).

%%%----------------------------------------------------------------------
%%% API
%%%----------------------------------------------------------------------

%% Start the top-level supervisor, registered locally as ?SERVER.
start_link() ->
  SupName = {local, ?SERVER},
  supervisor3:start_link(SupName, ?MODULE, ?SERVER).

%% Add the worker module `Name' as a child of the second-level supervisor.
start_child(Name) ->
  ChildSpec = worker(Name),
  supervisor3:start_child(?SUP2, ChildSpec).

%%%----------------------------------------------------------------------
%%% Callback functions from supervisor
%%%----------------------------------------------------------------------

%% Child spec with the default shutdown timeout of 2000.
server(Name, Type) ->
  server(Name, Type, 2000).

%% Child spec for module `Name', started through `Name:start_link()'.
%% NOTE(review): `{permanent, 15}' is presumably supervisor3's
%% "restart after a delay" form -- confirm against supervisor3.
server(Name, Type, Shutdown) ->
  StartMFA = {Name, start_link, []},
  {Name, StartMFA, {permanent, 15}, Shutdown, Type, [Name]}.

worker(Name) ->
  server(Name, worker).

post_init(_) ->
  ignore.

init(?SERVER) ->
  %% The top level supervisor *does not allow restarts*; if a component
  %% directly under this supervisor crashes, the entire node will shut
  %% down and restart. Thus, only those components that must never be
  %% unavailable should be directly under this supervisor.
  SecondSupStart = {supervisor3, start_link, [{local, ?SUP2}, ?MODULE, ?SUP2]},
  SecondSup = {?SUP2, SecondSupStart, permanent, 2000, supervisor, [?MODULE]},
  %% no restarts allowed!
  RestartStrategy = {one_for_one, 0, 1},
  {ok, {RestartStrategy, [SecondSup]}};
init(?SUP2) ->
  %% The second-level supervisor allows some restarts. This is where the
  %% normal services live.
  Services = [system_monitor_collector, system_monitor_events, system_monitor],
  {ok, {{one_for_one, 10, 20}, [worker(Service) || Service <- Services]}}.
--------------------------------------------------------------------------------
/src/system_monitor_top.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------
%% Copyright (c) 2022 k32. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(system_monitor_top).

%% API:
-export([empty/1, push/3, to_list/1]).

-export_type([top/0]).

-ifdef(TEST).
-include_lib("proper/include/proper.hrl").
-include_lib("eunit/include/eunit.hrl").
-endif. % TEST

%%================================================================================
%% Type declarations
%%================================================================================

%% A bounded collection of tuples. `data' maps a key to the list of
%% stored tuples having that key, `minimum' caches the smallest key in
%% `data', and `size' counts the stored tuples (at most `max_size').
-record(top,
        { minimum  :: non_neg_integer()
        , size     :: non_neg_integer()
        , max_size :: non_neg_integer()
        , data     :: gb_trees:tree(non_neg_integer(), [tuple()])
        }).

-opaque top() :: #top{}.

%%================================================================================
%% API functions
%%================================================================================

%% Create a top that holds at most `MaxItems' tuples.
-spec empty(non_neg_integer()) -> top().
empty(MaxItems) ->
  #top{ data     = gb_trees:empty()
      , max_size = MaxItems
      , size     = 0
      , minimum  = 0
      }.

%% All stored tuples, in ascending key order.
-spec to_list(top()) -> [tuple()].
to_list(#top{data = Data}) ->
  [Item || Items <- gb_trees:values(Data), Item <- Items].

-spec push(integer(), tuple(), top()) -> {Changed, top()}
          when Changed :: boolean().
%% Offer `Val' to the top; element `FieldID' of the tuple is its key.
%% Returns `{true, NewTop}' when the value was stored, and
%% `{false, Top}' when it was rejected.
%%
%% While there is room the value is always stored. Once the top is full,
%% the value is stored only when its key is greater than the current
%% minimum, and one value with the smallest key is evicted to make room.
push(_FieldID, _Val, Top = #top{max_size = 0}) ->
  {false, Top};
push(FieldID, Val, Top = #top{size = Size, max_size = MaxSize, data = Data0})
  when Size < MaxSize ->
  Data = gb_insert(element(FieldID, Val), Val, Data0),
  {Min, _} = gb_trees:smallest(Data),
  {true, Top#top{size = Size + 1, minimum = Min, data = Data}};
push(FieldID, Val, Top = #top{minimum = OldMin, data = Data0, max_size = MaxSize}) ->
  case element(FieldID, Val) of
    Key when OldMin < Key ->
      Data = gb_insert(Key, Val, drop_one_smallest(Data0)),
      {Min, _} = gb_trees:smallest(Data),
      {true, Top#top{minimum = Min, size = MaxSize, data = Data}};
    _ ->
      {false, Top}
  end.

%%================================================================================
%% Internal functions
%%================================================================================

%% Remove a single value stored under the smallest key. The key itself
%% disappears only when that was its last value.
drop_one_smallest(Tree0) ->
  case gb_trees:take_smallest(Tree0) of
    {_Key, [_], Tree}         -> Tree;
    {Key, [_ | Others], Tree} -> gb_trees:enter(Key, Others, Tree)
  end.

%% Store `Val' under `Key', in front of the values already stored there.
gb_insert(Key, Val, Tree) ->
  Vals = case gb_trees:lookup(Key, Tree) of
           {value, Existing} -> [Val | Existing];
           none              -> [Val]
         end,
  gb_trees:enter(Key, Vals, Tree).

%%%===================================================================
%%% Tests
%%%===================================================================

-ifdef(TEST).

tuples() ->
  list({non_neg_integer()}).

%% maybe_push_to_top function is just an optimized version
%% of sorting a list and then taking its first N elements.
%%
%% Check that it is indeed true
maybe_push_to_top_same_as_sort_prop() ->
  ?FORALL({NItems, L}, {range(0, 10), tuples()},
          ?IMPLIES(
             length(L) >= NItems,
             begin
               %% The NItems greatest tuples, in ascending order:
               Reference = lists:nthtail(length(L) - NItems, lists:sort(L)),
               Top = push_all(L, empty(NItems)),
               ?assertEqual(Reference, to_list(Top)),
               true
             end)).

%% Push every tuple of the list (keyed by its first element) into the top.
push_all(Items, Top0) ->
  lists:foldl( fun(Item, Acc0) ->
                   {_, Acc} = push(1, Item, Acc0),
                   Acc
               end
             , Top0
             , Items
             ).

maybe_push_to_top_test() ->
  Prop = proper:numtests(1000, maybe_push_to_top_same_as_sort_prop()),
  ?assertEqual(true, proper:quickcheck(Prop)).

-endif.
--------------------------------------------------------------------------------
/test/sysmon_SUITE.erl:
--------------------------------------------------------------------------------
%%--------------------------------------------------------------------
%% Copyright 2022 k32
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(sysmon_SUITE).

-compile(export_all).
-compile(nowarn_export_all).

-include("sysmon_int.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-include_lib("stdlib/include/assert.hrl").

%%================================================================================
%% behavior callbacks
%%================================================================================

%% Every exported function of arity 1 whose name starts with "t_" is a
%% test case.
all() ->
  [Name || {Name, 1} <- ?MODULE:module_info(exports),
           lists:prefix("t_", atom_to_list(Name))].

%% Set up the application environment shared by all test cases, start
%% the database container, and remember the environment so that
%% end_per_testcase/2 can restore it.
init_per_suite(Config) ->
  snabbkaffe:fix_ct_logging(),
  application:load(?APP),
  Env = [ {vips, [some_random_name|vips()]}
        , {top_sample_interval, 1000}
        , {tick_interval, 100}
        , {top_significance_threshold,
           #{ current_function => 0
            , initial_call     => 0
            , reductions       => 0
            , abs_reductions   => 0
            , memory           => 0
            , num_processes    => 1
            }}
        ],
  lists:foreach(fun({Key, Val}) -> application:set_env(?APP, Key, Val) end, Env),
  docker_cleanup(),
  ?assertMatch(0, docker_startup()),
  OldConf = application:get_all_env(?APP),
  [{old_conf, OldConf} | Config].

end_per_suite(_Config) ->
  docker_cleanup(),
  ok.

init_per_testcase(TestCase, Config) ->
  logger:notice(asciiart:visible($%, "Starting ~p", [TestCase])),
  Config.

%% Stop snabbkaffe and put back the environment saved by init_per_suite/1.
end_per_testcase(TestCase, Config) ->
  logger:notice(asciiart:visible($%, "Complete ~p", [TestCase])),
  snabbkaffe:stop(),
  lists:foreach(fun({K, V}) -> application:set_env(?APP, K, V) end,
                proplists:get_value(old_conf, Config)),
  Config.

%%================================================================================
%% Tests
%%================================================================================

%% Start the application under some process churn, wait for several
%% reports, and check the shape of the data returned by the top APIs.
t_start(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:ensure_all_started(?APP),
       spawn_procs(100, 1000, 10000),
       %% Wait several events:
       [?block_until(#{?snk_kind := sysmon_report_data}, infinity, 0) || _ <- lists:seq(1, 10)],
       %% The per-application tops are lists of {App, Number} pairs:
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_app_top()),
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_abs_app_top()),
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_app_memory()),
       ?assertMatch([{App, N}|_] when is_atom(App) andalso is_number(N), system_monitor:get_app_processes()),
       %% The function top is a map of lists of {MFA, Number} pairs:
       ?assertMatch( #{ initial_call     := [{{M1, F1, A1}, V1}|_]
                      , current_function := [{{M2, F2, A2}, V2}|_]
                      } when is_atom(M1) andalso is_atom(M2) andalso is_atom(F1) andalso is_atom(F2) andalso
                             is_number(A1) andalso is_number(A2) andalso is_number(V1) andalso is_number(V2)
                   , system_monitor:get_function_top()
                   )
     after
       application:stop(?APP)
     end,
     [ fun ?MODULE:check_produce_seal/1
     , fun ?MODULE:check_produce_vips/1
     ]).

%% With `top_max_procs' set to 1 the process top must contain the
%% "too_many_processes" placeholder entry, while the VIP processes are
%% still reported.
t_too_many_procs(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:set_env(?APP, top_max_procs, 1),
       application:ensure_all_started(?APP),
       ?block_until(#{?snk_kind := sysmon_report_data}),
       ProcTop = system_monitor:get_proc_top(),
       %% Check that "warning" process is there:
       ?assertMatch( #erl_top{pid = "!!!", group_leader = "!!!", registered_name = too_many_processes}
                   , lists:keyfind("!!!", #erl_top.pid, ProcTop)
                   ),
       %% Check that the VIPs are still there:
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(system_monitor_collector)),
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(system_monitor)),
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(application_controller)),
       %% Misc checks:
       ?assertMatch(false, system_monitor:get_proc_info(some_random_name))
     after
       application:stop(?APP)
     end,
     [ fun ?MODULE:check_produce_seal/1
     , fun ?MODULE:check_produce_vips/1
     ]).

%% VIPs added and removed at run time are reflected in the next report.
t_add_remove_vips(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:set_env(?APP, top_max_procs, 1),
       application:ensure_all_started(?APP),
       ?wait_async_action( begin
                             system_monitor:add_vip(global_name_server),
                             system_monitor:remove_vip(system_monitor)
                           end
                         , #{?snk_kind := sysmon_report_data}
                         ),
       ProcTop = system_monitor:get_proc_top(),
       %% Check that "warning" process is there:
       ?assertMatch( #erl_top{pid = "!!!", group_leader = "!!!", registered_name = too_many_processes}
                   , lists:keyfind("!!!", #erl_top.pid, ProcTop)
                   ),
       %% Check the VIPs:
       ?assertMatch(false, system_monitor:get_proc_info(system_monitor)),
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(application_controller)),
       ?assertMatch(#erl_top{}, system_monitor:get_proc_info(global_name_server))
     after
       application:stop(?APP)
     end,
     []).

%% Run with the Postgres backend and wait until both the
%% "too_many_processes" placeholder and the `system_monitor' process
%% have been stored; the trace specs then check every event type.
t_postgres(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       application:set_env(?APP, top_max_procs, 1),
       application:set_env(?APP, db_name, "postgres"),
       application:set_env(?APP, callback_mod, system_monitor_pg),
       application:ensure_all_started(?APP),
       link(whereis(system_monitor_pg)), % if it crashes we will know
       {ok, _} = ?block_until(#{?snk_kind := sysmon_produce, backend := pg, type := proc_top,
                                msg := Msg} when Msg#erl_top.registered_name =:= too_many_processes),
       {ok, _} = ?block_until(#{?snk_kind := sysmon_produce, backend := pg, type := proc_top,
                                msg := Msg} when Msg#erl_top.registered_name =:= system_monitor)
     after
       unlink(whereis(system_monitor_pg)),
       application:stop(?APP)
     end,
     [ fun ?MODULE:no_pg_query_failures/1
     , fun ?MODULE:success_proc_top_queries/1
     , fun ?MODULE:success_app_top_queries/1
     , fun ?MODULE:success_fun_top_queries/1
     , fun ?MODULE:success_node_status_queries/1
     ]).

%% The built-in status checks fire, and keep firing after a crashing
%% check has been added in front of them.
t_builtin_checks(_) ->
  ?check_trace(
     #{timetrap => 30000},
     try
       ProcessCount = erlang:system_info(process_count),
       application:set_env(?APP, suspect_procs_max_memory, 1),
       application:set_env(?APP, top_max_procs, ProcessCount * 2),
       application:set_env(?APP, node_status_fun, {?MODULE, node_status}),
       application:ensure_all_started(?APP),
       ?block_until(#{?snk_kind := "Abnormal process count"}),
       %% Now insert a failing status check, to verify that it doesn't
       %% affect the others:
       FailingCheck = {?MODULE, failing_check, false, 1},
       application:set_env(?APP, status_checks, [FailingCheck|?CFG(status_checks)]),
       system_monitor:reset(),
       ?block_until(#{?snk_kind := sysmon_failing_check_run}, infinity, 0),
       ?block_until(#{?snk_kind := "Abnormal process count"}, infinity, 0)
     after
       application:stop(?APP)
     end,
     []).
%% Injects synthetic `monitor' messages into `system_monitor_events'
%% (long_gc for a live pid, long_gc for a made-up pid, long_schedule
%% for a port) and waits for the corresponding "system monitor event"
%% trace point after each one.
t_events(_) ->
  ?check_trace(
     try
       %% Fail fast when the application cannot be started:
       {ok, _} = application:ensure_all_started(?APP),
       ?block_until(#{?snk_kind := sysmon_report_data}),
       %% Every element is a `{Key, Value}' pair, which is the shape of
       %% the long_gc info list documented for erlang:system_monitor/2:
       GCInfo = [{timeout, 100}, {heap_size, 42}, {heap_block_size, 42}, {stack_size, 42},
                 {mbuf_size, 42}, {old_heap_size, 42}, {old_heap_block_size, 42}],
       ?wait_async_action( system_monitor_events ! {monitor, whereis(system_monitor), long_gc, GCInfo}
                         , #{?snk_kind := "system monitor event", type := long_gc}
                         ),
       ?wait_async_action( system_monitor_events ! {monitor, list_to_pid("<0.42.42>"), long_gc, GCInfo}
                         , #{?snk_kind := "system monitor event", type := long_gc}
                         ),
       PortInfo = [{timeout, 42}, {port_op, timeout}],
       ?wait_async_action( system_monitor_events ! {monitor, hd(erlang:ports()), long_schedule, PortInfo}
                         , #{?snk_kind := "system monitor event", type := long_schedule}
                         )
     after
       application:stop(?APP)
     end,
     []).

%%================================================================================
%% Trace specs
%%================================================================================

%% Asserts that the trace contains no `system_monitor_pg_query_error'
%% events.
no_pg_query_failures(Trace) ->
  ?assertMatch([], ?of_kind(system_monitor_pg_query_error, Trace)).

%% True when the pg backend produced at least one `proc_top' event.
success_proc_top_queries(Trace) ->
  contains_type(proc_top, Trace).

%% True when the pg backend produced at least one `app_top' event.
success_app_top_queries(Trace) ->
  contains_type(app_top, Trace).

%% True when the pg backend produced both `initial_fun_top' and
%% `current_fun_top' events.
success_fun_top_queries(Trace) ->
  contains_type(initial_fun_top, Trace) andalso contains_type(current_fun_top, Trace).

%% True when the pg backend produced at least one `node_status' event.
success_node_status_queries(Trace) ->
  contains_type(node_status, Trace).

%% Returns `true' when Trace holds a `sysmon_produce' event emitted by
%% the pg backend whose `type' equals Type, `false' otherwise.
contains_type(Type, Trace) ->
  lists:search( ?match_event(#{?snk_kind := sysmon_produce, backend := pg, type := T}
                               when T =:= Type)
              , Trace
              ) =/= false.
%% Trace spec: asserts snabbkaffe's strict causality relation between
%% `node_status' produce events and `sysmon_report_data' events.
check_produce_seal(Trace) ->
  ?assert(
     ?strict_causality( #{?snk_kind := sysmon_produce, type := node_status}
                      , #{?snk_kind := sysmon_report_data}
                      , Trace
                      )).

%% Trace spec: for every name returned by vips/0, asserts strict
%% causality between the `proc_top' produce event for that registered
%% name and `sysmon_report_data' events.
check_produce_vips(Trace) ->
  [?assert(
      ?strict_causality( #{?snk_kind := sysmon_produce, type := proc_top, msg := Msg}
                           when Msg#erl_top.registered_name =:= VIP
                       , #{?snk_kind := sysmon_report_data}
                       , Trace
                       )) || VIP <- vips()],
  ok.

%%================================================================================
%% Internal functions
%%================================================================================

%% Status check that always crashes. It emits a trace point first so
%% that a test can observe that the check was actually run.
failing_check() ->
  ?tp(sysmon_failing_check_run, #{}),
  error(deliberate).

%% Spawns N processes running idle_loop/3 with the given sleep bounds
%% (milliseconds, as passed to timer:sleep/1).
spawn_procs(N, MinSleep, MaxSleep) ->
  Parent = self(),
  lists:foreach( fun(_) ->
                     erlang:spawn(?MODULE, idle_loop, [Parent, MinSleep, MaxSleep])
                 end
               , lists:seq(1, N)
               ).

%% Sleeps for MinSleep plus a random jitter, then replaces itself with
%% a freshly spawned process running the same function.
idle_loop(Parent, MinSleep, MaxSleep) ->
  timer:sleep(MinSleep + sleep_jitter(MaxSleep - MinSleep)),
  erlang:spawn(?MODULE, ?FUNCTION_NAME, [Parent, MinSleep, MaxSleep]).

%% Random value in 1..Range. rand:uniform/1 only accepts arguments
%% >= 1, so an empty or negative range (MaxSleep =< MinSleep) yields
%% no jitter instead of crashing the process.
sleep_jitter(Range) when Range > 0 ->
  rand:uniform(Range);
sleep_jitter(_Range) ->
  0.

%% Registered names used by check_produce_vips/1.
vips() ->
  [system_monitor, system_monitor_collector, application_controller].

%% Callback configured as `node_status_fun' in t_builtin_checks/1.
node_status() ->
  "this is my status".

%% Starts the postgres container (named `sysmondb') used by the
%% postgres test and returns the exit status of the docker command.
docker_startup() ->
  exec("docker run -d --name sysmondb -p 5432:5432 \\
         -e SYSMON_PASS=system_monitor_password \\
         -e GRAFANA_PASS=system_monitor_password \\
         -e POSTGRES_PASSWORD=system_monitor_password \\
         ghcr.io/k32/sysmon-postgres:1.0.0").

%% Kills and removes the `sysmondb' container. Returns the exit status
%% of the final `docker rm'.
docker_cleanup() ->
  exec("docker kill sysmondb"),
  exec("docker rm -f sysmondb").

%% Runs CMD through a port, echoes its output (see
%% collect_port_output/1) and returns the command's exit status.
-spec exec(file:filename()) -> integer().
exec(CMD) ->
  Port = open_port( {spawn, CMD}
                  , [ exit_status
                    , binary
                    , stderr_to_stdout
                    , {line, 300}
                    ]
                  ),
  collect_port_output(Port).
%% Drains the messages of Port: every received line of output is
%% echoed to the `user' device with a "docker: " prefix, and the loop
%% ends by returning the exit status reported by the port.
-spec collect_port_output(port()) -> integer().
collect_port_output(Port) ->
  receive
    {Port, {exit_status, Status}} ->
      Status;
    {Port, {data, {_EolFlag, Line}}} ->
      io:format(user, "docker: ~s~n", [Line]),
      collect_port_output(Port)
  end.