├── .formatter.exs ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.exs ├── dev.exs ├── prod.exs └── test.exs ├── lib ├── zen_monitor.ex └── zen_monitor │ ├── application.ex │ ├── local.ex │ ├── local │ ├── connector.ex │ ├── dispatcher.ex │ ├── supervisor.ex │ └── tables.ex │ ├── metrics.ex │ ├── proxy.ex │ ├── proxy │ ├── batcher.ex │ ├── supervisor.ex │ └── tables.ex │ ├── supervisor.ex │ └── truncator.ex ├── mix.exs ├── mix.lock └── test ├── black_box_test.exs ├── local ├── connector_test.exs ├── dispatcher_test.exs └── local_test.exs ├── proxy ├── batcher_test.exs └── proxy_test.exs ├── stress_test.exs ├── support ├── child_node.ex ├── observable_gen.ex └── subscriber.ex ├── test_helper.exs └── truncator_test.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | [ 2 | inputs: [ 3 | "lib/**/*.{ex,exs}", 4 | "test/**/*.{ex,exs}", 5 | "config/**/*.exs", 6 | "mix.exs" 7 | ], 8 | ] 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | name: Build and test 12 | runs-on: ubuntu-20.04 13 | strategy: 14 | matrix: 15 | include: 16 | - elixir-version: 1.7.4 17 | otp-version: 20.3 18 | - elixir-version: 1.7.4 19 | otp-version: 21.3 20 | - elixir-version: 1.11.4 21 | otp-version: 21.3 22 | - elixir-version: 1.11.4 23 | otp-version: 24.3 24 | - elixir-version: 1.12.3 25 | otp-version: 24.3 26 | - elixir-version: 1.13.3 27 | otp-version: 24.3 28 | - elixir-version: 1.13.3 29 | otp-version: 25.0 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Set up Elixir 33 | uses: erlef/setup-beam@v1 34 | with: 35 | elixir-version: ${{ matrix.elixir-version }} 36 | otp-version: ${{ matrix.otp-version }} 37 | - name: Restore dependencies cache 38 | uses: actions/cache@v2 39 | with: 40 | path: deps 41 | key: ${{ runner.os }}-${{ matrix.elixir-version }}-${{ matrix.otp-version}}-mix-${{ hashFiles('**/mix.lock') }} 42 | restore-keys: ${{ runner.os }}-${{ matrix.elixir-version}}-${{ matrix.otp-version }}-mix- 43 | - name: Start EPMD 44 | run: epmd -daemon 45 | - name: Install dependencies 46 | run: mix deps.get 47 | - name: Run tests 48 | run: mix test 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 
20 | *.ez 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Discord 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZenMonitor 2 | 3 | [![CI](https://github.com/discord/zen_monitor/workflows/CI/badge.svg)](https://github.com/discord/zen_monitor/actions) 4 | [![Hex.pm Version](http://img.shields.io/hexpm/v/zen_monitor.svg?style=flat)](https://hex.pm/packages/zen_monitor) 5 | [![Hex.pm License](http://img.shields.io/hexpm/l/zen_monitor.svg?style=flat)](https://hex.pm/packages/zen_monitor) 6 | [![HexDocs](https://img.shields.io/badge/HexDocs-Yes-blue)](https://hexdocs.pm/zen_monitor) 7 | 8 | ZenMonitor allows for the efficient monitoring of remote processes with minimal use of ERTS 9 | Distribution. 10 | 11 | ## Installation 12 | 13 | Add `ZenMonitor` to your dependencies 14 | 15 | ```elixir 16 | def deps do 17 | [ 18 | {:zen_monitor, "~> 2.1.0"} 19 | ] 20 | end 21 | ``` 22 | 23 | ## Using ZenMonitor 24 | 25 | ZenMonitor strives to be a drop-in replacement for `Process.monitor/1`. To those ends, the 26 | programming interface and all the complexities of how it carries out its task are simplified by a 27 | simple unified programming interface. All the functions that the caller needs to use have 28 | convenient delegates available in the top-level `ZenMonitor` module. The interface is detailed 29 | below. 30 | 31 | ### ZenMonitor.monitor/1 32 | 33 | This is a drop-in replacement for `Process.monitor/1` when it comes to processes. It is 34 | compatible with the various ways that `Process.monitor/1` can establish monitors and will accept 35 | one of a `pid`, a `name` which is the `atom` that a local process is registered under, or a tuple 36 | of `{name, node}` for a registered process on a remote node. These are defined as the 37 | `ZenMonitor.destination` type. 38 | 39 | `ZenMonitor.monitor/1` returns a standard reference that can be used to `demonitor` and can be 40 | matched against the reference provided in the `:DOWN` message. 
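A minimal sketch of the full round trip (the `:worker` name and `:"app@remote"` node are
hypothetical placeholders):

```elixir
ref = ZenMonitor.monitor({:worker, :"app@remote"})

receive do
  {:DOWN, ^ref, :process, _pid, {:zen_monitor, reason}} ->
    IO.inspect(reason, label: "monitored process went down")
end
```

Note that the reason arrives wrapped in a `{:zen_monitor, reason}` tuple, as described under
"Handling Down Messages" below.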
41 | 
42 | Similar to `Process.monitor/1`, the caller is allowed to monitor the same process multiple times;
43 | each monitor will be provided with a unique reference, and all monitors will fire `:DOWN` messages
44 | when the monitored process goes down. Even though the caller can establish multiple monitors,
45 | ZenMonitor is designed to handle this efficiently; the only cost is an additional ETS row on the
46 | local node and additional processing time at fan-out.
47 | 
48 | ### ZenMonitor.demonitor/2
49 | 
50 | This is a mostly drop-in replacement for `Process.demonitor/2` when it comes to processes. The
51 | first argument is the reference returned by `ZenMonitor.monitor/1`. It accepts a list of option
52 | atoms, but only honors the `:flush` option at this time. Passing the `:info` option is allowed
53 | but has no effect; this function always returns `true`.
54 | 
55 | ### ZenMonitor.compatibility/1
56 | 
57 | When operating in a mixed environment where some nodes are ZenMonitor compatible and some are not,
58 | it may be necessary to check the compatibility of a remote node. `ZenMonitor.compatibility/1`
59 | accepts any `ZenMonitor.destination` and will report back one of `:compatible` or `:incompatible`
60 | for the remote's cached compatibility status.
61 | 
62 | All remotes start off as `:incompatible` until a positively acknowledged connection is
63 | established. See the `ZenMonitor.connect/1` function for more information on connecting nodes.
64 | 
65 | ### ZenMonitor.compatibility_for_node/1
66 | 
67 | Performs the same operation as `ZenMonitor.compatibility/1` but it accepts a node atom instead of
68 | a `ZenMonitor.destination`.
69 | 
70 | ### ZenMonitor.connect/1
71 | 
72 | Attempts a positive connection with the provided remote node. Connections are established by
73 | using the configured gen module's `call` function to send a `:ping` message to the process registered
74 | under the atom `ZenMonitor.Proxy` on the remote. If this process responds with a `:pong` atom
75 | then the connection is positively established and the node is marked as `:compatible`. Any other
76 | response or error condition (timeout / noproc / etc.) will be considered negative acknowledgement.
77 | 
78 | `ZenMonitor.connect/1` is actually a delegate for `ZenMonitor.Local.Connector.connect/1`; see the
79 | documentation there for more information about how connect behaves.
80 | 
81 | ### Handling Down Messages
82 | 
83 | Any `:DOWN` message receivers (most commonly `GenServer.handle_info/2` callbacks) that match on
84 | the reason should be updated to include an outer `{:zen_monitor, original_match}` wrapper.
85 | 
86 | ```elixir
87 | def handle_info({:DOWN, ref, :process, pid, :specific_reason}, state) do
88 |   ...
89 | end
90 | ```
91 | 
92 | This should be updated to the following.
93 | 
94 | ```elixir
95 | def handle_info({:DOWN, ref, :process, pid, {:zen_monitor, :specific_reason}}, state) do
96 |   ...
97 | end
98 | ```
99 | 
100 | ## Why?
101 | 
102 | `ZenMonitor` was developed at [Discord](https://discordapp.com) to improve the stability of our
103 | real-time communications infrastructure. `ZenMonitor` improves stability in a couple of
104 | different ways.
105 | 
106 | ### Traffic Calming
107 | 
108 | When a process is being monitored by a large number of remote processes, that process going down
109 | can cause both the node hosting the downed process and the node hosting the monitoring processes
110 | to be suddenly flooded with a large amount of work.
This is commonly referred to as a 111 | thundering herd and can overwhelm either node depending on the situation. 112 | 113 | ZenMonitor relies on interval batching and `GenStage` to help calm the deluge into a throttled 114 | stream of `:DOWN` messages that may take more wall clock time to process but has more predictable 115 | scheduler utilization and network consumption. 116 | 117 | ### Message Interspersing 118 | 119 | In the inverse scenario, a single process monitoring a large number of remote processes, a 120 | systemic failure of a large number of monitored processes can result in blocking the message 121 | queue. This can cause other messages being sent to the process to backup behind the `:DOWN` 122 | messages. 123 | 124 | Here's what a message queue might look like if 100,000 monitors fired due to node failure. 125 | 126 | ``` 127 | +------------------------------------------------+ 128 | | {:DOWN, ref, :process, pid_1, :nodedown} | 129 | +------------------------------------------------+ 130 | | {:DOWN, ref, :process, pid_2, :nodedown} | 131 | +------------------------------------------------+ 132 | ... snip 99,996 messages ... 133 | +------------------------------------------------+ 134 | | {:DOWN, ref, :process, pid_99_999, :nodedown} | 135 | +------------------------------------------------+ 136 | | {:DOWN, ref, :process, pid_100_000, :nodedown} | 137 | +------------------------------------------------+ 138 | | :work | 139 | +------------------------------------------------+ 140 | | :work | 141 | +------------------------------------------------+ 142 | | :work | 143 | +------------------------------------------------+ 144 | ... etc ... 145 | ``` 146 | 147 | The process has to process the 100,000 `:DOWN` messages before it can get back to doing work, if 148 | the processing of a `:DOWN` message is non-trivial then this could result in the process 149 | effectively appearing unresponsive to callers expecting it to do `:work`. 150 | 151 | `ZenMonitor.Local.Dispatcher` provides a configurable batch sweeping system that dispatches a 152 | fixed demand_amount of `:DOWN` messages every demand_interval (See the documentation for 153 | `ZenMonitor.Local.Dispatcher` for configuration and defaults). Using `ZenMonitor` the message 154 | queue would look like this. 155 | 156 | ``` 157 | +------------------------------------------------+ 158 | | {:DOWN, ref, :process, pid_1, :nodedown} | 159 | +------------------------------------------------+ 160 | ... snip 4,998 messages ... 161 | +------------------------------------------------+ 162 | | {:DOWN, ref, :process, pid_5000, :nodedown} | 163 | +------------------------------------------------+ 164 | | :work | 165 | +------------------------------------------------+ 166 | ... snip messages during demand_interval ... 167 | +------------------------------------------------+ 168 | | :work | 169 | +------------------------------------------------+ 170 | | {:DOWN, ref, :process, pid_5001, :nodedown} | 171 | +------------------------------------------------+ 172 | ... snip 4,998 messages ... 173 | +------------------------------------------------+ 174 | | {:DOWN, ref, :process, pid_10_000, :nodedown} | 175 | +------------------------------------------------+ 176 | | :work | 177 | +------------------------------------------------+ 178 | ... snip messages during demand_interval ... 179 | +------------------------------------------------+ 180 | | :work | 181 | +------------------------------------------------+ 182 | ... etc ... 
183 | ```
184 | 
185 | This means that the process can continue processing work messages while working through more
186 | manageable batches of `:DOWN` messages; this improves the effective responsiveness of the process.
187 | 
188 | ### Message Truncation
189 | 
190 | `:DOWN` messages include a `reason` field that can include large stack traces and GenServer state
191 | dumps. Large `reason`s generally don't pose an issue, but in a scenario where thousands of
192 | processes are monitoring a process that generates a large `reason`, the cumulative effect of
193 | duplicating the large `reason` to each monitoring process can consume all available memory on a
194 | node.
195 | 
196 | When a `:DOWN` message is received for dispatch to remote subscribers, the first step is to
197 | truncate the message using `ZenMonitor.Truncator`; see the module documentation for more
198 | information about how truncation is performed and what configuration options are supported.
199 | 
200 | This prevents the scenario where a single process with a large stack trace or large state gets
201 | amplified on the receiving node and consumes a large amount of memory.
202 | 
203 | ## Design
204 | 
205 | ZenMonitor is constructed of two cooperating systems, the _Local ZenMonitor System_ and the
206 | _Proxy ZenMonitor System_. When a process wishes to monitor a remote process, it should inform
207 | the _Local ZenMonitor System_, which will efficiently dispatch the monitoring request to the remote
208 | node's _Proxy ZenMonitor System_.
209 | 
210 | ### Local ZenMonitor System
211 | 
212 | The _Local ZenMonitor System_ is composed of a few processes; these are managed by the
213 | `ZenMonitor.Local.Supervisor`. The processes that comprise the _Local ZenMonitor System_ are
214 | described in detail in the following section.
215 | 
216 | #### ZenMonitor.Local
217 | 
218 | ZenMonitor.Local is responsible for accepting monitoring and demonitoring requests from local
219 | processes. It will send these requests to the Connector processes for efficient transmission
220 | to the responsible ZenMonitor.Proxy processes.
221 | 
222 | When a monitored process dies, the ZenMonitor.Proxy will send this information in a summary
223 | message to the ZenMonitor.Local.Connector process, which will use it to send down_dispatches to
224 | ZenMonitor.Local for eventual delivery by the ZenMonitor.Local.Dispatcher.
225 | 
226 | ZenMonitor.Local is also responsible for monitoring the local interested process and performing
227 | clean-up if the local interested process crashes for any reason; this prevents the Local
228 | ZenMonitor System from leaking memory.
229 | 
230 | #### ZenMonitor.Local.Tables
231 | 
232 | This is a simple process that is responsible for owning shared ETS tables used by various parts of
233 | the Local ZenMonitor System.
234 | 
235 | It maintains two tables, `ZenMonitor.Local.Tables.Nodes` and `ZenMonitor.Local.Tables.References`.
236 | These tables are public and are normally written to and read from by the ZenMonitor.Local and
237 | ZenMonitor.Local.Connector processes.
238 | 
239 | #### ZenMonitor.Local.Connector
240 | 
241 | ZenMonitor.Local.Connector is responsible for batching monitoring requests into summary requests
242 | for the remote ZenMonitor.Proxy. The Connector handles the actual distribution connection to the
243 | remote ZenMonitor.Proxy, including dealing with incompatible and down nodes.
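The connection handling described here is what backs the top-level `ZenMonitor.connect/1` and
`ZenMonitor.compatibility/1` delegates. A minimal sketch of using them together (the node and
process names are hypothetical placeholders):

```elixir
case ZenMonitor.connect(:"app@remote") do
  :compatible ->
    # The remote is running a compatible ZenMonitor.Proxy; monitors will be delivered
    ZenMonitor.monitor({:worker, :"app@remote"})

  :incompatible ->
    # The remote is unreachable or not running ZenMonitor
    :ignore
end
```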
244 | 
245 | When processes go down on the remote node, the Proxy ZenMonitor System will report summaries of
246 | these down processes to the corresponding ZenMonitor.Local.Connector.
247 | 
248 | There will be one ZenMonitor.Local.Connector per remote node with monitored processes.
249 | 
250 | #### ZenMonitor.Local.Dispatcher
251 | 
252 | When a remote node or remote processes fail, messages will be enqueued for delivery. The
253 | ZenMonitor.Local.Dispatcher is responsible for processing these enqueued messages at a steady and
254 | controlled rate.
255 | 
256 | ### Proxy ZenMonitor System
257 | 
258 | The _Proxy ZenMonitor System_ is composed of a few processes; these are managed by the
259 | `ZenMonitor.Proxy.Supervisor`. The processes that comprise the _Proxy ZenMonitor System_ are
260 | described in detail in the following section.
261 | 
262 | #### ZenMonitor.Proxy
263 | 
264 | `ZenMonitor.Proxy` is responsible for handling subscription requests from the
265 | _Local ZenMonitor System_ and for maintaining the ERTS Process Monitors on the processes local to
266 | the remote node.
267 | 
268 | `ZenMonitor.Proxy` is designed to be efficient with local monitors and will guarantee that for any
269 | local process there is, at most, one ERTS monitor, no matter how many remote processes and remote
270 | nodes are interested in monitoring that process.
271 | 
272 | When a local process goes down, `ZenMonitor.Proxy` will enqueue a new death certificate to the
273 | `ZenMonitor.Proxy.Batcher` processes that correspond to the interested remotes.
274 | 
275 | #### ZenMonitor.Proxy.Tables
276 | 
277 | This is a simple process that is responsible for owning shared ETS tables used by various parts of
278 | the _Proxy ZenMonitor System_.
279 | 
280 | It maintains a single table, `ZenMonitor.Proxy.Tables.Subscribers`. This table is used by both
281 | the `ZenMonitor.Proxy` and `ZenMonitor.Proxy.Batcher` processes.
282 | 
283 | 
284 | #### ZenMonitor.Proxy.Batcher
285 | 
286 | This process has two primary responsibilities: collecting and summarizing death certificates, and
287 | monitoring the remote `ZenMonitor.Local.Connector` that it serves.
288 | 
289 | For every remote `ZenMonitor.Local.Connector` that is interested in monitoring processes on this
290 | node, a corresponding `ZenMonitor.Proxy.Batcher` is spawned that will collect and ultimately
291 | deliver death certificates. The `ZenMonitor.Proxy.Batcher` will also monitor the remote
292 | `ZenMonitor.Local.Connector` and clean up after it if it goes down for any reason.
293 | 
294 | ## Running a Compatible Node
295 | 
296 | ZenMonitor ships with an Application, `ZenMonitor.Application`, which will start the overall
297 | supervisor, `ZenMonitor.Supervisor`. This creates a supervision tree as outlined below.
298 | 299 | ``` 300 | ------------------------- 301 | +----| ZenMonitor.Local.Tables | 302 | | ------------------------- 303 | | 304 | | ------------------ 305 | +----| ZenMontior.Local | 306 | ----------------------------- | ------------------ 307 | +----| ZenMonitor.Local.Supervisor |----| 308 | | ----------------------------- | ------------- ---------------------------- 309 | | +----| GenRegistry |--N--| ZenMonitor.Local.Connector | 310 | | | ------------- ---------------------------- 311 | | | 312 | | | ----------------------------- 313 | | +----| ZenMonitor.Local.Dispatcher | 314 | | ----------------------------- 315 | ----------------------- | 316 | | ZenMonitor.Supervisor |----| 317 | ----------------------- | ------------------------- 318 | | +----| ZenMonitor.Proxy.Tables | 319 | | | ------------------------- 320 | | | 321 | | ----------------------------- | ------------------ 322 | +----| ZenMonitor.Proxy.Supervisor |----+----| ZenMonitor.Proxy | 323 | ----------------------------- | ------------------ 324 | | 325 | | ------------- -------------------------- 326 | +----| GenRegistry |--M--| ZenMonitor.Proxy.Batcher | 327 | ------------- -------------------------- 328 | ``` 329 | 330 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :zen_monitor, 4 | gen_module: GenServer, 5 | connector_sweep_interval: 100, 6 | batcher_sweep_interval: 100, 7 | demand_interval: 100, 8 | demand_amount: 1000, 9 | max_binary_size: 1024, 10 | truncation_depth: 3 11 | 12 | import_config "#{Mix.env()}.exs" 13 | -------------------------------------------------------------------------------- /config/dev.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | -------------------------------------------------------------------------------- /config/prod.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | -------------------------------------------------------------------------------- /config/test.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :zen_monitor, 4 | connector_sweep_interval: 10, 5 | batcher_sweep_interval: 10, 6 | demand_interval: 10, 7 | demand_amount: 1000 8 | 9 | config :logger, :console, 10 | format: "$time [$level] $levelpad | $metadata | $message\n", 11 | metadata: [:module, :function, :line] 12 | -------------------------------------------------------------------------------- /lib/zen_monitor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor do 2 | @moduledoc """ 3 | ZenMonitor provides efficient monitoring of remote processes and controlled dissemination of 4 | any resulting `:DOWN` messages. 5 | 6 | This module provides a convenient client interface which aims to be a drop in replacement for 7 | `Process.monitor/1` and `Process.demonitor/2` 8 | 9 | # Known differences between ZenMonitor and Process 10 | 11 | - `ZenMonitor.demonitor/2` has the same signature as Process.demonitor/2 but does not respect 12 | the `:info` option. 13 | 14 | - ZenMonitor aims to be efficient over distribution, one of the main strategies for achieving 15 | this is relying mainly on local monitors and then batching up all changes over a time period 16 | to be sent as a single message. 
This design means that additional latency is added to the 17 | delivery of down messages in pursuit of the goal. Where `Process.monitor/1` on a remote 18 | process will provide a :DOWN message as soon as possible, `ZenMonitor.monitor/1` on a remote 19 | process will actually have a number of batching periods to go through before the message 20 | arrives at the monitoring process, here are all the points that add latency. 21 | 22 | 1. When the monitor is enqueued it has to wait until the next sweep happens in the 23 | `ZenMonitor.Local.Connector` until it will be delivered to the `ZenMonitor.Proxy`. 24 | 1. The monitor arrives at the `ZenMonitor.Proxy`, the process crashes and the ERTS `:DOWN` 25 | message is delivered. This will be translated into a death_certificate and sent to a 26 | `ZenMonitor.Proxy.Batcher` for delivery. It will have to wait until the next sweep 27 | happens for it to be sent back to the `ZenMonitor.Local.Connector` for fan-out. 28 | 1. The dead summary including the death_certificate arrives at the 29 | `ZenMonitor.Local.Connector` and a down_dispatch is created for it and enqueued with the 30 | `ZenMonitor.Local`. 31 | 1. The down_dispatch waits in a queue until the `ZenMonitor.Local.Dispatcher` generates 32 | more demand. 33 | 1. Once demand is generated, `ZenMonitor.Local` will hand off the down_dispatch for actual 34 | delivery by `ZenMonitor.Local.Dispatcher`. 35 | 36 | * Steps 1 and 3 employ a strategy of batch sizing to prevent the message from growing too 37 | large. The batch size is controlled by application configuration and is alterable at boot 38 | and runtime. This means though that Steps 1 and 3 can be delayed by N intervals 39 | where `N = ceil(items_ahead_of_event / chunk_size)` 40 | * Step 4 employs a similar batching strategy, a down_dispatch will wait in queue for up to N 41 | intervals where `N = ceil(items_ahead_of_dispatch / chunk_size)` 42 | 43 | - `ZenMonitor` decorates the reason of the `:DOWN` message. If a remote process goes down 44 | because of `original_reason`, this will get decorated as `{:zen_monitor, original_reason}` 45 | when delivered by ZenMonitor. This allows the receiver to differentiate `:DOWN` messages 46 | originating from `ZenMonitor.monitor/1` and those originating from `Process.monitor/1`. 47 | This is necessary when operating in mixed mode. It is the responsibility of the receiver to 48 | unwrap this reason if it requires the `original_reason` for some additional handling of the 49 | `:DOWN` message. 50 | """ 51 | 52 | @gen_module GenServer 53 | 54 | @typedoc """ 55 | `ZenMonitor.destination` are all the types that can be monitored. 
56 | 57 | - `pid()` either local or remote 58 | - `{name, node}` represents a named process on the given node 59 | - `name :: atom()` is a named process on the local node 60 | """ 61 | @type destination :: pid() | ({name :: atom, node :: node()}) | (name :: atom()) 62 | 63 | ## Delegates 64 | 65 | @doc """ 66 | Delegate to `ZenMonitor.Local.compatibility/1` 67 | """ 68 | defdelegate compatibility(target), to: ZenMonitor.Local 69 | 70 | @doc """ 71 | Delegate to `ZenMonitor.Local.compatibility_for_node/1` 72 | """ 73 | defdelegate compatibility_for_node(remote), to: ZenMonitor.Local 74 | 75 | @doc """ 76 | Delegate to `ZenMonitor.Local.Connector.connect/1` 77 | """ 78 | defdelegate connect(remote), to: ZenMonitor.Local.Connector 79 | 80 | @doc """ 81 | Delegate to `ZenMonitor.Local.demonitor/2` 82 | """ 83 | defdelegate demonitor(ref, options \\ []), to: ZenMonitor.Local 84 | 85 | @doc """ 86 | Delegate to `ZenMonitor.Local.monitor/1` 87 | """ 88 | defdelegate monitor(target), to: ZenMonitor.Local 89 | 90 | ## Client 91 | 92 | @doc """ 93 | Get the module to use for gen calls from the Application Environment 94 | 95 | This module only needs to support `GenServer.call/3` and `GenServer.cast/2` functionality, see 96 | ZenMonitor's `@gen_module` for the default value 97 | 98 | This can be controlled at boot and runtime with the `{:zen_monitor, :gen_module}` setting, see 99 | `ZenMonitor.gen_module/1` for runtime convenience functionality. 100 | """ 101 | @spec gen_module() :: atom 102 | def gen_module do 103 | Application.get_env(:zen_monitor, :gen_module, @gen_module) 104 | end 105 | 106 | @doc """ 107 | Put the module to use for gen calls into the Application Environment 108 | 109 | This is a simple convenience function for overwriting the `{:zen_monitor, :gen_module}` setting 110 | at runtime. 111 | """ 112 | @spec gen_module(value :: atom) :: :ok 113 | def gen_module(value) do 114 | Application.put_env(:zen_monitor, :gen_module, value) 115 | end 116 | 117 | @doc """ 118 | Get the current monotonic time in milliseconds 119 | 120 | This is a helper because `System.monotonic_time(:milliseconds)` is long and error-prone to 121 | type in multiple call sites. 122 | 123 | See `System.monotonic_time/1` for more information. 124 | """ 125 | @spec now() :: integer 126 | def now do 127 | System.monotonic_time(:millisecond) 128 | end 129 | 130 | @doc """ 131 | Find the node for a destination. 132 | """ 133 | @spec find_node(target :: destination) :: node() 134 | def find_node(pid) when is_pid(pid), do: node(pid) 135 | def find_node({_, node}), do: node 136 | def find_node(_), do: Node.self() 137 | end 138 | -------------------------------------------------------------------------------- /lib/zen_monitor/application.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Application do 2 | @moduledoc """ 3 | OTP Application that acts as the entry point for ZenMonitor. 4 | 5 | This Application will start all necessary processes for a node to be a compatible ZenMonitor 6 | node and to communicate with other compatible ZenMonitor nodes. 7 | 8 | See `ZenMonitor.Supervisor` for more information. 
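If ZenMonitor is not run as an OTP application, a minimal sketch of supervising it manually is to
place `ZenMonitor.Supervisor` under your own supervisor, which mirrors what `start/2` below does:

    children = [ZenMonitor.Supervisor]
    Supervisor.start_link(children, strategy: :one_for_one)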
9 | """ 10 | use Application 11 | 12 | alias ZenMonitor.Metrics 13 | 14 | def start(_type, _args) do 15 | children = [ 16 | ZenMonitor.Supervisor 17 | ] 18 | 19 | Metrics.register() 20 | 21 | Supervisor.start_link(children, strategy: :one_for_one) 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/zen_monitor/local.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local do 2 | @moduledoc """ 3 | ZenMonitor.Local 4 | 5 | Most of the actual logic of monitoring and fan-out is handled by `ZenMonitor.Local.Connector`, 6 | see that module for more information. 7 | 8 | `ZenMonitor.Local` is responsible for monitoring the subscribing local processes and cleaning up 9 | monitors if they crash. 10 | """ 11 | use GenStage 12 | use Instruments.CustomFunctions, prefix: "zen_monitor.local" 13 | alias ZenMonitor.Local.{Connector, Tables} 14 | 15 | @typedoc """ 16 | Effective compatibility of a remote node 17 | """ 18 | @type compatibility :: :compatible | :incompatible 19 | 20 | @typedoc """ 21 | Represents a future down dispatch for a given pid to be delivered by 22 | `ZenMonitor.Local.Dispatcher` 23 | """ 24 | @type down_dispatch :: {pid, {:DOWN, reference, :process, pid, {:zen_monitor, any}}} 25 | 26 | @subscribers_table Module.concat(__MODULE__, "Subscribers") 27 | @hibernation_threshold 1_000 28 | 29 | defmodule State do 30 | @moduledoc """ 31 | Maintains the internal state for ZenMonitor.Local 32 | 33 | - `subscribers` is an ETS table that tracks local subscribers to prevent multiple monitors 34 | - `batch` is the queue of messages awaiting delivery to ZenMonitor.Local.Dispatcher 35 | - `length` is the current length of the batch queue (calculating queue length is an O(n) 36 | operation, it is simple to track it as elements are added / removed) 37 | - `queue_emptied` is the number of times the queue has been emptied. 
Once this number 38 | exceeds the hibernation_threshold (see `hibernation_threshold/0`) the process will 39 | hibernate 40 | """ 41 | 42 | @type t :: %__MODULE__{ 43 | subscribers: :ets.tid(), 44 | length: integer, 45 | queue_emptied: integer, 46 | batch: :queue.queue() 47 | } 48 | defstruct [ 49 | :subscribers, 50 | length: 0, 51 | queue_emptied: 0, 52 | batch: :queue.new() 53 | ] 54 | end 55 | 56 | ## Delegates 57 | 58 | defdelegate compatibility_for_node(remote), to: ZenMonitor.Local.Connector, as: :compatibility 59 | 60 | ## Client 61 | 62 | def start_link(_opts \\ []) do 63 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 64 | end 65 | 66 | @doc """ 67 | Begin monitoring the given process 68 | 69 | Has the same semantics as `Process.monitor/1`, DOWN messages will be delivered 70 | at a pace controlled by the :zen_monitor, :demand_interval and 71 | :zen_monitor, :demand_amount environment variables 72 | """ 73 | @spec monitor(target :: ZenMonitor.destination()) :: reference 74 | def monitor(target) do 75 | increment("monitor") 76 | ref = make_ref() 77 | me = self() 78 | 79 | # Write the reference out 80 | :ets.insert(Tables.references(), {{me, ref}, target}) 81 | 82 | # Enqueue the monitor into the Connector for async monitor 83 | Connector.monitor(target, ref, me) 84 | 85 | # Perform reciprocal monitoring (if needed) 86 | unless :ets.member(@subscribers_table, me) do 87 | GenStage.cast(__MODULE__, {:monitor_subscriber, me}) 88 | end 89 | 90 | # Return the reference to the caller 91 | ref 92 | end 93 | 94 | @doc """ 95 | Stop monitoring a process by monitor reference 96 | 97 | Has the same semantics as `Process.demonitor/2` (although you can pass the `:info` option, it 98 | has no effect and is not honored, `:flush` is honored) 99 | To demonitor a process you should pass in the reference returned from 100 | `ZenMonitor.Local.monitor/1` for the given process 101 | """ 102 | @spec demonitor(ref :: reference, options :: [:flush]) :: true 103 | def demonitor(ref, options \\ []) when is_reference(ref) do 104 | increment("demonitor") 105 | me = self() 106 | 107 | # First consume the reference 108 | case :ets.take(Tables.references(), {me, ref}) do 109 | [] -> 110 | # Unknown reference, maybe it's been dispatched, consume any :DOWN messages in the inbox 111 | # if :flush is provided. Dispatch atomically consumes the reference, which is why we only 112 | # need to scan the inbox if we don't find a reference. 113 | if :flush in options do 114 | receive do 115 | {:DOWN, ^ref, _, _, _} -> nil 116 | after 117 | 0 -> 118 | nil 119 | end 120 | end 121 | 122 | :ok 123 | 124 | [{{^me, ^ref}, pid}] -> 125 | # Instruct the Connector to demonitor the monitor 126 | Connector.demonitor(pid, ref) 127 | end 128 | 129 | true 130 | end 131 | 132 | @doc """ 133 | Check the compatiblity of the remote node that owns the provided destination 134 | 135 | This is a simple convenience function that looksup the node for the destination and then calls 136 | `ZenMonitor.Local.compatiblity_for_node/1` 137 | """ 138 | @spec compatibility(target :: ZenMonitor.destination()) :: compatibility 139 | def compatibility(target) do 140 | target 141 | |> ZenMonitor.find_node() 142 | |> compatibility_for_node() 143 | end 144 | 145 | @doc """ 146 | Asynchronously enqueue a list of down dispatches for delivery by the Dispatcher 147 | 148 | If called with the empty list, cast will be suppressed. 
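For illustration, each element of `messages` is a `down_dispatch` of the shape below (the pids
and reference are placeholders produced elsewhere by ZenMonitor):

    {subscriber_pid, {:DOWN, ref, :process, remote_pid, {:zen_monitor, reason}}}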
149 | """ 150 | @spec enqueue(messages :: [down_dispatch]) :: :ok 151 | def enqueue([]), do: :ok 152 | 153 | def enqueue(messages) do 154 | GenStage.cast(__MODULE__, {:enqueue, messages}) 155 | end 156 | 157 | @doc """ 158 | Synchronously checks the length of the ZenMonitor.Local's internal batch 159 | """ 160 | @spec batch_length() :: integer() 161 | def batch_length do 162 | GenStage.call(__MODULE__, :batch_length) 163 | end 164 | 165 | @doc """ 166 | Gets the hibernation threshold from the Application Environment 167 | 168 | Every time the demand empties the queue a counter is incremented. When this counter exceeds the 169 | hibernation threshold the ZenMonitor.Local process will be sent into hibernation. See 170 | ZenMonitor.Local's @hibernation_threshold for the default value 171 | 172 | This can be controlled at boot and runtime with the {:zen_monitor, :hibernation_threshold} 173 | setting, see ZenMonitor.Local.hibernation_threshold/1 for runtime convenience functionality. 174 | """ 175 | @spec hibernation_threshold() :: integer 176 | def hibernation_threshold do 177 | Application.get_env(:zen_monitor, :hibernation_threshold, @hibernation_threshold) 178 | end 179 | 180 | @doc """ 181 | Puts the hibernation threshold into the Application Environment 182 | 183 | This is a simple convenience function for overwriting the 184 | {:zen_monitor, :hibernation_threshold} setting at runtime. 185 | """ 186 | @spec hibernation_threshold(value :: integer) :: :ok 187 | def hibernation_threshold(value) do 188 | Application.put_env(:zen_monitor, :hibernation_threshold, value) 189 | end 190 | 191 | ## Server 192 | 193 | def init(_opts) do 194 | Process.flag(:message_queue_data, :off_heap) 195 | 196 | subscribers = 197 | :ets.new(@subscribers_table, [:protected, :named_table, :set, read_concurrency: true]) 198 | 199 | {:producer, %State{subscribers: subscribers}} 200 | end 201 | 202 | @doc """ 203 | Handles demand from `ZenMonitor.Local.Dispatcher` 204 | 205 | ZenMonitor.Local maintains a queue of pending messages to be sent to local processes, the actual 206 | dispatch of which are throttled by ZenMonitor.Local.Dispatcher. When 207 | ZenMonitor.Local.Dispatcher requests more messages to dispatch, this handler will collect up to 208 | the requested amount from the batch queue to satisfy the demand. 209 | """ 210 | def handle_demand(demand, %State{length: length} = state) do 211 | if length <= demand do 212 | empty_queue(state) 213 | else 214 | chunk_queue(demand, state) 215 | end 216 | end 217 | 218 | # Handle a local subscriber going down 219 | # When a process establishes a remote monitor, ZenMonitor.Local establishes a reciprocal monitor, 220 | # see monitor/1 and handle_cast({:monitor_subscriber, ...}) for more information. 221 | # If the subscriber crashes, all of the ETS records maintained by ZenMonitor.Local and the various 222 | # ZenMonitor.Local.Connectors is no longer needed and will be cleaned up by this handler. 
223 | def handle_info( 224 | {:DOWN, _ref, :process, subscriber, _reason}, 225 | %State{subscribers: subscribers} = state 226 | ) do 227 | for [ref, remote_pid] <- :ets.match(Tables.references(), {{subscriber, :"$1"}, :"$2"}) do 228 | # Remove the reference 229 | :ets.delete(Tables.references(), {subscriber, ref}) 230 | 231 | # Instruct the Connector to demonitor 232 | Connector.demonitor(remote_pid, ref) 233 | end 234 | 235 | # Remove the subscriber from the subscribers table 236 | :ets.delete(subscribers, subscriber) 237 | 238 | {:noreply, [], state} 239 | end 240 | 241 | # Handles recipricol subscriber monitoring 242 | # When a process establishes a remote monitor, ZenMonitor.Local will establish a reciprocal 243 | # monitor on the subscriber. This is done so that appropriate cleanup can happen if the 244 | # subscriber goes down. 245 | # This handler guarantees that a local subscriber will only ever have one active reciprocal 246 | # monitor at a time by tracking the subscribers in an ETS table. 247 | def handle_cast({:monitor_subscriber, subscriber}, %State{subscribers: subscribers} = state) do 248 | if :ets.insert_new(subscribers, {subscriber}) do 249 | Process.monitor(subscriber) 250 | end 251 | 252 | {:noreply, [], state} 253 | end 254 | 255 | # Handles enqueuing messages for eventual dispatch 256 | # ZenMonitor.Local.Connector is responsible for generating down dispatches and enqueuing them with 257 | # ZenMonitor.Local. ZenMonitor.Local takes these messages and places them into the 258 | # batch queue to be delivered to ZenMonitor.Local.Dispatcher as demanded. 259 | def handle_cast({:enqueue, messages}, %State{batch: batch, length: length} = state) do 260 | {batch, new_length} = 261 | messages 262 | |> Enum.reduce({batch, length}, fn item, {acc, len} -> 263 | {:queue.in(item, acc), len + 1} 264 | end) 265 | 266 | increment("enqueue", new_length - length) 267 | 268 | {:noreply, [], %State{state | batch: batch, length: new_length}} 269 | end 270 | 271 | # Handles batch length checks 272 | # Returns the current length of the batch 273 | def handle_call(:batch_length, _from, %State{length: length} = state) do 274 | {:reply, length, [], state} 275 | end 276 | 277 | ## Private 278 | 279 | @spec empty_queue(state :: State.t()) :: 280 | {:noreply, [down_dispatch], State.t()} 281 | | {:noreply, [down_dispatch], State.t(), :hibernate} 282 | defp empty_queue(%State{queue_emptied: queue_emptied, batch: batch} = state) do 283 | new_queue_emptied = queue_emptied + 1 284 | response = :queue.to_list(batch) 285 | 286 | if new_queue_emptied >= hibernation_threshold() do 287 | {:noreply, response, %State{state | batch: :queue.new(), length: 0, queue_emptied: 0}, 288 | :hibernate} 289 | else 290 | {:noreply, response, 291 | %State{state | batch: :queue.new(), length: 0, queue_emptied: new_queue_emptied}} 292 | end 293 | end 294 | 295 | @spec chunk_queue(size :: integer(), state :: State.t()) :: 296 | {:noreply, [down_dispatch], State.t()} 297 | defp chunk_queue(size, %State{batch: batch, length: length} = state) do 298 | {messages, new_batch} = :queue.split(size, batch) 299 | {:noreply, :queue.to_list(messages), %State{state | batch: new_batch, length: length - size}} 300 | end 301 | end 302 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/connector.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Connector do 2 | @moduledoc """ 3 | `ZenMonitor.Local.Connector` performs a 
variety of duties. For every remote node that the local node
4 | is interested in monitoring processes on, there will be a dedicated `ZenMonitor.Local.Connector`.
5 | This collection of Connectors is managed by a `GenRegistry` registered under the
6 | `ZenMonitor.Local.Connector` atom.
7 | 
8 | # Connecting and Monitoring the remote `ZenMonitor.Proxy`
9 | 
10 | Connectors, as their name suggests, connect to the `ZenMonitor.Proxy` on the remote node that they
11 | are responsible for. They do this using standard ERTS Distribution, by invoking the remote
12 | Proxy's ping command. A remote is considered compatible if the ping command returns the `:pong`
13 | atom; otherwise it will be marked incompatible.
14 | 
15 | Connectors manage their remote node's status in the global node status cache, and provide
16 | facilities for efficient querying of remote status; see `compatibility/1` and
17 | `cached_compatibility/1`.
18 | 
19 | # Batching and Updating the remote `ZenMonitor.Proxy`
20 | 
21 | When a local process wishes to monitor a remote process, the Connector will be informed of this
22 | fact with a call to `monitor/3`. The Connector is responsible for maintaining a local record of
23 | this monitor for future fan-out and for efficiently batching up these requests to be delivered
24 | to the remote ZenMonitor.Proxy.
25 | 
26 | # Fan-out of Dead Summaries
27 | 
28 | Periodically, the `ZenMonitor.Proxy` (technically the `ZenMonitor.Proxy.Batcher`) on the remote
29 | node will send a "Dead Summary". This is a message from the remote that informs the Connector
30 | of all the processes the Connector has monitored that have gone down since the last summary.
31 | 
32 | The Connector uses its local records to generate a batch of _down dispatches_. These are
33 | messages that look identical to the messages provided by `Process.monitor/1` when a process goes
34 | down. It is sometimes necessary for the original monitoring process to be able to discern
35 | whether the `:DOWN` message originated from ERTS or from ZenMonitor; to aid this, ZenMonitor
36 | will wrap the original reason in a tuple of `{:zen_monitor, original_reason}`.
37 | 
38 | The fan-out messages are sent to `ZenMonitor.Local` for eventual delivery via
39 | `ZenMonitor.Local.Dispatcher`; see those modules for more information.
40 | 
41 | # Fan-out of nodedown / ZenMonitor.Proxy down
42 | 
43 | The Connector is also responsible for monitoring the remote node and dealing with nodedown (or
44 | the node becoming incompatible, either due to the `ZenMonitor.Proxy` crashing or a code change).
45 | 
46 | If the Connector detects that the remote it is responsible for is down or no longer compatible,
47 | it will fire every established monitor with `{:zen_monitor, :nodedown}`. It uses the same
48 | mechanism as for Dead Summaries; see `ZenMonitor.Local` and `ZenMonitor.Local.Dispatcher` for
49 | more information.
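For illustration, a subscriber whose monitor was established through `monitor/3` ultimately
receives a message of the shape below, where `reason` is the original exit reason or `:nodedown`
when the remote is lost (the reference and pid are placeholders):

    {:DOWN, ref, :process, remote_pid, {:zen_monitor, reason}}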
50 | """ 51 | use GenServer 52 | use Instruments.CustomFunctions, prefix: "zen_monitor.local.connector" 53 | 54 | alias ZenMonitor.Local 55 | alias ZenMonitor.Local.Tables 56 | 57 | @base_penalty 1_000 58 | @maximum_penalty 60_000 59 | @max_attempt :math.ceil(:math.log2(@maximum_penalty)) 60 | @chunk_size 5000 61 | @sweep_interval 100 62 | 63 | @type t :: __MODULE__ 64 | @type compatibility :: :compatible | :incompatible 65 | @type cached_compatibility :: compatibility | :miss | {:expired, integer} | :unavailable 66 | @type death_certificate :: {pid, reason :: any} 67 | @type down_dispatch :: {pid, {:DOWN, reference, :process, pid, {:zen_monitor, any}}} 68 | 69 | defmodule State do 70 | @moduledoc """ 71 | Maintains the internal state for the Connector 72 | 73 | - `monitors` is an ETS table for keeping track of monitors for the purpose of fan-out. 74 | - `remote_node_monitored` is a flag used to track whether or not the remote node has been 75 | monitored 76 | - `remote_proxy_ref` is the monitoring reference of the remote node's ZenMonitor.Proxy 77 | - `remote` is the remote node for which the Connector is responsible. 78 | - `batch` is the queue of instructions pending until the next sweep. 79 | - `length` is the current length of the batch queue (calculating queue length is an O(n) 80 | operation, it is simple to track it as elements are added / removed) 81 | """ 82 | @type t :: %__MODULE__{ 83 | monitors: :ets.tab(), 84 | remote_node_monitored: boolean(), 85 | remote_proxy_ref: reference() | nil, 86 | remote: node(), 87 | length: integer(), 88 | batch: :queue.queue() 89 | } 90 | 91 | defstruct [ 92 | :monitors, 93 | :remote, 94 | :remote_proxy_ref, 95 | remote_node_monitored: false, 96 | length: 0, 97 | batch: :queue.new() 98 | ] 99 | end 100 | 101 | ## Client 102 | 103 | def start_link(remote) do 104 | GenServer.start_link(__MODULE__, remote) 105 | end 106 | 107 | @doc """ 108 | Get a connector from the registry by destination 109 | """ 110 | @spec get(target :: ZenMonitor.destination()) :: pid() 111 | def get(target) do 112 | target 113 | |> ZenMonitor.find_node() 114 | |> get_for_node() 115 | end 116 | 117 | @doc """ 118 | Get a connector from the registry by remote node 119 | """ 120 | @spec get_for_node(remote :: node()) :: pid() 121 | def get_for_node(remote) when is_atom(remote) do 122 | case GenRegistry.lookup(__MODULE__, remote) do 123 | {:ok, connector} -> 124 | connector 125 | 126 | {:error, :not_found} -> 127 | {:ok, connector} = GenRegistry.lookup_or_start(__MODULE__, remote, [remote]) 128 | connector 129 | end 130 | end 131 | 132 | @doc """ 133 | Asynchronously monitors a pid. 134 | """ 135 | @spec monitor(target :: ZenMonitor.destination(), ref :: reference(), subscriber :: pid()) :: 136 | :ok 137 | def monitor(target, ref, subscriber) do 138 | target 139 | |> get() 140 | |> GenServer.cast({:monitor, target, ref, subscriber}) 141 | end 142 | 143 | @doc """ 144 | Retrieves all the monitors established between the target and the subscriber 145 | """ 146 | @spec monitors(target :: ZenMonitor.destination(), subscriber :: pid()) :: [reference()] 147 | def monitors(target, subscriber) do 148 | target 149 | |> get() 150 | |> GenServer.call({:monitors, target, subscriber}) 151 | end 152 | 153 | @doc """ 154 | Asynchronously demonitors a pid. 
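For illustration (assuming `remote_pid` and `ref` were previously passed to `monitor/3`):

    ZenMonitor.Local.Connector.demonitor(remote_pid, ref)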
155 | """ 156 | @spec demonitor(target :: ZenMonitor.destination(), ref :: reference()) :: :ok 157 | def demonitor(target, ref) do 158 | target 159 | |> get() 160 | |> GenServer.cast({:demonitor, target, ref}) 161 | end 162 | 163 | @doc """ 164 | Determine the effective compatibility of a remote node 165 | 166 | This will attempt a fast client-side lookup in the ETS table. Only a positive `:compatible` 167 | record will result in `:compatible`, otherwise the effective compatibility is `:incompatible` 168 | """ 169 | @spec compatibility(remote :: node()) :: compatibility 170 | def compatibility(remote) do 171 | case cached_compatibility(remote) do 172 | :compatible -> 173 | :compatible 174 | 175 | _ -> 176 | :incompatible 177 | end 178 | end 179 | 180 | @doc """ 181 | Check the cached compatibility status for a remote node 182 | 183 | This will only perform a fast client-side lookup in the ETS table. If an authoritative entry is 184 | found it will be returned (either `:compatible`, `:incompatible`, or `:unavailable`). If no 185 | entry is found then `:miss` is returned. If an expired entry is found then 186 | `{:expired, attempts}` is returned. 187 | """ 188 | @spec cached_compatibility(remote :: node()) :: cached_compatibility 189 | def cached_compatibility(remote) do 190 | case :ets.lookup(Tables.nodes(), remote) do 191 | [] -> 192 | :miss 193 | 194 | [{^remote, :compatible}] -> 195 | :compatible 196 | 197 | [{^remote, {:incompatible, enforce_until, attempt}}] -> 198 | if enforce_until < ZenMonitor.now() do 199 | {:expired, attempt} 200 | else 201 | :incompatible 202 | end 203 | 204 | [{^remote, :unavailable}] -> 205 | :unavailable 206 | end 207 | end 208 | 209 | @doc """ 210 | Connect to the provided remote 211 | 212 | This function will not consult the cache before calling into the GenServer, the GenServer will 213 | consult with the cache before attempting to connect, this allows for many callers to connect 214 | with the server guaranteeing that only one attempt will actually perform network work. 215 | 216 | If the compatibility of a remote host is needed instead, callers should use the 217 | `compatibility/1` or `cached_compatibility/1` functions. `compatibility/1` will provide the 218 | effective compatibility, `cached_compatibility/1` is mainly used internally but can provide more 219 | detailed information about the cache status of the remote. Neither of these methods, 220 | `compatibility/1` nor `cached_compatibility/1`, will perform network work or call into the 221 | GenServer. 222 | """ 223 | @spec connect(remote :: node()) :: compatibility 224 | def connect(remote) do 225 | remote 226 | |> get_for_node() 227 | |> GenServer.call(:connect) 228 | end 229 | 230 | @doc """ 231 | Gets the sweep interval from the Application Environment 232 | 233 | The sweep interval is the number of milliseconds to wait between sweeps, see 234 | ZenMonitor.Local.Connector's @sweep_interval for the default value 235 | 236 | This can be controlled at boot and runtime with the {:zen_monitor, :connector_sweep_interval} 237 | setting, see `ZenMonitor.Local.Connector.sweep_interval/1` for runtime convenience 238 | functionality. 
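For example, the interval can be inspected and changed at runtime:

    ZenMonitor.Local.Connector.sweep_interval()
    #=> 100 (with the default configuration)

    ZenMonitor.Local.Connector.sweep_interval(50)
    #=> :ok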
239 | """ 240 | @spec sweep_interval() :: integer 241 | def sweep_interval do 242 | Application.get_env(:zen_monitor, :connector_sweep_interval, @sweep_interval) 243 | end 244 | 245 | @doc """ 246 | Puts the sweep interval into the Application Environment 247 | 248 | This is a simple convenience function for overwriting the 249 | {:zen_monitor, :connector_sweep_interval} setting at runtime. 250 | """ 251 | @spec sweep_interval(value :: integer) :: :ok 252 | def sweep_interval(value) do 253 | Application.put_env(:zen_monitor, :connector_sweep_interval, value) 254 | end 255 | 256 | @doc """ 257 | Gets the chunk size from the Application Environment 258 | 259 | The chunk size is the maximum number of subscriptions that will be sent during each sweep, see 260 | ZenMonitor.Local.Connector's @chunk_size for the default value 261 | 262 | This can be controlled at boot and runtime with the {:zen_monitor, :connector_chunk_size} 263 | setting, see `ZenMonitor.Local.Connector.chunk_size/1` for runtime convenience functionality. 264 | """ 265 | @spec chunk_size() :: integer 266 | def chunk_size do 267 | Application.get_env(:zen_monitor, :connector_chunk_size, @chunk_size) 268 | end 269 | 270 | @doc """ 271 | Puts the chunk size into the Application Environment 272 | 273 | This is a simple convenience function for overwriting the {:zen_monitor, :connector_chunk_size} 274 | setting at runtime. 275 | """ 276 | @spec chunk_size(value :: integer) :: :ok 277 | def chunk_size(value) do 278 | Application.put_env(:zen_monitor, :connector_chunk_size, value) 279 | end 280 | 281 | ## Server 282 | 283 | def init(remote) do 284 | schedule_sweep() 285 | monitors = :ets.new(:monitors, [:private, :ordered_set]) 286 | {:ok, %State{remote: remote, monitors: monitors}} 287 | end 288 | 289 | # Synchronous connect handler 290 | # Attempts to connect to the remote, this handler does check the cache before connecting to avoid 291 | # a thundering herd. 292 | def handle_call(:connect, _from, %State{} = state) do 293 | {result, state} = do_compatibility(state) 294 | {:reply, result, state} 295 | end 296 | 297 | # Returns all the monitors between a target and a subscriber 298 | def handle_call({:monitors, target, subscriber}, _from, %State{} = state) do 299 | size = :ets.info(state.monitors, :size) 300 | 301 | monitors = 302 | if size == 0 do 303 | # Don't bother doing the match on an empty table 304 | [] 305 | else 306 | case :ets.match(state.monitors, {{target, :"$1"}, subscriber}, size) do 307 | :"$end_of_table" -> 308 | # Match failed 309 | [] 310 | 311 | {monitors, _} -> 312 | # Unwrap the references 313 | List.flatten(monitors) 314 | end 315 | end 316 | 317 | {:reply, monitors, state} 318 | end 319 | 320 | # Handles establishing a new monitor 321 | # 1. Records the monitor into the internal ETS table 322 | # 2. If this is the first monitor for the pid, adds it to the queue for subsequent dispatch to 323 | # the ZenMonitor.Proxy during the next sweep. 324 | def handle_cast( 325 | {:monitor, target, ref, subscriber}, 326 | %State{batch: batch, length: length, monitors: monitors} = state 327 | ) do 328 | # Check if we should subscribe to this target (this check has to happen before we insert the 329 | # new monitor otherwise the new monitor will always be found and we will never enqueue 330 | # anything) 331 | should_subscribe? 
= unknown_target?(monitors, target) 332 | 333 | # Always add it to the monitor table 334 | :ets.insert(monitors, {{target, ref}, subscriber}) 335 | 336 | # Enqueue the subscribe instruction if it isn't already monitored 337 | new_state = 338 | if should_subscribe? do 339 | increment("enqueue", 1, tags: ["op:subscribe"]) 340 | %State{state | batch: :queue.in({:subscribe, target}, batch), length: length + 1} 341 | else 342 | state 343 | end 344 | 345 | {:noreply, new_state} 346 | end 347 | 348 | # Handles demonitoring a reference for a given pid 349 | # Cleans up the internal ETS record if it exists 350 | def handle_cast( 351 | {:demonitor, target, ref}, 352 | %State{batch: batch, length: length, monitors: monitors} = state 353 | ) do 354 | # Remove it from the monitors table 355 | :ets.delete(monitors, {target, ref}) 356 | 357 | # If that was the last monitor for the target, we should unsubscribe. Unlike monitor we have 358 | # to perform this check after the delete or else the row we are deleting will always make the 359 | # target known. 360 | should_unsubscribe? = unknown_target?(monitors, target) 361 | 362 | # Enqueue the unsubscribe instruction if the target no longer exists 363 | state = 364 | if should_unsubscribe? do 365 | increment("enqueue", 1, tags: ["op:unsubscribe"]) 366 | %State{state | batch: :queue.in({:unsubscribe, target}, batch), length: length + 1} 367 | else 368 | state 369 | end 370 | 371 | {:noreply, state} 372 | end 373 | 374 | # Handles nodedown for the Connector's remote 375 | # When the remote node goes down, every monitor maintained by the Connector should fire 376 | def handle_info({:nodedown, remote}, %State{remote: remote} = state) do 377 | # Mark this node as unavailable 378 | {:incompatible, state} = do_mark_unavailable(state) 379 | 380 | # Mark the remote node as unmonitored (any monitors that existed were just consumed) 381 | state = %State{state | remote_node_monitored: false} 382 | 383 | # Dispatch down to everyone 384 | {:noreply, do_down(state)} 385 | end 386 | 387 | # Handles when the proxy crashes because of noconnection 388 | # This reason indicates that we have lost connection with the remote node, mark it as unavailable. 389 | def handle_info({:DOWN, ref, :process, _, :noconnection}, %State{remote_proxy_ref: ref} = state) do 390 | # Mark this node as unavailable 391 | {:incompatible, state} = do_mark_unavailable(state) 392 | 393 | # Clear the remote_proxy_ref 394 | state = %State{state | remote_proxy_ref: nil} 395 | 396 | # Dispatch down to everyone 397 | {:noreply, do_down(state)} 398 | end 399 | 400 | # Handles when the proxy crashes for any other reason 401 | # Penalize the remote as incompatible and let the normal remote recovery take care of it. 402 | def handle_info({:DOWN, ref, :process, _, _}, %State{remote_proxy_ref: ref} = state) do 403 | # Mark this node as incompatible 404 | {:incompatible, state} = do_mark_incompatible(state, 1) 405 | 406 | # Clear the remote_proxy_ref 407 | state = %State{state | remote_proxy_ref: nil} 408 | 409 | # Dispatch down to everyone 410 | {:noreply, do_down(state)} 411 | end 412 | 413 | # Handle the dead summary from the remote 414 | # Periodically the remote node will send us a summary of everything that has died that we have 415 | # monitored. 
416 | # Connector will find and consume all the matching monitors and enqueue the appropriate messages 417 | # for each monitor with ZenMonitor.Local 418 | def handle_info( 419 | {:dead, remote, death_certificates}, 420 | %State{remote: remote, monitors: monitors} = state 421 | ) do 422 | death_certificates 423 | |> messages_for_death_certificates(monitors) 424 | |> Local.enqueue() 425 | 426 | {:noreply, state} 427 | end 428 | 429 | # Handle the periodic sweep 430 | # If the remote is compatible this will create a subscription summary up to chunk_size of all the 431 | # pids that need monitoring since the last sweep. This will be sent to the remote for monitoring. 432 | # If the remote is incompatible, all pids since the last sweep will have their monitors fire with 433 | # `{:zen_monitor, :nodedown}` 434 | def handle_info(:sweep, %State{} = state) do 435 | new_state = 436 | case do_compatibility(state) do 437 | {:compatible, state} -> 438 | do_sweep(state) 439 | 440 | {:incompatible, state} -> 441 | do_down(state) 442 | end 443 | 444 | schedule_sweep() 445 | {:noreply, new_state} 446 | end 447 | 448 | def handle_info(_, %State{} = state) do 449 | increment("unhandled_info") 450 | {:noreply, state} 451 | end 452 | 453 | ## Private 454 | 455 | @spec do_compatibility(state :: State.t()) :: {compatibility, State.t()} 456 | defp do_compatibility(%State{remote: remote} = state) do 457 | case cached_compatibility(remote) do 458 | :miss -> 459 | do_connect(state, 1) 460 | 461 | {:expired, attempt} -> 462 | do_connect(state, attempt + 1) 463 | 464 | :unavailable -> 465 | do_connect(state, 1) 466 | 467 | hit -> 468 | {hit, state} 469 | end 470 | end 471 | 472 | @spec do_connect(State.t(), attempt :: integer) :: {compatibility, State.t()} 473 | defp do_connect(%State{remote: remote} = state, attempt) do 474 | try do 475 | with {:known_node, true} <- {:known_node, known_node?(remote)}, 476 | {:ping, :pong} <- 477 | {:ping, ZenMonitor.gen_module().call({ZenMonitor.Proxy, remote}, :ping)} do 478 | do_mark_compatible(state) 479 | else 480 | {:known_node, false} -> 481 | do_mark_unavailable(state) 482 | 483 | {:ping, _} -> 484 | do_mark_incompatible(state, attempt) 485 | end 486 | catch 487 | :exit, {{:nodedown, _node}, _} -> 488 | do_mark_unavailable(state) 489 | 490 | :exit, _ -> 491 | do_mark_incompatible(state, attempt) 492 | end 493 | end 494 | 495 | @spec do_sweep(state :: State.t()) :: State.t() 496 | defp do_sweep(%State{batch: batch, length: length} = state) do 497 | {summary, overflow, new_length} = chunk(batch, length) 498 | increment("sweep", length - new_length) 499 | do_subscribe(state, summary) 500 | %State{state | batch: overflow, length: new_length} 501 | end 502 | 503 | @spec chunk(batch :: :queue.queue(), length :: integer) :: {[pid], :queue.queue(), integer} 504 | defp chunk(batch, length) do 505 | size = chunk_size() 506 | 507 | if length <= size do 508 | {:queue.to_list(batch), :queue.new(), 0} 509 | else 510 | {summary, overflow} = :queue.split(size, batch) 511 | {:queue.to_list(summary), overflow, length - size} 512 | end 513 | end 514 | 515 | @spec do_subscribe(state :: State.t(), summary :: []) :: :ok 516 | defp do_subscribe(%State{}, []), do: :ok 517 | 518 | defp do_subscribe(%State{remote: remote}, summary) do 519 | ZenMonitor.gen_module().cast({ZenMonitor.Proxy, remote}, {:process, self(), summary}) 520 | end 521 | 522 | @spec do_down(state :: State.t()) :: State.t() 523 | defp do_down(%State{monitors: monitors} = state) do 524 | # Generate messages for every monitor 525 | 
messages = 526 | for [{{pid, ref}, subscriber}] <- :ets.match(monitors, :"$1") do 527 | {subscriber, {:DOWN, ref, :process, pid, {:zen_monitor, :nodedown}}} 528 | end 529 | 530 | # Clear the monitors table 531 | :ets.delete_all_objects(monitors) 532 | 533 | # Enqueue the messages with ZenMonitor.Local 534 | Local.enqueue(messages) 535 | 536 | # Return a new empty state 537 | %State{state | batch: :queue.new(), length: 0} 538 | end 539 | 540 | @spec do_mark_compatible(State.t()) :: {:compatible, State.t()} 541 | defp do_mark_compatible(%State{remote: remote} = state) do 542 | state = 543 | state 544 | |> monitor_remote_node() 545 | |> monitor_remote_proxy() 546 | 547 | :ets.insert(Tables.nodes(), {remote, :compatible}) 548 | {:compatible, state} 549 | end 550 | 551 | @spec do_mark_incompatible(State.t(), attempt :: integer) :: {:incompatible, State.t()} 552 | defp do_mark_incompatible(%State{remote: remote} = state, attempt) do 553 | state = monitor_remote_node(state) 554 | 555 | :ets.insert( 556 | Tables.nodes(), 557 | {remote, {:incompatible, ZenMonitor.now() + penalty(attempt), attempt}} 558 | ) 559 | 560 | {:incompatible, state} 561 | end 562 | 563 | @spec do_mark_unavailable(State.t()) :: {:incompatible, State.t()} 564 | defp do_mark_unavailable(%State{remote: remote} = state) do 565 | :ets.insert(Tables.nodes(), {remote, :unavailable}) 566 | {:incompatible, state} 567 | end 568 | 569 | @spec monitor_remote_node(State.t()) :: State.t() 570 | defp monitor_remote_node(%State{remote_node_monitored: true} = state), do: state 571 | 572 | defp monitor_remote_node(%State{remote_node_monitored: false, remote: remote} = state) do 573 | Node.monitor(remote, true) 574 | %State{state | remote_node_monitored: true} 575 | end 576 | 577 | @spec monitor_remote_proxy(State.t()) :: State.t() 578 | defp monitor_remote_proxy(%State{remote_proxy_ref: nil, remote: remote} = state) do 579 | %State{state | remote_proxy_ref: Process.monitor({ZenMonitor.Proxy, remote})} 580 | end 581 | 582 | defp monitor_remote_proxy(%State{} = state), do: state 583 | 584 | @spec messages_for_death_certificates( 585 | death_certificates :: [death_certificate], 586 | monitors :: :ets.tab() 587 | ) :: [down_dispatch] 588 | defp messages_for_death_certificates(death_certificates, monitors) do 589 | do_messages_for_death_certificates(death_certificates, monitors, []) 590 | end 591 | 592 | @spec do_messages_for_death_certificates( 593 | death_certificates :: [death_certificate], 594 | monitors :: :ets.tab(), 595 | acc :: [down_dispatch] 596 | ) :: [down_dispatch] 597 | defp do_messages_for_death_certificates([], _monitors, acc), do: Enum.reverse(acc) 598 | 599 | defp do_messages_for_death_certificates([{pid, reason} | rest], monitors, acc) do 600 | acc = 601 | monitors 602 | |> :ets.match({{pid, :"$1"}, :"$2"}) 603 | |> Enum.reduce(acc, fn [ref, subscriber], acc -> 604 | # Consume the monitor 605 | :ets.delete(monitors, {pid, ref}) 606 | 607 | # Add the new message into the accumulator 608 | [{subscriber, {:DOWN, ref, :process, pid, {:zen_monitor, reason}}} | acc] 609 | end) 610 | 611 | do_messages_for_death_certificates(rest, monitors, acc) 612 | end 613 | 614 | @spec known_node?(remote :: node()) :: boolean() 615 | defp known_node?(remote) do 616 | remote == Node.self() or remote in Node.list() 617 | end 618 | 619 | @spec penalty(attempt :: integer) :: integer 620 | defp penalty(attempt) do 621 | min(@maximum_penalty, @base_penalty * round(:math.pow(2, min(attempt, @max_attempt)))) 622 | end 623 | 624 | @spec 
unknown_target?(monitors :: :ets.tid(), target :: pid) :: boolean 625 | defp unknown_target?(monitors, target) do 626 | # ETS does not make for the most readable code, here's what the following line does. 627 | # Perform a match on the internal monitors table looking for keys that start with 628 | # {target, ...} 629 | # Since we are just interested to see if there are any, but don't care about the content, we 630 | # set the other fields to :_ to ignore them. 631 | # The target is known if there are _any_ results, so we apply a limit to the match of just 1 632 | # result. 633 | # This means that we either get back a tuple of {[[]]], continuation} or :"$end_of_table" 634 | # :"$end_of_table" implies that the match for a single item found nothing, therefore the 635 | # target does not exist and is unknown 636 | :ets.match(monitors, {{target, :_}, :_}, 1) == :"$end_of_table" 637 | end 638 | 639 | @spec schedule_sweep() :: reference 640 | defp schedule_sweep do 641 | Process.send_after(self(), :sweep, sweep_interval()) 642 | end 643 | end 644 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/dispatcher.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Dispatcher do 2 | @moduledoc """ 3 | `ZenMonitor.Local.Dispatcher` is a GenStage Consumer responsible for throttled delivery of down 4 | messages. 5 | 6 | `ZenMonitor.Local` acts as a GenStage Producer, it stores all of the down messages that need to 7 | be dispatched based off of what has been enqueued by the `ZenMonitor.Local.Connector`. 8 | 9 | The Dispatcher will deliver these messages throttled by a maximum rate which is controlled by 10 | the {:zen_monitor, :demand_interval} and {:zen_monitor, :demand_amount} settings. 11 | 12 | To calculate the maximum number of messages processed per second you can use the following 13 | formula: 14 | 15 | maximum_mps = (demand_amount) * (1000 / demand_interval) 16 | 17 | For example, if the demand_amount is 1000, and demand_interval is 100 (milliseconds) the maximum 18 | messages per second are: 19 | 20 | maximum_mps = (1000) * (1000 / 100) 21 | -> (1000) * 10 22 | -> 10_000 23 | 24 | For convenience a `ZenMonitor.Local.Dispatcher.maximum_mps/0` is provided that will perform this 25 | calculation. 26 | """ 27 | use GenStage 28 | use Instruments.CustomFunctions, prefix: "zen_monitor.local.dispatcher" 29 | 30 | alias ZenMonitor.Local.Tables 31 | 32 | @demand_interval 100 33 | @demand_amount 1000 34 | 35 | ## Client 36 | 37 | def start_link(_opts \\ []) do 38 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 39 | end 40 | 41 | @doc """ 42 | Gets the demand interval from the Application Environment 43 | 44 | The demand interval is the number of milliseconds to wait between demanding more events from the 45 | GenStage Producer (`ZenMonitor.Local`) 46 | 47 | This can be controlled at boot and runtime with the {:zen_monitor, :demand_interval} setting, 48 | see `ZenMonitor.Local.Dispatcher.demand_interval/1` for runtime convenience functionality. 
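For illustration only, a sketch of reading and then lowering the interval at runtime (the values shown are arbitrary examples, assuming the default configuration):

    # Read the current demand interval (100 ms by default)
    ZenMonitor.Local.Dispatcher.demand_interval()

    # Lower it so the Dispatcher demands events more frequently
    ZenMonitor.Local.Dispatcher.demand_interval(50)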
49 | """ 50 | @spec demand_interval() :: integer 51 | def demand_interval do 52 | Application.get_env(:zen_monitor, :demand_interval, @demand_interval) 53 | end 54 | 55 | @doc """ 56 | Puts the demand interval into the Application Environment 57 | 58 | This is a simple convenience function for overwrite the {:zen_monitor, :demand_interval} setting 59 | at runtime 60 | """ 61 | @spec demand_interval(value :: integer) :: :ok 62 | def demand_interval(value) do 63 | Application.put_env(:zen_monitor, :demand_interval, value) 64 | end 65 | 66 | @doc """ 67 | Gets the demand amount from the Application Environment 68 | 69 | The demand amount is the number of events tor request from the GenStage Producer 70 | (`ZenMonitor.Local`) every demand interval 71 | 72 | This can be controlled at boot and runtime with the {:zen_monitor, :demand_amount} setting, see 73 | `ZenMonitor.Local.Dispatcher.demand_amount/1` for runtime convenience functionality. 74 | """ 75 | @spec demand_amount() :: integer 76 | def demand_amount do 77 | Application.get_env(:zen_monitor, :demand_amount, @demand_amount) 78 | end 79 | 80 | @doc """ 81 | Puts the demand amount into the Application Environment 82 | 83 | This is a simple convenience function for overwriting the {:zen_monitor, :demand_amount} setting 84 | at runtime. 85 | """ 86 | @spec demand_amount(value :: integer) :: :ok 87 | def demand_amount(value) do 88 | Application.put_env(:zen_monitor, :demand_amount, value) 89 | end 90 | 91 | @doc """ 92 | Calculate the current maximum messages per second 93 | 94 | This is a convenience function to help operators understand the current throughput of the 95 | Dispatcher. 96 | """ 97 | @spec maximum_mps() :: float 98 | def maximum_mps do 99 | demand_amount() * (1000 / demand_interval()) 100 | end 101 | 102 | ## Server 103 | 104 | def init(_opts) do 105 | Process.flag(:message_queue_data, :off_heap) 106 | {:consumer, nil, subscribe_to: [{ZenMonitor.Local, min_demand: 1}]} 107 | end 108 | 109 | @doc """ 110 | Handles the events for dispatch 111 | 112 | Dispatch is a simple two step procedure followed for each message to be dispatched. 113 | 114 | 1. Check if the message is still valid. Messages can become invalid if the monitor was 115 | demonitored after the message was enqueued. 116 | 117 | 2a. If valid: forward the message to the subscriber 118 | 2b. If invalid: skip message 119 | 120 | Event dispatch will calculate an "unfulfilled" demand based off the number of messages skipped 121 | and demand that the producer provide additional events so that MPS is maintained and prevent the 122 | Dispatcher from being starved because of invalid messages. 123 | """ 124 | def handle_events(events, _from, producer) do 125 | delivered = length(events) 126 | increment("events.delivered", delivered) 127 | 128 | messages = 129 | for {subscriber, {:DOWN, ref, :process, _, _} = message} <- events, 130 | still_monitored?(subscriber, ref) do 131 | send(subscriber, message) 132 | end 133 | 134 | # Ensure that filtering does not starve out the Dispatcher 135 | 136 | # Calculate the effective demand by taking the smaller of the current demand_amount and the 137 | # length of events delivered. 
138 | effective_demand = min(delivered, demand_amount()) 139 | processed = length(messages) 140 | increment("events.processed", processed) 141 | 142 | # The unfulfilled demand is the difference between the effective demand and the actual events 143 | unfulfilled = effective_demand - processed 144 | 145 | # Ask the producer to fulfill the unfulfilled demand (if this number is 0 or negative, the 146 | # ask helper will handle that for us and not ask for anything) 147 | ask(producer, unfulfilled) 148 | 149 | {:noreply, [], producer} 150 | end 151 | 152 | @doc """ 153 | Handles the callback for the subscription being established with the producer. 154 | 155 | This is the start of the demand loop, once the producer confirms subscription, the initial call 156 | to schedule_demand/0 happens. 157 | """ 158 | def handle_subscribe(:producer, _, from, _state) do 159 | schedule_demand() 160 | {:manual, from} 161 | end 162 | 163 | @doc """ 164 | Handles the periodic generate_demand message 165 | 166 | Asks the producer for demand_amount of events then schedules the next demand generation. 167 | """ 168 | def handle_info(:generate_demand, producer) do 169 | ask(producer, demand_amount()) 170 | schedule_demand() 171 | 172 | {:noreply, [], producer} 173 | end 174 | 175 | ## Private 176 | 177 | @spec ask(producer :: pid, amount :: integer) :: :ok 178 | defp ask(_producer, amount) when amount <= 0, do: :ok 179 | 180 | defp ask(producer, amount) do 181 | GenStage.ask(producer, amount) 182 | end 183 | 184 | @spec still_monitored?(subscriber :: pid, ref :: reference) :: boolean 185 | defp still_monitored?(subscriber, ref) do 186 | :ets.take(Tables.references(), {subscriber, ref}) != [] 187 | end 188 | 189 | @spec schedule_demand() :: reference 190 | defp schedule_demand do 191 | Process.send_after(self(), :generate_demand, demand_interval()) 192 | end 193 | end 194 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Supervisor do 2 | @moduledoc """ 3 | Supervisor for the `ZenMonitor.Local` components. 4 | 5 | See `ZenMonitor.Local`, `ZenMonitor.Local.Tables`, `ZenMonitor.Local.Connector`, and 6 | `ZenMonitor.Local.Dispatcher` for more information about the supervised processes. 7 | 8 | There are many `ZenMonitor.Local.Connector` processes, which are managed by a `GenRegistry`. 9 | These are keyed by the remote node the Connector is responsible for. 10 | 11 | This supervisor uses the `:rest_for_one` strategy, so the order of the children is important and 12 | should not be altered. 13 | """ 14 | use Supervisor 15 | 16 | def start_link(_opts \\ []) do 17 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 18 | end 19 | 20 | def init(_opts) do 21 | children = [ 22 | ZenMonitor.Local.Tables, 23 | ZenMonitor.Local, 24 | GenRegistry.Spec.child_spec(ZenMonitor.Local.Connector), 25 | ZenMonitor.Local.Dispatcher 26 | ] 27 | 28 | Supervisor.init(children, strategy: :rest_for_one) 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/tables.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Tables do 2 | @moduledoc """ 3 | `ZenMonitor.Local.Tables` owns tables that are shared between multiple `ZenMonitor.Local` 4 | components. 5 | 6 | See `nodes/0` and `references/0` for more information. 
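As a rough sketch of how other components read these shared tables (illustrative only; the real match specs live in the callers):

    # Compatibility cache entry for a remote node, e.g. [{remote, :compatible}]
    :ets.lookup(ZenMonitor.Local.Tables.nodes(), remote)

    # Authoritative monitor entry keyed by {subscriber, reference}
    :ets.lookup(ZenMonitor.Local.Tables.references(), {subscriber, ref})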
7 | """ 8 | use GenServer 9 | 10 | @node_table Module.concat(__MODULE__, "Nodes") 11 | @reference_table Module.concat(__MODULE__, "References") 12 | 13 | ## Client 14 | 15 | def start_link(_opts \\ []) do 16 | GenServer.start_link(__MODULE__, [], name: __MODULE__) 17 | end 18 | 19 | @doc """ 20 | Nodes holds cached information about remote node compatibility 21 | 22 | This information is stored in one of the following structures: 23 | 24 | For compatible nodes 25 | { remote_node, :compatible } 26 | ^---key---^ ^--value--^ 27 | 28 | For incompatible nodes 29 | { remote_node, {:incompatible, enforce_until, attempts} } 30 | ^---key---^ ^---------------value-----------------^ 31 | 32 | `enforce_until` is the time (as reported by System.monotonic_time(:milliseconds)) after which 33 | this cache entry should no longer be enforced. 34 | 35 | `attempts` is the number of consecutive connect attempts that have failed, this value is useful 36 | for calculating geometric backoff values 37 | """ 38 | @spec nodes() :: :ets.tab() 39 | def nodes do 40 | @node_table 41 | end 42 | 43 | @doc """ 44 | References holds the set of authoritative monitor references 45 | 46 | These references are stored in this structure: 47 | 48 | { {subscriber_pid, monitor_reference}, {remote_node, remote_pid} } 49 | ^-------------key-----------------^ ^----------value--------^ 50 | 51 | There is a compound key of {subscriber_pid, monitor_reference} this allows for lookup of a given 52 | reference (if the subscriber is known, by convention it will be the calling process, self()) or 53 | the retrieval of all active monitors for a subscriber. 54 | """ 55 | @spec references() :: :ets.tab() 56 | def references do 57 | @reference_table 58 | end 59 | 60 | ## Server 61 | 62 | def init(_opts) do 63 | @node_table = :ets.new(@node_table, [:public, :named_table, :set, write_concurrency: true]) 64 | 65 | @reference_table = 66 | :ets.new(@reference_table, [:public, :named_table, :ordered_set, write_concurrency: true]) 67 | 68 | {:ok, nil} 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/zen_monitor/metrics.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Metrics do 2 | @moduledoc """ 3 | Metrics helper for monitoring the ZenMonitor system. 4 | """ 5 | alias Instruments.Probe 6 | 7 | @doc """ 8 | Registers various probes for the ZenMonitor System. 9 | 10 | - ERTS message_queue_len for the `ZenMonitor.Local` and `ZenMonitor.Proxy` processes. 
11 | - Internal Batch Queue length for `ZenMonitor.Local` (dispatches to be delivered) 12 | - ETS table size for References (number of monitors) 13 | - ETS table size for Subscribers (number of monitored local processes * interested remotes) 14 | 15 | """ 16 | @spec register() :: :ok 17 | def register do 18 | Probe.define!( 19 | "zen_monitor.local.message_queue_len", 20 | :gauge, 21 | mfa: {__MODULE__, :message_queue_len, [ZenMonitor.Local]} 22 | ) 23 | 24 | Probe.define!( 25 | "zen_monitor.proxy.message_queue_len", 26 | :gauge, 27 | mfa: {__MODULE__, :message_queue_len, [ZenMonitor.Proxy]} 28 | ) 29 | 30 | Probe.define!( 31 | "zen_monitor.local.batch_length", 32 | :gauge, 33 | mfa: {ZenMonitor.Local, :batch_length, []} 34 | ) 35 | 36 | Probe.define!( 37 | "zen_monitor.local.ets.references.size", 38 | :gauge, 39 | mfa: {__MODULE__, :table_size, [ZenMonitor.Local.Tables.references()]} 40 | ) 41 | 42 | Probe.define!( 43 | "zen_monitor.proxy.ets.subscribers.size", 44 | :gauge, 45 | mfa: {__MODULE__, :table_size, [ZenMonitor.Proxy.Tables.subscribers()]} 46 | ) 47 | 48 | :ok 49 | end 50 | 51 | @doc """ 52 | Given a pid or a registered name, this will return the message_queue_len as reported by 53 | `Process.info/2` 54 | """ 55 | @spec message_queue_len(target :: nil | pid() | atom()) :: nil | integer() 56 | def message_queue_len(nil), do: nil 57 | 58 | def message_queue_len(target) when is_pid(target) do 59 | case Process.info(target, :message_queue_len) do 60 | {:message_queue_len, len} -> len 61 | _ -> nil 62 | end 63 | end 64 | 65 | def message_queue_len(target) when is_atom(target) do 66 | target 67 | |> Process.whereis() 68 | |> message_queue_len() 69 | end 70 | 71 | @doc """ 72 | Given a table identifier, returns the size as reported by `:ets.info/2` 73 | """ 74 | @spec table_size(:ets.tid()) :: nil | integer() 75 | def table_size(tid) do 76 | case :ets.info(tid, :size) do 77 | :undefined -> nil 78 | size -> size 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy do 2 | @moduledoc """ 3 | ZenMonitor.Proxy monitors local processes and proxies their down messages to interested 4 | ZenMonitor.Locals on remote nodes for fanout. 5 | """ 6 | use GenServer 7 | 8 | alias ZenMonitor.Truncator 9 | alias ZenMonitor.Proxy.{Batcher, Tables} 10 | 11 | @typedoc """ 12 | Defines the valid operations that can be processed 13 | """ 14 | @type operation :: :subscribe | :unsubscribe 15 | 16 | @typedoc """ 17 | An instruction is a valid operation upon a given destination 18 | """ 19 | @type instruction :: {operation, ZenMonitor.destination()} 20 | 21 | @typedoc """ 22 | A string of instructions with the same operation can be collapsed into a partition for more 23 | efficient processing. 24 | """ 25 | @type partition :: {operation, [ZenMonitor.destination()]} 26 | 27 | defmodule State do 28 | @moduledoc """ 29 | Maintains the internal state for ZenMonitor.Proxy 30 | 31 | `monitors` is an ETS table with all the pids that the Proxy is currently monitoring 32 | """ 33 | @type t :: %__MODULE__{ 34 | monitors: :ets.tid() 35 | } 36 | defstruct [ 37 | :monitors 38 | ] 39 | end 40 | 41 | ## Client 42 | 43 | def start_link(args) do 44 | GenServer.start_link(__MODULE__, args, name: __MODULE__) 45 | end 46 | 47 | @doc """ 48 | Ping is a diagnostic function to check that the proxy is running. 
49 | 50 | It is mainly used by ZenMonitor.Local.Connectors to check if ZenMonitor.Proxy is available 51 | and running on a remote node 52 | """ 53 | @spec ping() :: :pong 54 | def ping() do 55 | GenServer.call(__MODULE__, :ping) 56 | end 57 | 58 | ## Server 59 | 60 | def init(_args) do 61 | Process.flag(:message_queue_data, :off_heap) 62 | {:ok, %State{monitors: :ets.new(:monitors, [:private, :set])}} 63 | end 64 | 65 | def handle_call(:ping, _from, %State{} = state) do 66 | {:reply, :pong, state} 67 | end 68 | 69 | def handle_cast({:subscribe, subscriber, targets}, %State{} = state) do 70 | process_operation(:subscribe, subscriber, targets, state) 71 | {:noreply, state} 72 | end 73 | 74 | def handle_cast({:process, subscriber, instructions}, %State{} = state) do 75 | # Create the most efficient instruction partitions 76 | for {operation, targets} <- partition_instructions(instructions) do 77 | process_operation(operation, subscriber, targets, state) 78 | end 79 | 80 | {:noreply, state} 81 | end 82 | 83 | def handle_info({:DOWN, _, :process, pid, reason}, %State{monitors: monitors} = state) do 84 | # Reasons can include stack traces and other dangerous items, truncate them. 85 | truncated_reason = Truncator.truncate(reason) 86 | 87 | # Enqueue the death certificates with the interested subscriber's batchers 88 | for [subscriber] <- :ets.match(Tables.subscribers(), {{pid, :"$1"}}) do 89 | # Delete the subscription 90 | :ets.delete(Tables.subscribers(), {pid, subscriber}) 91 | 92 | # Enqueue the death certificate with the Batcher 93 | subscriber 94 | |> Batcher.get() 95 | |> Batcher.enqueue(pid, truncated_reason) 96 | end 97 | 98 | # Clear the monitor 99 | :ets.delete(monitors, pid) 100 | 101 | {:noreply, state} 102 | end 103 | 104 | ## Private 105 | 106 | @spec process_operation( 107 | operation, 108 | subscriber :: pid(), 109 | targets :: [ZenMonitor.destination()], 110 | State.t() 111 | ) :: :ok 112 | defp process_operation(:subscribe, subscriber, targets, %State{monitors: monitors}) do 113 | # Record that the subscriber is interested in the targets 114 | :ets.insert(Tables.subscribers(), Enum.map(targets, &{{&1, subscriber}})) 115 | 116 | # Record and monitor each of the pids, filtering out already monitored pids 117 | for target <- targets, 118 | :ets.insert_new(monitors, {target}) do 119 | Process.monitor(target) 120 | end 121 | 122 | :ok 123 | end 124 | 125 | defp process_operation(:unsubscribe, subscriber, targets, _state) do 126 | # Remove the subscriptions from the subscribers table 127 | for target <- targets do 128 | :ets.delete(Tables.subscribers(), {target, subscriber}) 129 | end 130 | 131 | :ok 132 | end 133 | 134 | @spec partition_instructions([instruction]) :: [partition] 135 | defp partition_instructions(instructions) do 136 | do_partition_instructions(instructions, []) 137 | end 138 | 139 | @spec do_partition_instructions([instruction], [partition]) :: [partition] 140 | defp do_partition_instructions([], acc) do 141 | # There are no more instructions to process, the accumulator now has all the partitions, but 142 | # in reverse order, reverse and return it 143 | Enum.reverse(acc) 144 | end 145 | 146 | defp do_partition_instructions([{op, target} | rest], acc) do 147 | # Inspect the first instruction in the instruction list, collect all the targets with that 148 | # operation into a new partition. 
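# As a hypothetical example, the instruction list
#   [{:subscribe, pid1}, {:subscribe, pid2}, {:unsubscribe, pid3}]
# collects here into the partition targets [pid2, pid1] (reverse order is fine, instructions
# with the same operation are commutative) with [{:unsubscribe, pid3}] returned as the remainder.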
149 | {partition, remaining} = do_collect_targets(op, rest, [target]) 150 | 151 | # Recursively process any remaining instructions after prepending in the new partition into 152 | # the accumulator 153 | do_partition_instructions(remaining, [{op, partition} | acc]) 154 | end 155 | 156 | @spec do_collect_targets(operation, [instruction], [ZenMonitor.destination()]) :: 157 | {[ZenMonitor.destination()], [instruction]} 158 | defp do_collect_targets(_op, [], acc) do 159 | # There are no more instructions to process, return the accumulator. Note that since 160 | # instructions of the same operation are commutative there is no need to reverse the 161 | # accumulator even though the targets are in reverse order 162 | {acc, []} 163 | end 164 | 165 | defp do_collect_targets(op, [{op, target} | rest], acc) do 166 | # The next instruction matches the current operation, prepend the target into the accumulator 167 | # and recursively process the rest of the instructions 168 | do_collect_targets(op, rest, [target | acc]) 169 | end 170 | 171 | defp do_collect_targets(_op, [{_other, _} | _rest] = remainder, acc) do 172 | # The next instruction does not match the current operations. Similar to when there are no 173 | # more instructions to process, the accumulator is returned as-is. The remaining instructions 174 | # (including the current instruction that didn't match) are returned for further processing. 175 | {acc, remainder} 176 | end 177 | end 178 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy/batcher.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Batcher do 2 | 3 | @moduledoc """ 4 | `ZenMonitor.Proxy.Batcher` is responsible for collecting death_certificates from 5 | `ZenMonitor.Proxy` destined for the Batcher's subscriber (normally the subscriber is a 6 | `ZenMonitor.Local.Connector`) 7 | 8 | Periodically it will sweep and send all of the death_certificates it has collected since the 9 | last sweep to the subscriber for processing. 10 | """ 11 | use GenServer 12 | use Instruments.CustomFunctions, prefix: "zen_monitor.proxy.batcher" 13 | 14 | alias ZenMonitor.Proxy.Tables 15 | 16 | @chunk_size 5000 17 | @lookup_timeout 30_000 18 | @sweep_interval 100 19 | 20 | defmodule State do 21 | @moduledoc """ 22 | Maintains the internal state for the Batcher 23 | 24 | - `subscriber` is the process that death_certificates should be delivered to 25 | - `batch` is the queue of death_certificates pending until the next sweep. 
26 | - `length` is the current length of the batch queue (calculating queue length is an O(n) 27 | operation, is is simple to track it as elements are added / removed) 28 | """ 29 | 30 | @type t :: %__MODULE__{ 31 | subscriber: pid, 32 | batch: :queue.queue(), 33 | length: integer 34 | } 35 | defstruct [ 36 | :subscriber, 37 | batch: :queue.new(), 38 | length: 0 39 | ] 40 | end 41 | 42 | ## Client 43 | 44 | def start_link(subscriber) do 45 | GenServer.start_link(__MODULE__, subscriber) 46 | end 47 | 48 | @doc """ 49 | Get a batcher for a given subscriber 50 | """ 51 | @spec get(subscriber :: pid) :: pid 52 | def get(subscriber) do 53 | case GenRegistry.lookup(__MODULE__, subscriber) do 54 | {:ok, batcher} -> 55 | batcher 56 | 57 | {:error, :not_found} -> 58 | {:ok, batcher} = GenRegistry.lookup_or_start(__MODULE__, subscriber, [subscriber], lookup_timeout()) 59 | batcher 60 | end 61 | end 62 | 63 | @doc """ 64 | Enqueues a new death certificate into the batcher 65 | """ 66 | @spec enqueue(batcher :: pid, pid, reason :: any) :: :ok 67 | def enqueue(batcher, pid, reason) do 68 | GenServer.cast(batcher, {:enqueue, pid, reason}) 69 | end 70 | 71 | @doc """ 72 | Gets the sweep interval from the Application Environment 73 | 74 | The sweep interval is the number of milliseconds to wait between sweeps, see 75 | ZenMonitor.Proxy.Batcher's @sweep_interval for the default value 76 | 77 | This can be controlled at boot and runtime with the {:zen_monitor, :batcher_sweep_interval} 78 | setting, see `ZenMonitor.Proxy.Batcher.sweep_interval/1` for runtime convenience functionality. 79 | """ 80 | @spec sweep_interval() :: integer 81 | def sweep_interval do 82 | Application.get_env(:zen_monitor, :batcher_sweep_interval, @sweep_interval) 83 | end 84 | 85 | @doc """ 86 | Puts the sweep interval into the Application Environment 87 | 88 | This is a simple convenience function to overwrite the {:zen_monitor, :batcher_sweep_interval} 89 | setting at runtime 90 | """ 91 | @spec sweep_interval(value :: integer) :: :ok 92 | def sweep_interval(value) do 93 | Application.put_env(:zen_monitor, :batcher_sweep_interval, value) 94 | end 95 | 96 | @doc """ 97 | Gets the lookup timeout from the Application Environment 98 | 99 | The lookup timeout is the maximum amount of time in milliseconds that the calling process will 100 | wait to lookup or start a Batcher before exiting. 101 | 102 | This can be controlled at boot and runtime with the `{:zen_monitor, :batcher_lookup_timeout}` 103 | setting, see `ZenMonitor.Proxy.Batcher.lookup_timeout/1` for runtime convenience functionality. 104 | """ 105 | @spec lookup_timeout() :: timeout() 106 | def lookup_timeout do 107 | Application.get_env(:zen_monitor, :batcher_lookup_timeout, @lookup_timeout) 108 | end 109 | 110 | @doc """ 111 | Puts the lookup timeout into the Application Environment 112 | 113 | This is a simple convenience function to overwrite the {:zen_monitor, :batcher_lookup_timeout} 114 | setting at runtime. 
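For illustration only (an arbitrary value, not a recommendation):

    ZenMonitor.Proxy.Batcher.lookup_timeout(60_000)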
115 | """ 116 | @spec lookup_timeout(timeout :: timeout()) :: :ok 117 | def lookup_timeout(timeout) do 118 | Application.put_env(:zen_monitor, :batcher_lookup_timeout, timeout) 119 | end 120 | 121 | 122 | @doc """ 123 | Gets the chunk size from the Application Environment 124 | 125 | The chunk size is the maximum number of death certificates that will be sent during each sweep, 126 | see ZenMonitor.Proxy.Batcher's @chunk_size for the default value 127 | 128 | This can be controlled at boot and runtime with the {:zen_monitor, :batcher_chunk_size} 129 | setting, see ZenMonitor.Proxy.Batcher.chunk_size/1 for runtime convenience functionality. 130 | """ 131 | @spec chunk_size() :: integer 132 | def chunk_size do 133 | Application.get_env(:zen_monitor, :batcher_chunk_size, @chunk_size) 134 | end 135 | 136 | @doc """ 137 | Puts the chunk size into the Application Environment 138 | 139 | This is a simple convenience function to overwrite the {:zen_monitor, :batcher_chunk_size} 140 | setting at runtime. 141 | """ 142 | @spec chunk_size(value :: integer) :: :ok 143 | def chunk_size(value) do 144 | Application.put_env(:zen_monitor, :batcher_chunk_size, value) 145 | end 146 | 147 | ## Server 148 | 149 | def init(subscriber) do 150 | Process.monitor(subscriber) 151 | schedule_sweep() 152 | {:ok, %State{subscriber: subscriber}} 153 | end 154 | 155 | @doc """ 156 | Handle enqueuing a new death_certificate 157 | 158 | Simply puts it in the batch queue. 159 | """ 160 | def handle_cast({:enqueue, pid, reason}, %State{batch: batch, length: length} = state) do 161 | increment("enqueue") 162 | {:noreply, %State{state | batch: :queue.in({pid, reason}, batch), length: length + 1}} 163 | end 164 | 165 | # Handle the subscriber crashing 166 | # When the subscriber crashes there is no point in continuing to run, so the Batcher stops. 167 | def handle_info( 168 | {:DOWN, _, :process, subscriber, reason}, 169 | %State{subscriber: subscriber} = state 170 | ) do 171 | # The subscriber process has crashed, clean up the subscribers table 172 | :ets.match_delete(Tables.subscribers(), {{:_, subscriber}}) 173 | {:stop, {:shutdown, {:subscriber_down, reason}}, state} 174 | end 175 | 176 | # Handle sweep 177 | # Every sweep the batcher will send the death_certificates batched up since the last sweep to the 178 | # subscriber. After that it will schedule another sweep. 
179 | def handle_info(:sweep, %State{} = state) do 180 | new_state = do_sweep(state) 181 | schedule_sweep() 182 | {:noreply, new_state} 183 | end 184 | 185 | ## Private 186 | 187 | @spec do_sweep(state :: State.t()) :: State.t() 188 | defp do_sweep(%State{length: 0} = state), do: state 189 | 190 | defp do_sweep(%State{subscriber: subscriber, batch: batch, length: length} = state) do 191 | {summary, overflow, new_length} = chunk(batch, length) 192 | increment("sweep", length - new_length) 193 | Process.send(subscriber, {:dead, node(), :queue.to_list(summary)}, [:noconnect]) 194 | %State{state | batch: overflow, length: new_length} 195 | end 196 | 197 | @spec chunk(batch :: :queue.queue(), length :: integer) :: 198 | {:queue.queue(), :queue.queue(), integer} 199 | defp chunk(batch, length) do 200 | size = chunk_size() 201 | 202 | if length <= size do 203 | {batch, :queue.new(), 0} 204 | else 205 | {summary, overflow} = :queue.split(size, batch) 206 | {summary, overflow, length - size} 207 | end 208 | end 209 | 210 | @spec schedule_sweep() :: reference 211 | defp schedule_sweep do 212 | Process.send_after(self(), :sweep, sweep_interval()) 213 | end 214 | end 215 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Supervisor do 2 | @moduledoc """ 3 | Supervisor for the `ZenMonitor.Proxy` components. 4 | 5 | See `ZenMonitor.Proxy`, `ZenMonitor.Proxy.Tables`, and `ZenMonitor.Proxy.Batcher` for more 6 | information about the supervised processes. 7 | 8 | There are many `ZenMonitor.Proxy.Batcher` processes, which are managed by a `GenRegistry`. 9 | These are keyed by the pid of the `ZenMonitor.Local.Connector` the Batcher is responsible for. 10 | 11 | This supervisor uses the `:rest_for_one` strategy, so the order of the children is important and 12 | should not be altered. 13 | """ 14 | use Supervisor 15 | 16 | def start_link(_opts \\ []) do 17 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 18 | end 19 | 20 | def init(_opts) do 21 | children = [ 22 | ZenMonitor.Proxy.Tables, 23 | ZenMonitor.Proxy, 24 | GenRegistry.Spec.child_spec(ZenMonitor.Proxy.Batcher) 25 | ] 26 | 27 | Supervisor.init(children, strategy: :rest_for_one) 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy/tables.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Tables do 2 | @moduledoc """ 3 | `ZenMonitor.Proxy.Tables` owns the tables that are shared between multiple `ZenMonitor.Proxy` 4 | components. 5 | 6 | See `subscribers/0` for more information. 7 | """ 8 | use GenServer 9 | 10 | @subscriber_table Module.concat(__MODULE__, "Subscribers") 11 | 12 | ## Client 13 | 14 | def start_link(_opts \\ []) do 15 | GenServer.start_link(__MODULE__, [], name: __MODULE__) 16 | end 17 | 18 | @doc """ 19 | Subscribers holds information about who is subscribed to each pid. 20 | 21 | This information is stored in the following structure: 22 | 23 | { { monitored_pid, subscriber } } 24 | ^-----------key-------------^ 25 | 26 | `monitored_pid` is the local process that is being monitored. 
27 | 28 | `subscriber` is the remote `ZenMonitor.Local.Connector` that is interested in the `monitored_pid` 29 | """ 30 | @spec subscribers() :: :ets.tab() 31 | def subscribers do 32 | @subscriber_table 33 | end 34 | 35 | ## Server 36 | 37 | def init(_opts) do 38 | @subscriber_table = 39 | :ets.new(@subscriber_table, [:public, :named_table, :ordered_set, write_concurrency: true]) 40 | 41 | {:ok, nil} 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/zen_monitor/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Supervisor do 2 | @moduledoc """ 3 | ZenMonitor.Supervisor is a convenience Supervisor that starts the Local and Proxy Supervisors 4 | 5 | See ZenMonitor.Local.Supervisor and ZenMonitor.Proxy.Supervisor for more information. 6 | """ 7 | use Supervisor 8 | 9 | def start_link(_opts \\ []) do 10 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 11 | end 12 | 13 | def init(_opts) do 14 | children = [ 15 | ZenMonitor.Local.Supervisor, 16 | ZenMonitor.Proxy.Supervisor 17 | ] 18 | 19 | Supervisor.init(children, strategy: :one_for_one) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/zen_monitor/truncator.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Truncator do 2 | @moduledoc """ 3 | ZenMonitor.Truncator is used to truncate error messages to prevent error expansion issues. 4 | 5 | ## Error Expansion 6 | 7 | At the core of ZenMonitor is a system that collects local `:DOWN` messages, batches them up and 8 | relays them in bulk. This opens up a failure mode where each `:DOWN` message individually is 9 | deliverable, but the bulk summary grows to an unsupportable size due to the aggregation of large 10 | reason payloads. 11 | 12 | If no truncation is performed then the payload can cause instability on the sender or the 13 | receiver side. 14 | 15 | ## Truncation Behavior 16 | 17 | ZenMonitor will truncate error reasons if they exceed a certain size to prevent Error Expansion 18 | from breaking either the sender or the receiver. 19 | 20 | Truncation is performed recursively on the term up to a maximum depth which can be provided to 21 | the `ZenMonitor.Truncator.truncate/2` function. 22 | 23 | See below for an explanation of how the Truncator treats different values 24 | 25 | ### Pass-Through Values 26 | 27 | There are a number of types that the Truncator will pass through unmodified. 28 | 29 | - Atoms 30 | - Pids 31 | - Numbers 32 | - References 33 | - Ports 34 | - Binaries less than `@max_binary_size` (see the Binary section below for more information) 35 | 36 | ### Binaries 37 | 38 | There is a configurable value `@max_binary_size` any binary encountered over this size will be 39 | truncated to `@max_binary_size - 3` and a trailing '...' will be appended to indicate the value 40 | has been truncated. This guarantees that no binary will appear in the term with size greater 41 | than `@max_binary_size` 42 | 43 | ### Tuples 44 | 45 | 0-tuples through 4-tuples will be passed through with their interior terms recursively 46 | truncated. If a tuple has more than 4 elements, it will be replaced with the `:truncated` atom. 47 | 48 | ### Lists 49 | 50 | Lists with 0 to 4 elements will be passed through with each element recursively truncated. If a 51 | list has more than 4 elements, it will be replaced with the `:truncated` atom. 
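For illustration, assuming the default configuration:

    ZenMonitor.Truncator.truncate([1, 2, 3])
    #=> [1, 2, 3]

    ZenMonitor.Truncator.truncate([1, 2, 3, 4, 5])
    #=> :truncated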
52 | 
53 | ### Maps
54 | 
55 | Maps with a `map_size/1` less than 5 will be passed through with each value recursively
56 | truncated. If a map has a size of 5 or greater then it will be replaced with the `:truncated`
57 | atom.
58 | 
59 | ### Structs
60 | 
61 | Structs are converted into maps and then the map rules are applied; they are then converted back
62 | into structs. The effect is that a Struct with 4 fields or fewer will be retained (with all
63 | values recursively truncated) while Structs with 5 or more fields will be replaced with the
64 | `:truncated` atom.
65 | 
66 | ### Recursion Limit
67 | 
68 | The Truncator will only descend up to the `depth` argument passed into
69 | `ZenMonitor.Truncator.truncate/2`, regardless of the term's value; if the recursion descends deeper
70 | than this value then the `:truncated` atom will be used in place of the original value.
71 | 
72 | ## Configuration
73 | 
74 | `ZenMonitor.Truncator` exposes two different configuration options, and allows for one call-site
75 | override. The configuration options are evaluated at compile time, changing these values at
76 | run-time (through a facility like `Application.put_env/3`) will have no effect.
77 | 
78 | Both configuration options reside under the `:zen_monitor` app key.
79 | 
80 | `:max_binary_size` is the size in bytes over which the Truncator will truncate the binary. The
81 | largest binary returned by the Truncator is defined to be max_binary_size + 3; this is
82 | because when the Truncator truncates a binary it will append `...` to indicate that truncation
83 | has occurred.
84 | 
85 | `:truncation_depth` is the default depth that the Truncator will recursively descend into the
86 | term to be truncated. This is the value used by `ZenMonitor.Truncator.truncate/2` if no second
87 | argument is provided; providing a call-site second argument will override this configuration.
88 | """
89 | 
90 | @max_binary_size Application.get_env(:zen_monitor, :max_binary_size, 1024)
91 | @truncation_binary_size @max_binary_size - 3
92 | @truncation_depth Application.get_env(:zen_monitor, :truncation_depth, 3)
93 | 
94 | @doc """
95 | Truncates a term to a given depth
96 | 
97 | See the module documentation for more information about how truncation works.
98 | """
99 | @spec truncate(term, depth :: pos_integer()) :: term
100 | def truncate(term, depth \\ @truncation_depth) do
101 | do_truncate(term, 0, depth)
102 | end
103 | 
104 | ## Private
105 | 
106 | defp do_truncate({:shutdown, _} = shutdown, 0, _) do
107 | shutdown
108 | end
109 | 
110 | defp do_truncate(_, current, max_depth) when current >= max_depth do
111 | :truncated
112 | end
113 | 
114 | defp do_truncate(atom, _, _) when is_atom(atom), do: atom
115 | 
116 | defp do_truncate(pid, _, _) when is_pid(pid), do: pid
117 | 
118 | defp do_truncate(number, _, _) when is_number(number), do: number
119 | 
120 | defp do_truncate(bin, _, _) when is_binary(bin) and byte_size(bin) <= @max_binary_size, do: bin
121 | 
122 | defp do_truncate(<<first_chunk::binary-size(@truncation_binary_size), _::binary>>, _, _) do
123 | first_chunk <> "..."
124 | end 125 | 126 | defp do_truncate(ref, _, _) when is_reference(ref), do: ref 127 | 128 | defp do_truncate(port, _, _) when is_port(port), do: port 129 | 130 | # Tuples 131 | defp do_truncate({a, b, c, d}, current, max_depth) do 132 | next = current + 1 133 | 134 | {do_truncate(a, next, max_depth), do_truncate(b, next, max_depth), 135 | do_truncate(c, next, max_depth), do_truncate(d, next, max_depth)} 136 | end 137 | 138 | defp do_truncate({a, b, c}, current, max_depth) do 139 | next = current + 1 140 | 141 | {do_truncate(a, next, max_depth), do_truncate(b, next, max_depth), 142 | do_truncate(c, next, max_depth)} 143 | end 144 | 145 | defp do_truncate({a, b}, current, max_depth) do 146 | next = current + 1 147 | {do_truncate(a, next, max_depth), do_truncate(b, next, max_depth)} 148 | end 149 | 150 | defp do_truncate({a}, current, max_depth) do 151 | next = current + 1 152 | {do_truncate(a, next, max_depth)} 153 | end 154 | 155 | defp do_truncate({} = tuple, _, _) do 156 | tuple 157 | end 158 | 159 | # Lists 160 | defp do_truncate([_, _, _, _] = l, current, max_depth) do 161 | do_truncate_list(l, current, max_depth) 162 | end 163 | 164 | defp do_truncate([_, _, _] = l, current, max_depth) do 165 | do_truncate_list(l, current, max_depth) 166 | end 167 | 168 | defp do_truncate([_, _] = l, current, max_depth) do 169 | do_truncate_list(l, current, max_depth) 170 | end 171 | 172 | defp do_truncate([_] = l, current, max_depth) do 173 | do_truncate_list(l, current, max_depth) 174 | end 175 | 176 | defp do_truncate([], _, _) do 177 | [] 178 | end 179 | 180 | # Maps / Structs 181 | defp do_truncate(%struct_module{} = struct, current, max_depth) do 182 | truncated_value = 183 | struct 184 | |> Map.from_struct() 185 | |> do_truncate(current, max_depth) 186 | 187 | if is_map(truncated_value) do 188 | # Don't use Kernel.struct/2 because that crashes if this node 189 | # does not have the code for struct_module. 
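# Illustrative example (hypothetical module): a %RemoteApp.Error{} arriving from another node
# keeps its original struct name this way, even if RemoteApp.Error is not loaded locally.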
190 | Map.put(truncated_value, :__struct__, struct_module) 191 | else 192 | truncated_value 193 | end 194 | end 195 | 196 | defp do_truncate(%{} = m, current, max_depth) when map_size(m) < 5 do 197 | for {k, v} <- m, into: %{} do 198 | {k, do_truncate(v, current + 1, max_depth)} 199 | end 200 | end 201 | 202 | # Catch all 203 | defp do_truncate(_, _, _) do 204 | :truncated 205 | end 206 | 207 | defp do_truncate_list(l, current, max_depth) do 208 | Enum.map(l, &do_truncate(&1, current + 1, max_depth)) 209 | end 210 | end 211 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Mixfile do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :zen_monitor, 7 | name: "ZenMonitor", 8 | version: "2.1.0", 9 | elixir: "~> 1.7", 10 | start_permanent: Mix.env() == :prod, 11 | aliases: aliases(), 12 | deps: deps(), 13 | docs: docs(), 14 | elixirc_paths: elixirc_paths(Mix.env()), 15 | package: package() 16 | ] 17 | end 18 | 19 | def application do 20 | [ 21 | extra_applications: [:logger, :instruments], 22 | mod: {ZenMonitor.Application, []} 23 | ] 24 | end 25 | 26 | defp aliases do 27 | [ 28 | test: "test --no-start" 29 | ] 30 | end 31 | 32 | defp deps do 33 | [ 34 | {:gen_stage, "~> 1.0"}, 35 | {:instruments, "~> 2.1"}, 36 | {:gen_registry, "~> 1.0"}, 37 | {:ex_doc, "~> 0.27.3", only: :dev, runtime: false}, 38 | {:dialyxir, "~> 1.0", only: :dev, runtime: false} 39 | ] 40 | end 41 | 42 | defp docs do 43 | [ 44 | name: "ZenMonitor", 45 | extras: ["README.md"], 46 | main: "readme", 47 | source_url: "https://github.com/discordapp/zen_monitor", 48 | groups_for_modules: [ 49 | "Programmer Interface": [ 50 | ZenMonitor 51 | ], 52 | "Local ZenMonitor System": [ 53 | ZenMonitor.Local, 54 | ZenMonitor.Local.State, 55 | ZenMonitor.Local.Connector, 56 | ZenMonitor.Local.Connector.State, 57 | ZenMonitor.Local.Dispatcher, 58 | ZenMonitor.Local.Tables 59 | ], 60 | "Proxy ZenMonitor System": [ 61 | ZenMonitor.Proxy, 62 | ZenMonitor.Proxy.State, 63 | ZenMonitor.Proxy.Batcher, 64 | ZenMonitor.Proxy.Batcher.State, 65 | ZenMonitor.Proxy.Tables 66 | ], 67 | "Supervisors / OTP / Utilities": [ 68 | ZenMonitor.Application, 69 | ZenMonitor.Supervisor, 70 | ZenMonitor.Local.Supervisor, 71 | ZenMonitor.Proxy.Supervisor, 72 | ZenMonitor.Metrics, 73 | ZenMonitor.Truncator 74 | ] 75 | ] 76 | ] 77 | end 78 | 79 | defp elixirc_paths(:test) do 80 | elixirc_paths(:any) ++ ["test/support"] 81 | end 82 | 83 | defp elixirc_paths(_) do 84 | ["lib"] 85 | end 86 | 87 | defp package() do 88 | [ 89 | name: :zen_monitor, 90 | description: "ZenMonitor provides efficient monitoring of remote processes.", 91 | maintainers: ["Discord Core Infrastructure"], 92 | licenses: ["MIT"], 93 | links: %{ 94 | "GitHub" => "https://github.com/discordapp/zen_monitor" 95 | } 96 | ] 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "dialyxir": {:hex, :dialyxir, "1.0.0", "6a1fa629f7881a9f5aaf3a78f094b2a51a0357c843871b8bc98824e7342d00a5", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "aeb06588145fac14ca08d8061a142d52753dbc2cf7f0d00fc1013f53f8654654"}, 3 | "earmark": {:hex, :earmark, "1.4.10", "bddce5e8ea37712a5bfb01541be8ba57d3b171d3fa4f80a0be9bcf1db417bcaf", [:mix], [{:earmark_parser, ">= 1.4.10", [hex: 
:earmark_parser, repo: "hexpm", optional: false]}], "hexpm", "12dbfa80810478e521d3ffb941ad9fbfcbbd7debe94e1341b4c4a1b2411c1c27"}, 4 | "earmark_parser": {:hex, :earmark_parser, "1.4.29", "149d50dcb3a93d9f3d6f3ecf18c918fb5a2d3c001b5d3305c926cddfbd33355b", [:mix], [], "hexpm", "4902af1b3eb139016aed210888748db8070b8125c2342ce3dcae4f38dcc63503"}, 5 | "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"}, 6 | "ex_doc": {:hex, :ex_doc, "0.27.3", "d09ed7ab590b71123959d9017f6715b54a448d76b43cf909eb0b2e5a78a977b2", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "ee60b329d08195039bfeb25231a208749be4f2274eae42ce38f9be0538a2f2e6"}, 7 | "gen_registry": {:hex, :gen_registry, "1.0.2", "b7175cf940e5d13da5a90d283974e7f9c64d9b87cb4ceb4f2cbacf95e5260215", [:mix], [], "hexpm", "51ebb0556e9469faeb737d9c8d6112df7fbd27c68bdf308e3d3572a231e7f5d8"}, 8 | "gen_stage": {:hex, :gen_stage, "1.1.2", "b1656cd4ba431ed02c5656fe10cb5423820847113a07218da68eae5d6a260c23", [:mix], [], "hexpm", "9e39af23140f704e2b07a3e29d8f05fd21c2aaf4088ff43cb82be4b9e3148d02"}, 9 | "instruments": {:hex, :instruments, "2.1.1", "e6629f71048e963e941263494420720c41554d95b1779ffe3404bf22cecb4efd", [:mix], [{:recon, "~> 2.3.1", [hex: :recon, repo: "hexpm", optional: false]}, {:statix, "~> 1.2.1", [hex: :statix, repo: "hexpm", optional: false]}], "hexpm", "f295ddf3fb09ac37f915cd1b2bd2f4dbcafc2706730237a4bf30aa846a65ada5"}, 10 | "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, 11 | "makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"}, 12 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 13 | "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, 14 | "recon": {:hex, :recon, "2.3.6", "2bcad0cf621fb277cabbb6413159cd3aa30265c2dee42c968697988b30108604", [:rebar3], [], "hexpm", "f55198650a8ec01d3efc04797abe550c7d023e7ff8b509f373cf933032049bd8"}, 15 | "statix": {:hex, :statix, "1.2.1", "4f23c8cc2477ea0de89fed5e34f08c54b0d28b838f7b8f26613155f2221bb31e", [:mix], [], "hexpm", "7f988988fddcce19ae376bb8e47aa5ea5dabf8d4ba78d34d1ae61eb537daf72e"}, 16 | } 17 | -------------------------------------------------------------------------------- /test/black_box_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.BlackBox.Test do 2 | @moduledoc """ 3 
| This test suite treats the ZenMonitor system as a black box and simply asserts that the client 4 | facing behavior is correct. 5 | """ 6 | use ExUnit.Case 7 | 8 | alias ZenMonitor.Local.{Connector, Dispatcher} 9 | alias ZenMonitor.Proxy.Batcher 10 | 11 | setup do 12 | start_supervised(ZenMonitor.Supervisor) 13 | {:ok, down: :down@down, remotes: []} 14 | end 15 | 16 | @doc """ 17 | Reduces the intervals for all the batching parts of ZenMonitor so that the default 18 | assert_receive / refute_receive timeouts are an order of magnitude larger. 19 | """ 20 | def fast_zen_monitor(ctx) do 21 | # Tune the local dispatcher 22 | original_demand_interval = Dispatcher.demand_interval() 23 | Dispatcher.demand_interval(10) 24 | 25 | # Tune the local connector 26 | original_connector_interval = Connector.sweep_interval() 27 | Connector.sweep_interval(10) 28 | 29 | # Tune the remote batchers 30 | original_batch_intervals = 31 | Enum.map([node() | ctx.remotes], fn remote -> 32 | original = :rpc.call(remote, Batcher, :sweep_interval, []) 33 | :rpc.call(remote, Batcher, :sweep_interval, [10]) 34 | {remote, original} 35 | end) 36 | 37 | on_exit(fn -> 38 | # Restore the local settings 39 | Dispatcher.demand_interval(original_demand_interval) 40 | Connector.sweep_interval(original_connector_interval) 41 | 42 | # Restore the remote settings 43 | Enum.each(original_batch_intervals, fn {remote, original} -> 44 | :rpc.call(remote, Batcher, :sweep_interval, [original]) 45 | end) 46 | end) 47 | 48 | :ok 49 | end 50 | 51 | def start_compatible_remote(ctx) do 52 | {:ok, compatible, nil} = ChildNode.start_link(:zen_monitor, :Compatible) 53 | 54 | # Perform an initial connect like discovery would 55 | Node.connect(compatible) 56 | 57 | on_exit(fn -> 58 | Node.monitor(compatible, true) 59 | 60 | receive do 61 | {:nodedown, ^compatible} -> :ok 62 | end 63 | end) 64 | 65 | {:ok, compatible: compatible, remotes: [compatible | ctx.remotes()]} 66 | end 67 | 68 | def start_incompatible_remote(_) do 69 | {:ok, incompatible, nil} = ChildNode.start_link(:elixir, :Incompatible) 70 | 71 | # Perform an initial connect like discovery would 72 | Node.connect(incompatible) 73 | 74 | on_exit(fn -> 75 | Node.monitor(incompatible, true) 76 | 77 | receive do 78 | {:nodedown, ^incompatible} -> :ok 79 | end 80 | end) 81 | 82 | {:ok, incompatible: incompatible} 83 | end 84 | 85 | def start_compatible_processes(ctx) do 86 | compatible_pid = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 87 | compatible_pid_b = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 88 | 89 | {:ok, compatible_pid: compatible_pid, compatible_pid_b: compatible_pid_b} 90 | end 91 | 92 | def start_incompatible_processes(ctx) do 93 | incompatible_pid = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 94 | incompatible_pid_b = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 95 | 96 | {:ok, incompatible_pid: incompatible_pid, incompatible_pid_b: incompatible_pid_b} 97 | end 98 | 99 | def start_local_processes(_) do 100 | local_pid = spawn(fn -> Process.sleep(:infinity) end) 101 | local_pid_b = spawn(fn -> Process.sleep(:infinity) end) 102 | 103 | {:ok, local_pid: local_pid, local_pid_b: local_pid_b} 104 | end 105 | 106 | describe "Monitoring a local process" do 107 | setup [:fast_zen_monitor, :start_local_processes] 108 | 109 | test "monitoring a local process returns a reference", ctx do 110 | ref = ZenMonitor.monitor(ctx.local_pid) 111 | assert is_reference(ref) 112 | end 113 | 114 | test "local process returns a :DOWN message if it 
goes down", ctx do 115 | target = ctx.local_pid() 116 | 117 | # Monitor the local process 118 | ref = ZenMonitor.monitor(target) 119 | Helper.await_monitor_established(ref, target) 120 | 121 | # Kill the local process 122 | Process.exit(target, :kill) 123 | Helper.await_monitor_cleared(ref, target) 124 | 125 | # Assert that we receive the down messages 126 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :killed}} 127 | 128 | # Make sure that we don't receive any additional messages 129 | refute_receive {:DOWN, _, _, _, _} 130 | end 131 | 132 | test "multiple monitors all get fired", ctx do 133 | target = ctx.local_pid() 134 | 135 | # Monitor the local process multiple times 136 | ref_a = ZenMonitor.monitor(target) 137 | ref_b = ZenMonitor.monitor(target) 138 | ref_c = ZenMonitor.monitor(target) 139 | Helper.await_monitors_established([ref_a, ref_b, ref_c], target) 140 | 141 | # Kill the local process 142 | Process.exit(target, :kill) 143 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 144 | 145 | # Assert that we receive down message for each monitor 146 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :killed}} 147 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :killed}} 148 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :killed}} 149 | 150 | # Make sure that we don't receive any additional messages 151 | refute_receive {:DOWN, _, _, _, _} 152 | end 153 | 154 | test "an already down local process returns a :DOWN message", ctx do 155 | target = ctx.local_pid() 156 | 157 | # Kill the local process, before the monitors 158 | Process.exit(target, :kill) 159 | 160 | # Monitor the local process 161 | ref = ZenMonitor.monitor(target) 162 | Helper.await_monitor_cleared(ref, target) 163 | 164 | # Assert that we receive the correct reason 165 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :noproc}} 166 | 167 | # Make sure that we don't receive any additional messages 168 | refute_receive {:DOWN, _, _, _, _} 169 | end 170 | 171 | test "multiple monitors for already down local process returns :DOWN messages", ctx do 172 | target = ctx.local_pid() 173 | 174 | # Kill the local process, before the monitors 175 | Process.exit(target, :kill) 176 | 177 | # Monitor the local process multiple times 178 | ref_a = ZenMonitor.monitor(target) 179 | ref_b = ZenMonitor.monitor(target) 180 | ref_c = ZenMonitor.monitor(target) 181 | 182 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 183 | 184 | # Assert that we receive multiple :DOWN messages with the correct reason 185 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :noproc}} 186 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :noproc}} 187 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :noproc}} 188 | 189 | # Make sure that we don't receive any additional messages 190 | refute_receive {:DOWN, _, _, _, _} 191 | end 192 | 193 | test "mixed monitors established before and after process down", ctx do 194 | target = ctx.local_pid() 195 | 196 | # Establish some monitors before the pid is killed 197 | ref_alive_a = ZenMonitor.monitor(target) 198 | ref_alive_b = ZenMonitor.monitor(target) 199 | Helper.await_monitors_established([ref_alive_a, ref_alive_b], target) 200 | 201 | # Kill the local process 202 | Process.exit(target, :kill) 203 | Helper.await_monitors_cleared([ref_alive_a, ref_alive_b], target) 204 | 205 | # Assert that the initial monitors fire 206 | assert_receive {:DOWN, 
^ref_alive_a, :process, ^target, {:zen_monitor, :killed}} 207 | assert_receive {:DOWN, ^ref_alive_b, :process, ^target, {:zen_monitor, :killed}} 208 | 209 | # Establish some monitors after the pid is killed 210 | ref_dead_a = ZenMonitor.monitor(target) 211 | ref_dead_b = ZenMonitor.monitor(target) 212 | 213 | Helper.await_monitors_cleared([ref_dead_a, ref_dead_b], target) 214 | 215 | # Assert that the new monitors got the expected :DOWN messages with the correct reason 216 | assert_receive {:DOWN, ^ref_dead_a, :process, ^target, {:zen_monitor, :noproc}} 217 | assert_receive {:DOWN, ^ref_dead_b, :process, ^target, {:zen_monitor, :noproc}} 218 | 219 | # Make sure that we don't receive any additional messages 220 | refute_receive {:DOWN, _, _, _, _} 221 | end 222 | 223 | test "multiple down processes all report back as :DOWN", ctx do 224 | target = ctx.local_pid() 225 | other = ctx.local_pid_b() 226 | 227 | # Establish multiple monitors for each process 228 | target_ref_a = ZenMonitor.monitor(target) 229 | target_ref_b = ZenMonitor.monitor(target) 230 | other_ref_a = ZenMonitor.monitor(other) 231 | other_ref_b = ZenMonitor.monitor(other) 232 | 233 | Helper.await_monitors_established([target_ref_a, target_ref_b], target) 234 | Helper.await_monitors_established([other_ref_a, other_ref_b], other) 235 | 236 | # Kill both local processes 237 | Process.exit(target, :kill) 238 | Process.exit(other, :kill) 239 | 240 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 241 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 242 | 243 | # Assert that we receive all the expected :DOWN messages 244 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :killed}} 245 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :killed}} 246 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 247 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 248 | end 249 | 250 | test "multiple already down process all report back as :DOWN", ctx do 251 | target = ctx.local_pid() 252 | other = ctx.local_pid_b() 253 | 254 | # Kill both local processes 255 | Process.exit(target, :kill) 256 | Process.exit(other, :kill) 257 | 258 | # Establish multiple monitors for each process 259 | target_ref_a = ZenMonitor.monitor(target) 260 | target_ref_b = ZenMonitor.monitor(target) 261 | other_ref_a = ZenMonitor.monitor(other) 262 | other_ref_b = ZenMonitor.monitor(other) 263 | 264 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 265 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 266 | 267 | # Assert that we receive all the expected :DOWN messages 268 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 269 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 270 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :noproc}} 271 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :noproc}} 272 | end 273 | 274 | test "mixed down processes all report back as :DOWN", ctx do 275 | target = ctx.local_pid() 276 | other = ctx.local_pid_b() 277 | 278 | # Kill target before establishing any monitors 279 | Process.exit(target, :kill) 280 | 281 | # Establish multiple monitors for each process 282 | target_ref_a = ZenMonitor.monitor(target) 283 | target_ref_b = ZenMonitor.monitor(target) 284 | other_ref_a = ZenMonitor.monitor(other) 285 | other_ref_b = ZenMonitor.monitor(other) 286 
| Helper.await_monitors_established([other_ref_a, other_ref_b], other) 287 | 288 | # Kill other after establishing the monitors 289 | Process.exit(other, :kill) 290 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 291 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 292 | 293 | # Assert that we receive all the expected :DOWN messages 294 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 295 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 296 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 297 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 298 | end 299 | end 300 | 301 | describe "Monitoring a remote process on a compatible node" do 302 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 303 | 304 | test "monitoring a remote process returns a reference", ctx do 305 | ref = ZenMonitor.monitor(ctx.compatible_pid) 306 | assert is_reference(ref) 307 | end 308 | 309 | test "remote process returns a :DOWN message if it goes down", ctx do 310 | target = ctx.compatible_pid() 311 | 312 | # Monitor the remote process 313 | ref = ZenMonitor.monitor(target) 314 | Helper.await_monitor_established(ref, target) 315 | 316 | # Kill the remote process 317 | Process.exit(target, :kill) 318 | Helper.await_monitor_cleared(ref, target) 319 | 320 | # Assert that we receive the down messages 321 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :killed}} 322 | 323 | # Make sure that we don't receive any additional messages 324 | refute_receive {:DOWN, _, _, _, _} 325 | end 326 | 327 | test "multiple monitors all get fired", ctx do 328 | target = ctx.compatible_pid() 329 | 330 | # Monitor the remote process multiple times 331 | ref_a = ZenMonitor.monitor(target) 332 | ref_b = ZenMonitor.monitor(target) 333 | ref_c = ZenMonitor.monitor(target) 334 | Helper.await_monitors_established([ref_a, ref_b, ref_c], target) 335 | 336 | # Kill the remote process 337 | Process.exit(target, :kill) 338 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 339 | 340 | # Assert that we receive down message for each monitor 341 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :killed}} 342 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :killed}} 343 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :killed}} 344 | 345 | # Make sure that we don't receive any additional messages 346 | refute_receive {:DOWN, _, _, _, _} 347 | end 348 | 349 | test "an already down remote process returns a :DOWN message", ctx do 350 | target = ctx.compatible_pid() 351 | 352 | # Kill the remote process, before the monitors 353 | Process.exit(target, :kill) 354 | 355 | # Monitor the remote process 356 | ref = ZenMonitor.monitor(target) 357 | Helper.await_monitor_cleared(ref, target) 358 | # Assert that we receive the correct reason 359 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :noproc}} 360 | 361 | # Make sure that we don't receive any additional messages 362 | refute_receive {:DOWN, _, _, _, _} 363 | end 364 | 365 | test "multiple monitors for already down remote process returns :DOWN messages", ctx do 366 | target = ctx.compatible_pid() 367 | 368 | # Kill the remote process, before the monitors 369 | Process.exit(target, :kill) 370 | 371 | # Monitor the remote process multiple times 372 | ref_a = ZenMonitor.monitor(target) 373 | ref_b =
ZenMonitor.monitor(target) 374 | ref_c = ZenMonitor.monitor(target) 375 | 376 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 377 | 378 | # Assert that we receive multiple :DOWN messages with the correct reason 379 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :noproc}} 380 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :noproc}} 381 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :noproc}} 382 | 383 | # Make sure that we don't receive any additional messages 384 | refute_receive {:DOWN, _, _, _, _} 385 | end 386 | 387 | test "mixed monitors established before and after process down", ctx do 388 | target = ctx.compatible_pid() 389 | 390 | # Establish some monitors before the pid is killed 391 | ref_alive_a = ZenMonitor.monitor(target) 392 | ref_alive_b = ZenMonitor.monitor(target) 393 | Helper.await_monitors_established([ref_alive_a, ref_alive_b], target) 394 | 395 | # Kill the remote process 396 | Process.exit(target, :kill) 397 | Helper.await_monitors_cleared([ref_alive_a, ref_alive_b], target) 398 | 399 | # Assert that the initial monitors fire 400 | assert_receive {:DOWN, ^ref_alive_a, :process, ^target, {:zen_monitor, :killed}} 401 | assert_receive {:DOWN, ^ref_alive_b, :process, ^target, {:zen_monitor, :killed}} 402 | 403 | # Establish some monitors after the pid is killed 404 | ref_dead_a = ZenMonitor.monitor(target) 405 | ref_dead_b = ZenMonitor.monitor(target) 406 | Helper.await_monitors_cleared([ref_dead_a, ref_dead_b], target) 407 | 408 | # Assert that the new monitors got the expected :DOWN messages with the correct reason 409 | assert_receive {:DOWN, ^ref_dead_a, :process, ^target, {:zen_monitor, :noproc}} 410 | assert_receive {:DOWN, ^ref_dead_b, :process, ^target, {:zen_monitor, :noproc}} 411 | 412 | # Make sure that we don't receive any additional messages 413 | refute_receive {:DOWN, _, _, _, _} 414 | end 415 | 416 | test "multiple down processes all report back as :DOWN", ctx do 417 | target = ctx.compatible_pid() 418 | other = ctx.compatible_pid_b() 419 | 420 | # Establish multiple monitors for each process 421 | target_ref_a = ZenMonitor.monitor(target) 422 | target_ref_b = ZenMonitor.monitor(target) 423 | other_ref_a = ZenMonitor.monitor(other) 424 | other_ref_b = ZenMonitor.monitor(other) 425 | Helper.await_monitors_established([target_ref_a, target_ref_b], target) 426 | Helper.await_monitors_established([other_ref_a, other_ref_b], other) 427 | 428 | # Kill both remote processes 429 | Process.exit(target, :kill) 430 | Process.exit(other, :kill) 431 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 432 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 433 | 434 | # Assert that we receive all the expected :DOWN messages 435 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :killed}} 436 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :killed}} 437 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 438 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 439 | end 440 | 441 | test "multiple already down process all report back as :DOWN", ctx do 442 | target = ctx.compatible_pid() 443 | other = ctx.compatible_pid_b() 444 | 445 | # Kill both remote processes 446 | Process.exit(target, :kill) 447 | Process.exit(other, :kill) 448 | 449 | # Establish multiple monitors for each process 450 | target_ref_a = ZenMonitor.monitor(target) 451 | target_ref_b = 
ZenMonitor.monitor(target) 452 | other_ref_a = ZenMonitor.monitor(other) 453 | other_ref_b = ZenMonitor.monitor(other) 454 | 455 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 456 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 457 | 458 | # Assert that we receive all the expected :DOWN messages 459 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 460 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 461 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :noproc}} 462 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :noproc}} 463 | end 464 | 465 | test "mixed down processes all report back as :DOWN", ctx do 466 | target = ctx.compatible_pid() 467 | other = ctx.compatible_pid_b() 468 | 469 | # Kill target before establishing any monitors 470 | Process.exit(target, :kill) 471 | 472 | # Establish multiple monitors for each process 473 | target_ref_a = ZenMonitor.monitor(target) 474 | target_ref_b = ZenMonitor.monitor(target) 475 | other_ref_a = ZenMonitor.monitor(other) 476 | other_ref_b = ZenMonitor.monitor(other) 477 | Helper.await_monitors_established([other_ref_a, other_ref_b], other) 478 | 479 | # Kill other after establishing the monitors 480 | Process.exit(other, :kill) 481 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 482 | 483 | # Assert that we receive all the expected :DOWN messages 484 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 485 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 486 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 487 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 488 | end 489 | 490 | test "all monitored processes report back as :DOWN if the node dies", ctx do 491 | remote = ctx.compatible() 492 | target = ctx.compatible_pid() 493 | other = ctx.compatible_pid_b() 494 | 495 | # Monitor both remote processes 496 | target_ref = ZenMonitor.monitor(target) 497 | other_ref = ZenMonitor.monitor(other) 498 | Helper.await_monitor_established(target_ref, target) 499 | Helper.await_monitor_established(other_ref, other) 500 | 501 | # Stop the remote node 502 | assert :ok = :slave.stop(remote) 503 | Helper.await_monitor_cleared(target_ref, target) 504 | Helper.await_monitor_cleared(other_ref, other) 505 | 506 | # Assert that the :DOWN messages were dispatched with :nodedown 507 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}} 508 | assert_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, :nodedown}} 509 | end 510 | end 511 | 512 | describe "Monitoring a remote process on an incompatible node" do 513 | setup [:start_incompatible_remote, :start_incompatible_processes, :fast_zen_monitor] 514 | 515 | test "monitoring a remote process returns a reference", ctx do 516 | ref = ZenMonitor.monitor(ctx.incompatible_pid) 517 | assert is_reference(ref) 518 | end 519 | 520 | test "monitoring returns down with :nodedown", ctx do 521 | target = ctx.incompatible_pid() 522 | 523 | # Attempt to monitor incompatible node 524 | ref = ZenMonitor.monitor(target) 525 | 526 | # Assert that the :DOWN message with :nodedown is delivered 527 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :nodedown}} 528 | end 529 | 530 | test "monitoring multiple returns multiple downs with :nodedown", ctx do 531 | target =
ctx.incompatible_pid() 532 | other = ctx.incompatible_pid_b() 533 | 534 | # Attempt to monitor all the incompatible processes 535 | target_ref = ZenMonitor.monitor(target) 536 | other_ref = ZenMonitor.monitor(other) 537 | 538 | # Assert that the :DOWN messages with :nodedown are delivered 539 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}} 540 | assert_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, :nodedown}} 541 | end 542 | end 543 | 544 | describe "Monitoring a remote process on a compatible node that becomes incompatible" do 545 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 546 | 547 | test "monitoring a remote process returns a reference", ctx do 548 | ref = ZenMonitor.monitor(ctx.compatible_pid()) 549 | assert is_reference(ref) 550 | end 551 | 552 | test "subscribing to a previously compatible host will cause :nodedown", ctx do 553 | remote = ctx.compatible() 554 | target = ctx.compatible_pid() 555 | other = ctx.compatible_pid_b() 556 | 557 | # Perform an initial monitor 558 | target_ref = ZenMonitor.monitor(target) 559 | Helper.await_monitor_established(target_ref, target) 560 | 561 | # Check that the remote is considered compatible 562 | assert :compatible = ZenMonitor.compatibility_for_node(remote) 563 | 564 | # Make the remote incompatible by killing the ZenMonitor running on it 565 | assert :ok = :rpc.call(remote, Application, :stop, [:zen_monitor]) 566 | 567 | # Perform an additional monitor 568 | other_ref = ZenMonitor.monitor(other) 569 | 570 | Helper.await_monitor_cleared(target_ref, target) 571 | 572 | # Assert that we get notified for both monitored processes 573 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}} 574 | assert_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, :nodedown}} 575 | 576 | # Check that the remote is no longer considered compatible 577 | assert :incompatible = ZenMonitor.compatibility_for_node(remote) 578 | end 579 | end 580 | 581 | describe "Monitoring has process-level multi-tenancy" do 582 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 583 | 584 | test "only the down process sends a :DOWN message", ctx do 585 | target = ctx.compatible_pid() 586 | other = ctx.compatible_pid_b() 587 | 588 | # Monitor both remote processes 589 | target_ref = ZenMonitor.monitor(target) 590 | other_ref = ZenMonitor.monitor(other) 591 | Helper.await_monitor_established(target_ref, target) 592 | Helper.await_monitor_established(other_ref, other) 593 | 594 | # Kill the target process 595 | Process.exit(target, :kill) 596 | Helper.await_monitor_cleared(target_ref, target) 597 | 598 | # Assert that we receive a :DOWN for the target 599 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, _}} 600 | 601 | # Assert that we do not receive a :DOWN for the other process 602 | refute_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, _}} 603 | end 604 | 605 | test "only the already down process sends a :DOWN message", ctx do 606 | target = ctx.compatible_pid() 607 | other = ctx.compatible_pid_b() 608 | 609 | # Kill the target process 610 | Process.exit(target, :kill) 611 | 612 | # Monitor both remote processes 613 | target_ref = ZenMonitor.monitor(target) 614 | other_ref = ZenMonitor.monitor(other) 615 | 616 | # Assert that we receive a :DOWN for the target 617 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, _}} 618 | 619 | # Assert that we do not receive a
:DOWN for the other process 620 | refute_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, _}} 621 | end 622 | end 623 | 624 | describe "Demonitor" do 625 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 626 | 627 | test "prevents :DOWN from being delivered", ctx do 628 | target = ctx.compatible_pid() 629 | 630 | # Monitor the remote process 631 | ref = ZenMonitor.monitor(target) 632 | 633 | # Demonitor the reference 634 | ZenMonitor.demonitor(ref) 635 | 636 | # Kill the process 637 | Process.exit(target, :kill) 638 | 639 | # Assert that nothing was delivered 640 | refute_receive {:DOWN, ^ref, :process, ^target, _} 641 | end 642 | 643 | test ":DOWN sent before demonitor still exists", ctx do 644 | target = ctx.compatible_pid() 645 | 646 | # Monitor the remote process 647 | ref = ZenMonitor.monitor(target) 648 | Helper.await_monitor_established(ref, target) 649 | 650 | # Kill the remote process 651 | Process.exit(target, :kill) 652 | Helper.await_monitor_cleared(ref, target) 653 | 654 | # Demonitor the reference 655 | ZenMonitor.demonitor(ref) 656 | 657 | # Assert that a down message had already been received 658 | assert_received {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}} 659 | end 660 | 661 | test "only affects the demonitored reference", ctx do 662 | target = ctx.compatible_pid() 663 | 664 | # Monitor the remote process twice 665 | ref_to_demonitor = ZenMonitor.monitor(target) 666 | ref_to_keep = ZenMonitor.monitor(target) 667 | Helper.await_monitors_established([ref_to_demonitor, ref_to_keep], target) 668 | 669 | # Demonitor one of the references 670 | ZenMonitor.demonitor(ref_to_demonitor) 671 | 672 | # Kill the remote process 673 | Process.exit(target, :kill) 674 | Helper.await_monitor_cleared(ref_to_keep, target) 675 | 676 | # Assert that the monitor that was not demonitored fired 677 | assert_receive {:DOWN, ^ref_to_keep, :process, ^target, {:zen_monitor, _}} 678 | 679 | # Assert that the demonitored monitor did not fire 680 | refute_receive {:DOWN, ^ref_to_demonitor, :process, ^target, _} 681 | end 682 | end 683 | 684 | describe "Demonitor Flush" do 685 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 686 | 687 | test "prevents :DOWN from being delivered", ctx do 688 | target = ctx.compatible_pid() 689 | 690 | # Monitor the remote process 691 | ref = ZenMonitor.monitor(target) 692 | Helper.await_monitor_established(ref, target) 693 | 694 | # Demonitor the reference 695 | ZenMonitor.demonitor(ref, [:flush]) 696 | 697 | # Kill the process 698 | Process.exit(target, :kill) 699 | Helper.await_monitor_cleared(ref, target) 700 | 701 | # Assert that nothing was delivered 702 | refute_receive {:DOWN, ^ref, :process, ^target, _} 703 | end 704 | 705 | test ":DOWN sent before demonitor will be consumed by the flush", ctx do 706 | target = ctx.compatible_pid() 707 | 708 | # Monitor the remote process 709 | ref = ZenMonitor.monitor(target) 710 | Helper.await_monitor_established(ref, target) 711 | 712 | # Kill the remote process 713 | Process.exit(target, :kill) 714 | Helper.await_monitor_cleared(ref, target) 715 | 716 | # Demonitor the reference 717 | ZenMonitor.demonitor(ref, [:flush]) 718 | 719 | # Assert that no down message has been received 720 | refute_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}} 721 | end 722 | 723 | test ":flush only removes the flushed reference", ctx do 724 | target = ctx.compatible_pid() 725 | 726 | # Monitor the remote process three times 727 | ref_to_flush =
ZenMonitor.monitor(target) 728 | ref_to_demonitor = ZenMonitor.monitor(target) 729 | ref_to_keep = ZenMonitor.monitor(target) 730 | Helper.await_monitors_established([ref_to_flush, ref_to_demonitor, ref_to_keep], target) 731 | 732 | # Kill the remote process 733 | Process.exit(target, :kill) 734 | Helper.await_monitors_cleared([ref_to_flush, ref_to_demonitor, ref_to_keep], target) 735 | 736 | # Flush one of the references 737 | ZenMonitor.demonitor(ref_to_flush, [:flush]) 738 | 739 | # Demonitor one of the references 740 | ZenMonitor.demonitor(ref_to_demonitor) 741 | 742 | # Assert that the monitor that was not demonitored fired 743 | assert_receive {:DOWN, ^ref_to_keep, :process, ^target, {:zen_monitor, _}} 744 | 745 | # Assert that the demonitored non-flush monitor fired 746 | assert_receive {:DOWN, ^ref_to_demonitor, :process, ^target, {:zen_monitor, _}} 747 | 748 | # Assert that the demonitored and flushed monitor did not fire 749 | refute_receive {:DOWN, ^ref_to_flush, :process, ^target, _} 750 | end 751 | end 752 | 753 | describe "Compatibility For Node" do 754 | setup [:start_compatible_remote, :start_incompatible_remote] 755 | 756 | test "when remote is compatible", ctx do 757 | assert :compatible = ZenMonitor.connect(ctx.compatible) 758 | assert :compatible = ZenMonitor.compatibility_for_node(ctx.compatible) 759 | end 760 | 761 | test "when remote is incompatible", ctx do 762 | assert :incompatible = ZenMonitor.connect(ctx.incompatible) 763 | assert :incompatible = ZenMonitor.compatibility_for_node(ctx.incompatible) 764 | end 765 | 766 | test "when remote is down", ctx do 767 | assert :incompatible = ZenMonitor.connect(ctx.down) 768 | assert :incompatible = ZenMonitor.compatibility_for_node(ctx.down) 769 | end 770 | end 771 | end 772 | -------------------------------------------------------------------------------- /test/local/connector_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Connector.Test do 2 | use ExUnit.Case 3 | 4 | alias ZenMonitor.Local.{Connector, Dispatcher} 5 | alias ZenMonitor.Proxy.Batcher 6 | alias ZenMonitor.Test.Support.ObservableGen 7 | 8 | setup do 9 | {:ok, compatible, nil} = ChildNode.start_link(:zen_monitor, :Compatible) 10 | {:ok, incompatible, nil} = ChildNode.start_link(:elixir, :Incompatible) 11 | 12 | start_supervised(ZenMonitor.Supervisor) 13 | 14 | on_exit(fn -> 15 | Node.monitor(compatible, true) 16 | Node.monitor(incompatible, true) 17 | 18 | receive do 19 | {:nodedown, ^compatible} -> :ok 20 | end 21 | 22 | receive do 23 | {:nodedown, ^incompatible} -> :ok 24 | end 25 | end) 26 | 27 | {:ok, compatible: compatible, incompatible: incompatible, down: :down@down} 28 | end 29 | 30 | def disable_sweep(_) do 31 | # Set sweep interval to 1 minute (effectively disable for this describe block) 32 | original_sweep_interval = Connector.sweep_interval() 33 | Connector.sweep_interval(60_000) 34 | 35 | on_exit(fn -> 36 | Connector.sweep_interval(original_sweep_interval) 37 | end) 38 | 39 | :ok 40 | end 41 | 42 | def reduce_chunk_size(_) do 43 | # Set chunk size to 2 for testing convenience 44 | original_chunk_size = Connector.chunk_size() 45 | Connector.chunk_size(2) 46 | 47 | on_exit(fn -> 48 | Connector.chunk_size(original_chunk_size) 49 | end) 50 | 51 | :ok 52 | end 53 | 54 | def start_remote_process(ctx) do 55 | compatible_pid = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 56 | compatible_pid_b = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 57 | 
compatible_pid_c = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 58 | 59 | incompatible_pid = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 60 | incompatible_pid_b = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 61 | incompatible_pid_c = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 62 | 63 | { 64 | :ok, 65 | compatible_pid: compatible_pid, 66 | compatible_pid_b: compatible_pid_b, 67 | compatible_pid_c: compatible_pid_c, 68 | incompatible_pid: incompatible_pid, 69 | incompatible_pid_b: incompatible_pid_b, 70 | incompatible_pid_c: incompatible_pid_c 71 | } 72 | end 73 | 74 | def observe_gen(_) do 75 | # Start up an observer 76 | {:ok, observer} = ObservableGen.start_link(self()) 77 | 78 | # Replace the original rpc_module with the ObservableRPC 79 | original_gen_module = ZenMonitor.gen_module() 80 | ZenMonitor.gen_module(ObservableGen) 81 | 82 | on_exit(fn -> 83 | ZenMonitor.gen_module(original_gen_module) 84 | end) 85 | 86 | {:ok, observer: observer} 87 | end 88 | 89 | def observe_zen_monitor(_) do 90 | Process.unregister(ZenMonitor.Local) 91 | Process.register(self(), ZenMonitor.Local) 92 | :ok 93 | end 94 | 95 | @doc """ 96 | Reduces the intervals for all the batching parts of ZenMonitor so that the default 97 | assert_receive / refute_receive timeouts are an order of magnitude larger. 98 | """ 99 | def fast_zen_monitor(ctx) do 100 | # Tune the local dispatcher 101 | original_demand_interval = Dispatcher.demand_interval() 102 | Dispatcher.demand_interval(10) 103 | 104 | # Tune the local connector 105 | original_connector_interval = Connector.sweep_interval() 106 | Connector.sweep_interval(10) 107 | 108 | # Tune the remote batchers 109 | original_batch_intervals = 110 | Enum.map([node(), ctx.compatible], fn remote -> 111 | original = :rpc.call(remote, Batcher, :sweep_interval, []) 112 | :rpc.call(remote, Batcher, :sweep_interval, [10]) 113 | {remote, original} 114 | end) 115 | 116 | on_exit(fn -> 117 | # Restore the local settings 118 | Dispatcher.demand_interval(original_demand_interval) 119 | Connector.sweep_interval(original_connector_interval) 120 | 121 | # Restore the remote settings 122 | Enum.each(original_batch_intervals, fn {remote, original} -> 123 | :rpc.call(remote, Batcher, :sweep_interval, [original]) 124 | end) 125 | end) 126 | 127 | :ok 128 | end 129 | 130 | describe "Getting a connector" do 131 | test "get connector for compatible remote node", ctx do 132 | connector = Connector.get_for_node(ctx.compatible) 133 | assert Process.alive?(connector) 134 | end 135 | 136 | test "get connector for incompatible remote node", ctx do 137 | connector = Connector.get_for_node(ctx.incompatible) 138 | assert Process.alive?(connector) 139 | end 140 | 141 | test "multiple gets return the same connector", ctx do 142 | connector_a = Connector.get_for_node(ctx.compatible) 143 | connector_b = Connector.get_for_node(ctx.compatible) 144 | 145 | assert connector_a == connector_b 146 | end 147 | 148 | test "new connector after connector is killed", ctx do 149 | original = Connector.get_for_node(ctx.compatible) 150 | assert Process.alive?(original) 151 | 152 | Process.exit(original, :kill) 153 | refute Process.alive?(original) 154 | 155 | replacement = Connector.get_for_node(ctx.compatible) 156 | 157 | replacement = if replacement == original do 158 | Process.sleep(50) 159 | Connector.get_for_node(ctx.compatible) 160 | else 161 | replacement 162 | end 163 | 164 | assert Process.alive?(replacement) 165 | 166 | assert original != replacement 167 | 
end 168 | 169 | test "each remote has its own connector", ctx do 170 | connector_a = Connector.get_for_node(ctx.compatible) 171 | connector_b = Connector.get_for_node(ctx.incompatible) 172 | 173 | assert connector_a != connector_b 174 | end 175 | end 176 | 177 | describe "Performing a connect" do 178 | setup [:observe_gen] 179 | 180 | test "connecting to a compatible remote node", ctx do 181 | compatible = ctx.compatible() 182 | 183 | assert :compatible = Connector.connect(compatible) 184 | assert_receive {:observe, :call, {ZenMonitor.Proxy, ^compatible}, :ping, _} 185 | end 186 | 187 | test "connecting to an incompatible remote node", ctx do 188 | incompatible = ctx.incompatible 189 | 190 | assert :incompatible = Connector.connect(incompatible) 191 | assert_receive {:observe, :call, {ZenMonitor.Proxy, ^incompatible}, :ping, _} 192 | end 193 | 194 | test "connecting to a down remote node", ctx do 195 | down = ctx.down() 196 | 197 | assert :incompatible = Connector.connect(down) 198 | refute_receive {:observe, :call, _, _, _} 199 | end 200 | end 201 | 202 | describe "Connect status caching" do 203 | setup [:disable_sweep] 204 | 205 | test "miss when never connected", ctx do 206 | assert :miss = Connector.cached_compatibility(ctx.compatible) 207 | end 208 | 209 | test "compatible hit after successful connection", ctx do 210 | assert :miss = Connector.cached_compatibility(ctx.compatible) 211 | assert :compatible = Connector.connect(ctx.compatible) 212 | assert :compatible = Connector.cached_compatibility(ctx.compatible) 213 | end 214 | 215 | test "incompatible hit after unsuccessful connection", ctx do 216 | assert :miss = Connector.cached_compatibility(ctx.incompatible) 217 | assert :incompatible = Connector.connect(ctx.incompatible) 218 | assert :incompatible = Connector.cached_compatibility(ctx.incompatible) 219 | end 220 | 221 | test "incompatible cache entries expire", ctx do 222 | assert :miss = Connector.cached_compatibility(ctx.incompatible) 223 | assert :incompatible = Connector.connect(ctx.incompatible) 224 | assert :incompatible = Connector.cached_compatibility(ctx.incompatible) 225 | 226 | assert Helper.wait_until(fn -> 227 | {:expired, _} = Connector.cached_compatibility(ctx.incompatible) 228 | true 229 | end) 230 | end 231 | 232 | test "remote node crash causes an unavailable cache", ctx do 233 | assert :miss = Connector.cached_compatibility(ctx.compatible) 234 | assert :compatible = Connector.connect(ctx.compatible) 235 | assert :ok = :slave.stop(ctx.compatible) 236 | 237 | assert Helper.wait_until(fn -> 238 | Connector.cached_compatibility(ctx.compatible) == :unavailable 239 | end) 240 | end 241 | end 242 | 243 | describe "Compatibility checking" do 244 | test "all nodes start off incompatible", ctx do 245 | assert :incompatible = Connector.compatibility(ctx.compatible) 246 | assert :incompatible = Connector.compatibility(ctx.incompatible) 247 | assert :incompatible = Connector.compatibility(ctx.down) 248 | end 249 | 250 | test "after connecting to a compatible node it becomes compatible", ctx do 251 | remote = ctx.compatible() 252 | 253 | assert :incompatible = Connector.compatibility(remote) 254 | assert :compatible = Connector.connect(remote) 255 | assert :compatible = Connector.compatibility(remote) 256 | end 257 | 258 | test "after connecting to an incompatible node it remains incompatible", ctx do 259 | remote = ctx.incompatible() 260 | 261 | assert :incompatible = Connector.compatibility(remote) 262 | assert :incompatible = Connector.connect(remote) 263 | assert 
:incompatible = Connector.compatibility(remote) 264 | end 265 | 266 | test "after connecting to a down node it remains incompatible", ctx do 267 | remote = ctx.down() 268 | 269 | assert :incompatible = Connector.compatibility(remote) 270 | assert :incompatible = Connector.connect(remote) 271 | assert :incompatible = Connector.compatibility(remote) 272 | end 273 | end 274 | 275 | describe "Monitoring a remote process (local bookkeeping)" do 276 | setup [:disable_sweep, :start_remote_process] 277 | 278 | test "unmonitored pid is added to queue", ctx do 279 | ref = make_ref() 280 | 281 | connector = Connector.get_for_node(ctx.compatible) 282 | initial_state = :sys.get_state(connector) 283 | 284 | assert initial_state.length == 0 285 | assert :queue.len(initial_state.batch) == 0 286 | 287 | Connector.monitor(ctx.compatible_pid, ref, self()) 288 | 289 | # This assertion isn't actually needed, but since monitor is async, this is an easy way to 290 | # check if the operation has completed 291 | assert :compatible = Connector.connect(ctx.compatible) 292 | 293 | updated_state = :sys.get_state(connector) 294 | 295 | expected_pid = ctx.compatible_pid 296 | assert updated_state.length == 1 297 | assert :queue.len(updated_state.batch) == 1 298 | assert {:value, {:subscribe, ^expected_pid}} = :queue.peek(updated_state.batch) 299 | end 300 | 301 | test "already monitored pid is not added to the queue", ctx do 302 | ref_1 = make_ref() 303 | ref_2 = make_ref() 304 | 305 | connector = Connector.get_for_node(ctx.compatible) 306 | initial_state = :sys.get_state(connector) 307 | 308 | assert initial_state.length == 0 309 | assert :queue.len(initial_state.batch) == 0 310 | 311 | # Monitor the same pid twice 312 | Connector.monitor(ctx.compatible_pid, ref_1, self()) 313 | Connector.monitor(ctx.compatible_pid, ref_2, self()) 314 | 315 | # This assertion isn't actually needed, but since monitor is async, this is an easy way to 316 | # check if the operation has completed 317 | assert :compatible = Connector.connect(ctx.compatible) 318 | 319 | updated_state = :sys.get_state(connector) 320 | 321 | expected_pid = ctx.compatible_pid 322 | assert updated_state.length == 1 323 | assert :queue.len(updated_state.batch) == 1 324 | assert {:value, {:subscribe, ^expected_pid}} = :queue.peek(updated_state.batch) 325 | end 326 | end 327 | 328 | describe "Demonitoring a process" do 329 | setup [:observe_zen_monitor, :disable_sweep, :start_remote_process, :fast_zen_monitor] 330 | 331 | test "works on unknown process / ref", ctx do 332 | assert :ok = Connector.demonitor(ctx.compatible_pid, make_ref()) 333 | end 334 | 335 | test "doesn't send down", ctx do 336 | reference = make_ref() 337 | target = ctx.compatible_pid 338 | connector = Connector.get_for_node(ctx.compatible) 339 | 340 | # Monitor the target 341 | Connector.monitor(target, reference, self()) 342 | 343 | # Force a sweep 344 | send(connector, :sweep) 345 | 346 | # Demonitor the target 347 | Connector.demonitor(target, reference) 348 | 349 | # Kill the target 350 | Process.exit(target, :kill) 351 | 352 | # Assert that no dispatches are sent for the target 353 | refute_receive _ 354 | end 355 | 356 | test "is isolated to the demonitored process only", ctx do 357 | subscriber = self() 358 | target_reference = make_ref() 359 | other_reference = make_ref() 360 | target = ctx.compatible_pid 361 | other = ctx.compatible_pid_b 362 | connector = Connector.get_for_node(ctx.compatible) 363 | 364 | # Monitor both processes 365 | Connector.monitor(target, target_reference, 
subscriber) 366 | Connector.monitor(other, other_reference, subscriber) 367 | 368 | # Force a sweep 369 | send(connector, :sweep) 370 | 371 | # Demonitor the target 372 | Connector.demonitor(target, target_reference) 373 | 374 | # Kill both processes 375 | Process.exit(target, :kill) 376 | Process.exit(other, :kill) 377 | 378 | # Assert that a dispatch is enqueued for other only 379 | assert_receive {:"$gen_cast", 380 | {:enqueue, 381 | [ 382 | {^subscriber, 383 | {:DOWN, ^other_reference, :process, ^other, {:zen_monitor, _}}} 384 | ]}} 385 | end 386 | 387 | test "incorrect reference does nothing", ctx do 388 | subscriber = self() 389 | right_reference = make_ref() 390 | wrong_reference = make_ref() 391 | target = ctx.compatible_pid 392 | connector = Connector.get_for_node(ctx.compatible) 393 | 394 | # Monitor the target 395 | Connector.monitor(target, right_reference, subscriber) 396 | 397 | # Force a sweep 398 | send(connector, :sweep) 399 | 400 | # Demonitor but with the right target / wrong reference 401 | Connector.demonitor(target, wrong_reference) 402 | 403 | # Kill the target 404 | Process.exit(target, :kill) 405 | 406 | # Assert that a dispatch is still enqueued 407 | assert_receive {:"$gen_cast", 408 | {:enqueue, 409 | [ 410 | {^subscriber, 411 | {:DOWN, ^right_reference, :process, ^target, {:zen_monitor, _}}} 412 | ]}} 413 | end 414 | 415 | test "incorrect pid does nothing", ctx do 416 | subscriber = self() 417 | reference = make_ref() 418 | right_target = ctx.compatible_pid 419 | wrong_target = ctx.compatible_pid_b 420 | connector = Connector.get_for_node(ctx.compatible) 421 | 422 | # Monitor the target 423 | Connector.monitor(right_target, reference, subscriber) 424 | 425 | # Force a sweep 426 | send(connector, :sweep) 427 | 428 | # Demonitor but with the wrong target / right reference 429 | Connector.demonitor(wrong_target, reference) 430 | 431 | # Kill the target 432 | Process.exit(right_target, :kill) 433 | 434 | # Assert that a dispatch is still enqueued 435 | assert_receive {:"$gen_cast", 436 | {:enqueue, 437 | [ 438 | {^subscriber, 439 | {:DOWN, ^reference, :process, ^right_target, {:zen_monitor, _}}} 440 | ]}} 441 | end 442 | 443 | test "demonitoring the only monitor adds an unsubscribe to the queue", ctx do 444 | subscriber = self() 445 | reference = make_ref() 446 | target = ctx.compatible_pid 447 | connector = Connector.get_for_node(ctx.compatible) 448 | 449 | # Monitor the target 450 | Connector.monitor(target, reference, subscriber) 451 | 452 | # Check the monitor state 453 | monitor_state = :sys.get_state(connector) 454 | assert monitor_state.length == 1 455 | assert :queue.len(monitor_state.batch) == 1 456 | assert {:value, {:subscribe, ^target}} = :queue.peek(monitor_state.batch) 457 | 458 | # Force a sweep 459 | send(connector, :sweep) 460 | 461 | # Demonitor the target 462 | Connector.demonitor(target, reference) 463 | 464 | # Check the demonitor state 465 | demonitor_state = :sys.get_state(connector) 466 | assert demonitor_state.length == 1 467 | assert :queue.len(demonitor_state.batch) == 1 468 | assert {:value, {:unsubscribe, ^target}} = :queue.peek(demonitor_state.batch) 469 | end 470 | 471 | test "demonitoring one of many monitors does not add an unsubscribe to the queue", ctx do 472 | subscriber = self() 473 | reference = make_ref() 474 | other_reference = make_ref() 475 | target = ctx.compatible_pid 476 | connector = Connector.get_for_node(ctx.compatible) 477 | 478 | # Monitor the target multiple times 479 | Connector.monitor(target, reference, 
subscriber) 480 | Connector.monitor(target, other_reference, subscriber) 481 | 482 | # Check the monitor state 483 | monitor_state = :sys.get_state(connector) 484 | assert monitor_state.length == 1 485 | assert :queue.len(monitor_state.batch) == 1 486 | assert {:value, {:subscribe, ^target}} = :queue.peek(monitor_state.batch) 487 | 488 | # Force a sweep 489 | send(connector, :sweep) 490 | 491 | # Demonitor one of the references 492 | Connector.demonitor(target, reference) 493 | 494 | # Check the demonitor state 495 | demonitor_state = :sys.get_state(connector) 496 | assert demonitor_state.length == 0 497 | assert :queue.len(demonitor_state.batch) == 0 498 | end 499 | 500 | test "demonitoring the last of many monitors adds an unsubscribe to the queue", ctx do 501 | subscriber = self() 502 | reference = make_ref() 503 | other_reference = make_ref() 504 | target = ctx.compatible_pid 505 | connector = Connector.get_for_node(ctx.compatible) 506 | 507 | # Monitor the target multiple times 508 | Connector.monitor(target, reference, subscriber) 509 | Connector.monitor(target, other_reference, subscriber) 510 | 511 | # Check the monitor state 512 | monitor_state = :sys.get_state(connector) 513 | assert monitor_state.length == 1 514 | assert :queue.len(monitor_state.batch) == 1 515 | assert {:value, {:subscribe, ^target}} = :queue.peek(monitor_state.batch) 516 | 517 | # Force a sweep 518 | send(connector, :sweep) 519 | 520 | # Demonitor all of the references 521 | Connector.demonitor(target, reference) 522 | Connector.demonitor(target, other_reference) 523 | 524 | # Check the demonitor state 525 | demonitor_state = :sys.get_state(connector) 526 | assert demonitor_state.length == 1 527 | assert :queue.len(demonitor_state.batch) == 1 528 | assert {:value, {:unsubscribe, ^target}} = :queue.peek(demonitor_state.batch) 529 | end 530 | end 531 | 532 | describe "Handles nodedown" do 533 | setup [:fast_zen_monitor, :observe_zen_monitor, :disable_sweep, :start_remote_process] 534 | 535 | test "marks the node as unavailable", ctx do 536 | assert :compatible = Connector.connect(ctx.compatible) 537 | 538 | # Stop the node 539 | :slave.stop(ctx.compatible) 540 | 541 | # Assert that it becomes incompatible 542 | assert Helper.wait_until(fn -> 543 | Connector.cached_compatibility(ctx.compatible) == :unavailable 544 | end) 545 | end 546 | 547 | test "fires all monitors", ctx do 548 | subscriber = self() 549 | target_ref_a_1 = make_ref() 550 | target_ref_a_2 = make_ref() 551 | target_ref_b_1 = make_ref() 552 | target_ref_b_2 = make_ref() 553 | target_ref_c_1 = make_ref() 554 | target_ref_c_2 = make_ref() 555 | target_a = ctx.compatible_pid 556 | target_b = ctx.compatible_pid_b 557 | target_c = ctx.compatible_pid_c 558 | connector = Connector.get_for_node(ctx.compatible) 559 | 560 | # Make some monitors 561 | Connector.monitor(target_a, target_ref_a_1, subscriber) 562 | Connector.monitor(target_a, target_ref_a_2, subscriber) 563 | Connector.monitor(target_b, target_ref_b_1, subscriber) 564 | Connector.monitor(target_b, target_ref_b_2, subscriber) 565 | Connector.monitor(target_c, target_ref_c_1, subscriber) 566 | Connector.monitor(target_c, target_ref_c_2, subscriber) 567 | 568 | # Force a sweep 569 | send(connector, :sweep) 570 | 571 | # Wait for the monitors to establish 572 | Process.sleep(50) 573 | 574 | # Stop the node 575 | :slave.stop(ctx.compatible) 576 | 577 | # Assert that all the expected messages get enqueued 578 | assert_receive {:"$gen_cast", 579 | {:enqueue, 580 | [ 581 | {^subscriber, 582 | {:DOWN, 
^target_ref_a_1, :process, ^target_a, {:zen_monitor, :nodedown}}}, 583 | {^subscriber, 584 | {:DOWN, ^target_ref_a_2, :process, ^target_a, {:zen_monitor, :nodedown}}}, 585 | {^subscriber, 586 | {:DOWN, ^target_ref_b_1, :process, ^target_b, {:zen_monitor, :nodedown}}}, 587 | {^subscriber, 588 | {:DOWN, ^target_ref_b_2, :process, ^target_b, {:zen_monitor, :nodedown}}}, 589 | {^subscriber, 590 | {:DOWN, ^target_ref_c_1, :process, ^target_c, {:zen_monitor, :nodedown}}}, 591 | {^subscriber, 592 | {:DOWN, ^target_ref_c_2, :process, ^target_c, {:zen_monitor, :nodedown}}} 593 | ]}} 594 | end 595 | end 596 | 597 | describe "Handles summaries" do 598 | setup [:fast_zen_monitor, :observe_zen_monitor, :disable_sweep, :start_remote_process] 599 | 600 | test "empty summary does nothing", ctx do 601 | connector = Connector.get_for_node(ctx.compatible) 602 | 603 | # Send the connector an empty list of death certificates 604 | send(connector, {:dead, ctx.compatible, []}) 605 | 606 | # Assert that ZenMonitor.Local doesn't receive an enqueue 607 | refute_receive _ 608 | end 609 | 610 | test "unmonitored pids does nothing", ctx do 611 | connector = Connector.get_for_node(ctx.compatible) 612 | 613 | # Send some unmonitored pids 614 | send( 615 | connector, 616 | {:dead, ctx.compatible, 617 | [ 618 | {ctx.compatible_pid, :test_a}, 619 | {ctx.compatible_pid_b, :test_b}, 620 | {ctx.compatible_pid_c, :test_c} 621 | ]} 622 | ) 623 | 624 | # Assert that ZenMonitor.Local doesn't receive an enqueue 625 | refute_receive _ 626 | end 627 | 628 | test "monitored pids get enqueued", ctx do 629 | subscriber = self() 630 | reference_a = make_ref() 631 | reference_b = make_ref() 632 | reference_c = make_ref() 633 | target_a = ctx.compatible_pid 634 | target_b = ctx.compatible_pid_b 635 | target_c = ctx.compatible_pid_c 636 | connector = Connector.get_for_node(ctx.compatible) 637 | 638 | # Monitor some pids 639 | Connector.monitor(target_a, reference_a, subscriber) 640 | Connector.monitor(target_b, reference_b, subscriber) 641 | Connector.monitor(target_c, reference_c, subscriber) 642 | 643 | # Send the monitored pids in the summary 644 | send( 645 | connector, 646 | {:dead, ctx.compatible, [{target_a, :test_a}, {target_b, :test_b}, {target_c, :test_c}]} 647 | ) 648 | 649 | # Assert that ZenMonitor.Local receives an enqueue 650 | assert_receive {:"$gen_cast", 651 | {:enqueue, 652 | [ 653 | {^subscriber, 654 | {:DOWN, ^reference_a, :process, ^target_a, {:zen_monitor, :test_a}}}, 655 | {^subscriber, 656 | {:DOWN, ^reference_b, :process, ^target_b, {:zen_monitor, :test_b}}}, 657 | {^subscriber, 658 | {:DOWN, ^reference_c, :process, ^target_c, {:zen_monitor, :test_c}}} 659 | ]}} 660 | end 661 | 662 | test "mixed pids, monitored enqueue, unmonitored ignored", ctx do 663 | subscriber = self() 664 | reference_a = make_ref() 665 | reference_c = make_ref() 666 | target_a = ctx.compatible_pid 667 | target_b = ctx.compatible_pid_b 668 | target_c = ctx.compatible_pid_c 669 | connector = Connector.get_for_node(ctx.compatible) 670 | 671 | # Monitor some pids (intentionally skip target_b) 672 | Connector.monitor(target_a, reference_a, subscriber) 673 | Connector.monitor(target_c, reference_c, subscriber) 674 | 675 | # Send the mixed pids in the summary 676 | send( 677 | connector, 678 | {:dead, ctx.compatible, [{target_a, :test_a}, {target_b, :test_b}, {target_c, :test_c}]} 679 | ) 680 | 681 | # Assert that ZenMonitor.Local receives an enqueue 682 | assert_receive {:"$gen_cast", 683 | {:enqueue, 684 | [ 685 | {^subscriber, 686 | {:DOWN, 
^reference_a, :process, ^target_a, {:zen_monitor, :test_a}}}, 687 | {^subscriber, 688 | {:DOWN, ^reference_c, :process, ^target_c, {:zen_monitor, :test_c}}} 689 | ]}} 690 | 691 | # Assert that no other messages arrive 692 | refute_receive _ 693 | end 694 | end 695 | 696 | describe "Periodic Sweep to compatible remote" do 697 | setup [:observe_gen, :disable_sweep, :reduce_chunk_size, :start_remote_process] 698 | 699 | test "sweep does not send a subscription if there are no newly monitored pids", ctx do 700 | connector = Connector.get_for_node(ctx.compatible) 701 | 702 | # Force the connector to sweep nothing 703 | send(connector, :sweep) 704 | 705 | # Assert that no subscription is sent because nothing is pending 706 | refute_receive {:observe, :cast, _, _} 707 | end 708 | 709 | test "sweep sends the monitored pids since the last sweep", ctx do 710 | target = ctx.compatible_pid 711 | remote = ctx.compatible 712 | connector = Connector.get_for_node(remote) 713 | 714 | # Monitor the target pid 715 | Connector.monitor(target, make_ref(), self()) 716 | 717 | # Force a sweep 718 | send(connector, :sweep) 719 | 720 | assert_receive { 721 | :observe, 722 | :cast, 723 | {ZenMonitor.Proxy, ^remote}, 724 | {:process, ^connector, [{:subscribe, ^target}]} 725 | } 726 | end 727 | 728 | test "sweep ignores already monitored pids on subsequent sweeps", ctx do 729 | target = ctx.compatible_pid 730 | remote = ctx.compatible 731 | connector = Connector.get_for_node(remote) 732 | 733 | # Monitor the target pid 734 | Connector.monitor(target, make_ref(), self()) 735 | 736 | # Force a sweep 737 | send(connector, :sweep) 738 | 739 | # Flush out the initial message 740 | assert_receive { 741 | :observe, 742 | :cast, 743 | {ZenMonitor.Proxy, ^remote}, 744 | {:process, ^connector, [{:subscribe, ^target}]} 745 | } 746 | 747 | # Monitor the target pid again 748 | Connector.monitor(target, make_ref(), self()) 749 | 750 | # Force another sweep 751 | send(connector, :sweep) 752 | 753 | # Assert that no additional subscriptions are sent 754 | refute_receive {:observe, :cast, _, _} 755 | end 756 | 757 | test "sweep transmits pids in the order received", ctx do 758 | first = ctx.compatible_pid 759 | second = ctx.compatible_pid_b 760 | remote = ctx.compatible 761 | connector = Connector.get_for_node(remote) 762 | 763 | # Monitor the targets 764 | Connector.monitor(first, make_ref(), self()) 765 | Connector.monitor(second, make_ref(), self()) 766 | 767 | # Force a sweep 768 | send(connector, :sweep) 769 | 770 | # Assert that we got a subscription in the correct order (first, second) 771 | assert_receive { 772 | :observe, 773 | :cast, 774 | {ZenMonitor.Proxy, ^remote}, 775 | {:process, ^connector, [{:subscribe, ^first}, {:subscribe, ^second}]} 776 | } 777 | end 778 | 779 | test "sweep will only transmit the requested chunk size", ctx do 780 | target_a = ctx.compatible_pid() 781 | target_b = ctx.compatible_pid_b() 782 | target_c = ctx.compatible_pid_c() 783 | remote = ctx.compatible() 784 | connector = Connector.get_for_node(remote) 785 | 786 | # Monitor all targets 787 | Connector.monitor(target_a, make_ref(), self()) 788 | Connector.monitor(target_b, make_ref(), self()) 789 | Connector.monitor(target_c, make_ref(), self()) 790 | 791 | # Force a sweep 792 | send(connector, :sweep) 793 | 794 | # Assert that we got a subscription for the first chunk (target_a, target_b) 795 | assert_receive { 796 | :observe, 797 | :cast, 798 | {ZenMonitor.Proxy, ^remote}, 799 | {:process, ^connector, [{:subscribe, ^target_a}, {:subscribe,
^target_b}]} 800 | } 801 | 802 | # Force another sweep 803 | send(connector, :sweep) 804 | 805 | # Assert that we got a subscription for the second chunk (target_c) 806 | assert_receive { 807 | :observe, 808 | :cast, 809 | {ZenMonitor.Proxy, ^remote}, 810 | {:process, ^connector, [{:subscribe, ^target_c}]} 811 | } 812 | end 813 | end 814 | 815 | describe "Periodic Sweep to incompatible remote" do 816 | setup [:observe_zen_monitor, :disable_sweep, :reduce_chunk_size, :start_remote_process] 817 | 818 | test "sweep does not send a message if there are no newly monitored pids", ctx do 819 | connector = Connector.get_for_node(ctx.incompatible) 820 | 821 | # Force the connector to sweep nothing 822 | send(connector, :sweep) 823 | 824 | # Assert that no dead message is sent 825 | refute_receive {:"$gen_cast", {:enqueue, _}} 826 | end 827 | 828 | test "sweep sends nodedown for incompatible remote", ctx do 829 | subscriber = self() 830 | reference = make_ref() 831 | target = ctx.incompatible_pid 832 | remote = ctx.incompatible 833 | connector = Connector.get_for_node(remote) 834 | 835 | # Monitor the target pid 836 | Connector.monitor(target, reference, subscriber) 837 | 838 | # Force a sweep 839 | send(connector, :sweep) 840 | 841 | # Assert that the message is enqueued with ZenMonitor.Local (via GenServer.cast) 842 | assert_receive { 843 | :"$gen_cast", 844 | { 845 | :enqueue, 846 | [{^subscriber, {:DOWN, ^reference, :process, ^target, {:zen_monitor, :nodedown}}}] 847 | } 848 | } 849 | end 850 | end 851 | end 852 | -------------------------------------------------------------------------------- /test/local/dispatcher_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Dispatcher.Test do 2 | use ExUnit.Case 3 | 4 | alias ZenMonitor.Local.{Connector, Dispatcher} 5 | alias ZenMonitor.Proxy.Batcher 6 | 7 | setup do 8 | {:ok, remote, nil} = ChildNode.start_link(:zen_monitor, :Remote) 9 | 10 | start_supervised(ZenMonitor.Supervisor) 11 | {:ok, _} = Application.ensure_all_started(:instruments) 12 | 13 | on_exit(fn -> 14 | Node.monitor(remote, true) 15 | 16 | receive do 17 | {:nodedown, ^remote} -> :ok 18 | end 19 | end) 20 | 21 | {:ok, remote: remote} 22 | end 23 | 24 | @doc """ 25 | Reduces the intervals for all the batching parts of ZenMonitor so that the default 26 | assert_receive / refute_receive timeouts are an order of magnitude larger.
27 | """ 28 | def fast_zen_monitor(ctx) do 29 | # Tune the local dispatcher 30 | original_demand_interval = Dispatcher.demand_interval() 31 | Dispatcher.demand_interval(10) 32 | 33 | # Tune the local connector 34 | original_connector_interval = Connector.sweep_interval() 35 | Connector.sweep_interval(10) 36 | 37 | # Tune the remote batcher 38 | original_batcher_interval = :rpc.call(ctx.remote, Batcher, :sweep_interval, []) 39 | :ok = :rpc.call(ctx.remote, Batcher, :sweep_interval, [10]) 40 | 41 | on_exit(fn -> 42 | # Restore the local settings 43 | Dispatcher.demand_interval(original_demand_interval) 44 | Connector.sweep_interval(original_connector_interval) 45 | 46 | # Restore the remote settings 47 | :rpc.call(ctx.remote, Batcher, :sweep_interval, [original_batcher_interval]) 48 | end) 49 | 50 | :ok 51 | end 52 | 53 | def start_remote_process(ctx) do 54 | remote_pid = Node.spawn(ctx.remote, Process, :sleep, [:infinity]) 55 | alternate_remote_pid = Node.spawn(ctx.remote, Process, :sleep, [:infinity]) 56 | 57 | {:ok, remote_pid: remote_pid, alternate_remote_pid: alternate_remote_pid} 58 | end 59 | 60 | describe "Event Dispatch" do 61 | setup [:fast_zen_monitor, :start_remote_process] 62 | 63 | test "no messages are sent when there is nothing monitored" do 64 | # Assert that we receive no messages 65 | refute_receive _ 66 | end 67 | 68 | test "no messages are sent when the monitored process is still running", ctx do 69 | # Monitor the remote process 70 | ZenMonitor.monitor(ctx.remote_pid) 71 | 72 | # Assert that we receive no messages 73 | refute_receive _ 74 | end 75 | 76 | test "a message is dispatched when the monitored process dies", ctx do 77 | target = ctx.remote_pid() 78 | 79 | # Monitor the remote process 80 | ref = ZenMonitor.monitor(target) 81 | 82 | # Kill the remote process 83 | Process.exit(target, :kill) 84 | 85 | # Assert delivery of a :DOWN for the killed process 86 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}}, 1000 87 | end 88 | 89 | test "only the dead process gets a message dispatched", ctx do 90 | target = ctx.remote_pid() 91 | alternate = ctx.alternate_remote_pid() 92 | 93 | # Monitor both remote processes 94 | ref = ZenMonitor.monitor(target) 95 | ZenMonitor.monitor(alternate) 96 | 97 | # Kill the target remote process 98 | Process.exit(target, :kill) 99 | 100 | # Assert delivery of a :DOWN for the killed process and nothing else 101 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}}, 1000 102 | refute_receive _ 103 | end 104 | 105 | test "monitoring a dead process should dispatch a :DOWN with :noproc", ctx do 106 | target = ctx.remote_pid() 107 | 108 | # Kill the remote process 109 | Process.exit(target, :kill) 110 | 111 | # Monitor the now dead remote process 112 | ref = ZenMonitor.monitor(target) 113 | 114 | # Assert delivery of a :DOWN :noproc 115 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :noproc}}, 1000 116 | end 117 | 118 | test "monitoring a process after down and dispatched message dispatches another message", 119 | ctx do 120 | target = ctx.remote_pid() 121 | 122 | # Monitor the remote process 123 | ref = ZenMonitor.monitor(target) 124 | 125 | # Kill the target remote process 126 | Process.exit(target, :kill) 127 | 128 | # Assert initial delivery 129 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}}, 1000 130 | 131 | # Re-monitor the remote process 132 | another_ref = ZenMonitor.monitor(target) 133 | 134 | # Assert delivery of a :DOWN :noproc 135 | assert_receive {:DOWN, 
^another_ref, :process, ^target, {:zen_monitor, :noproc}}, 1000 136 | end 137 | 138 | test "all monitored processes get delivered at nodedown", ctx do 139 | target = ctx.remote_pid() 140 | alternate = ctx.alternate_remote_pid() 141 | 142 | # Monitor both remote processes 143 | target_ref = ZenMonitor.monitor(target) 144 | alternate_ref = ZenMonitor.monitor(alternate) 145 | 146 | # Wait for the monitors to get dispatched to the remote 147 | Process.sleep(50) 148 | 149 | # Kill the remote node 150 | :slave.stop(ctx.remote) 151 | 152 | # Assert delivery of both :DOWN :nodedown messages 153 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}}, 1000 154 | 155 | assert_receive {:DOWN, ^alternate_ref, :process, ^alternate, {:zen_monitor, :nodedown}}, 156 | 1000 157 | end 158 | end 159 | end 160 | -------------------------------------------------------------------------------- /test/local/local_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Test do 2 | @moduledoc """ 3 | Tests the ZenMonitor.Local module. 4 | 5 | Since the bulk of monitor/1 and compatibility/1 are delegated to ZenMonitor.Local.Connector, see 6 | the ZenMonitor.Local.Connector.Test module for tests concerning that functionality. 7 | 8 | Most of the other functionality of this module is internal and handled by the 9 | ZenMonitor.BlackBox.Test 10 | """ 11 | use ExUnit.Case 12 | 13 | alias ZenMonitor.Local 14 | alias ZenMonitor.Local.Tables 15 | 16 | setup do 17 | start_supervised(ZenMonitor.Supervisor) 18 | :ok 19 | end 20 | 21 | def pids(count) do 22 | Enum.map(1..count, fn _ -> spawn(fn -> Process.sleep(:infinity) end) end) 23 | end 24 | 25 | describe "Demonitoring a reference" do 26 | test "demonitored references are consumed from the references table" do 27 | [pid] = pids(1) 28 | ref = Local.monitor(pid) 29 | 30 | assert :ets.member(Tables.references(), {self(), ref}) 31 | 32 | assert true = Local.demonitor(ref) 33 | 34 | refute :ets.member(Tables.references(), {self(), ref}) 35 | end 36 | 37 | test "demonitor without flush does not clear already delivered :DOWN message" do 38 | ref = make_ref() 39 | 40 | # Simulate receiving a :DOWN message about the reference 41 | send(self(), {:DOWN, ref, :process, :pid, :reason}) 42 | 43 | assert true = Local.demonitor(ref) 44 | 45 | assert_received {:DOWN, ^ref, _, _, _} 46 | end 47 | 48 | test "demonitor with flush will clear already delivered :DOWN message" do 49 | ref = make_ref() 50 | 51 | # Simulate receiving a :DOWN message about the reference 52 | send(self(), {:DOWN, ref, :process, :pid, :reason}) 53 | 54 | assert true = Local.demonitor(ref, [:flush]) 55 | 56 | refute_received {:DOWN, ^ref, _, _, _} 57 | end 58 | end 59 | 60 | describe "Handles subscriber down" do 61 | test "cleans up references" do 62 | me = self() 63 | 64 | # Spawn a new subscriber process, have it send us some information and sleep 65 | subscriber = 66 | spawn(fn -> 67 | [pid] = pids(1) 68 | ref = Local.monitor(pid) 69 | 70 | send(me, {:monitor, ref}) 71 | 72 | Process.sleep(:infinity) 73 | end) 74 | 75 | assert_receive {:monitor, ref} 76 | 77 | # Assert that the reference was recorded 78 | assert :ets.member(Tables.references(), {subscriber, ref}) 79 | 80 | # Kill the subscriber 81 | Process.exit(subscriber, :kill) 82 | 83 | # Assert that the reference was cleaned up 84 | assert Helper.wait_until(fn -> 85 | not :ets.member(Tables.references(), {subscriber, ref}) 86 | end) 87 | end 88 | end 89 | end 90 | 
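# A minimal usage sketch (hypothetical `Watcher` module, not one of the repository's files),
# assuming only the client-facing behavior the tests above assert on: ZenMonitor.monitor/1
# returns a reference, :DOWN messages arrive with their reason wrapped as {:zen_monitor, reason}
# (:killed, :noproc, :nodedown, ...), and ZenMonitor.demonitor/2 accepts the [:flush] option.
defmodule Watcher do
  use GenServer

  def start_link(target), do: GenServer.start_link(__MODULE__, target)

  @impl true
  def init(target) do
    # Establish the monitor; the returned reference appears in the eventual :DOWN message
    ref = ZenMonitor.monitor(target)
    {:ok, %{target: target, ref: ref}}
  end

  @impl true
  def handle_info({:DOWN, ref, :process, pid, {:zen_monitor, reason}}, %{ref: ref} = state) do
    # The wrapped reason distinguishes ZenMonitor deliveries from plain Process.monitor :DOWNs
    IO.puts("#{inspect(pid)} went down: #{inspect(reason)}")
    {:stop, :normal, state}
  end

  @impl true
  def handle_call(:stop_watching, _from, %{ref: ref} = state) do
    # Demonitor with :flush so a :DOWN that already arrived is removed from the mailbox
    ZenMonitor.demonitor(ref, [:flush])
    {:reply, :ok, state}
  end
end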
-------------------------------------------------------------------------------- /test/proxy/batcher_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Batcher.Test do 2 | @moduledoc """ 3 | Tests for the ZenMonitor.Proxy.Batcher module 4 | 5 | ZenMonitor is a distributed system, in this suite the ZenMonitor.Proxy.Batcher that we will be 6 | exercising is running on the local node. Since the ZenMonitor.Proxy system works off of 7 | subscriber pids we will make the test process the subscriber and forgo the need for ChildNodes. 8 | """ 9 | use ExUnit.Case 10 | 11 | alias ZenMonitor.Proxy.{Batcher, Tables} 12 | 13 | # Batchers stop when their subscriber goes DOWN, this tag tells ExUnit to suppress stops reports 14 | @moduletag :capture_log 15 | 16 | setup do 17 | start_supervised(ZenMonitor.Supervisor) 18 | :ok 19 | end 20 | 21 | def disable_sweep(_) do 22 | # Set sweep interval to 1 minute (effectively disable for this describe block) 23 | original_sweep_interval = Batcher.sweep_interval() 24 | Batcher.sweep_interval(60_000) 25 | 26 | on_exit(fn -> 27 | Batcher.sweep_interval(original_sweep_interval) 28 | end) 29 | 30 | :ok 31 | end 32 | 33 | def reduce_chunk_size(_) do 34 | # Set chunk size to 2 for testing convenience 35 | original_chunk_size = Batcher.chunk_size() 36 | Batcher.chunk_size(2) 37 | 38 | on_exit(fn -> 39 | Batcher.chunk_size(original_chunk_size) 40 | end) 41 | 42 | :ok 43 | end 44 | 45 | describe "Getting a Batcher" do 46 | test "batcher for pid" do 47 | batcher = Batcher.get(self()) 48 | assert Process.alive?(batcher) 49 | end 50 | 51 | test "multiple gets for the same pid should return the same batcher" do 52 | batcher_a = Batcher.get(self()) 53 | batcher_b = Batcher.get(self()) 54 | 55 | assert batcher_a == batcher_b 56 | end 57 | 58 | test "batcher is replaced if it dies" do 59 | original = Batcher.get(self()) 60 | assert Process.alive?(original) 61 | 62 | Process.exit(original, :kill) 63 | refute Process.alive?(original) 64 | 65 | replacement = Batcher.get(self()) 66 | 67 | # Give a chance for the GenRegistry to react to the above 68 | # death. 
69 | replacement = if replacement == original do 70 | Process.sleep(50) 71 | Batcher.get(self()) 72 | else 73 | replacement 74 | end 75 | 76 | assert Process.alive?(replacement) 77 | 78 | assert original != replacement 79 | end 80 | 81 | test "each pid gets its own batcher" do 82 | batcher_a = Batcher.get(self()) 83 | batcher_b = Batcher.get(:other) 84 | 85 | assert batcher_a != batcher_b 86 | end 87 | end 88 | 89 | describe "Enqueuing Certificates" do 90 | setup [:disable_sweep] 91 | 92 | test "certificate gets added to the current batch" do 93 | batcher = Batcher.get(self()) 94 | 95 | initial_state = :sys.get_state(batcher) 96 | assert initial_state.length == 0 97 | assert :queue.len(initial_state.batch) == 0 98 | 99 | Batcher.enqueue(batcher, :test_pid, :test_reason) 100 | 101 | updated_state = :sys.get_state(batcher) 102 | assert updated_state.length == 1 103 | assert :queue.len(updated_state.batch) == 1 104 | assert {:value, {:test_pid, :test_reason}} = :queue.peek(updated_state.batch) 105 | end 106 | end 107 | 108 | describe "Periodic Sweeps" do 109 | setup [:disable_sweep, :reduce_chunk_size] 110 | 111 | test "sweep should do nothing if the batch is empty" do 112 | batcher = Batcher.get(self()) 113 | 114 | # Force a sweep 115 | send(batcher, :sweep) 116 | 117 | refute_receive _ 118 | end 119 | 120 | test "sweep should deliver a summary to the subscriber" do 121 | batcher = Batcher.get(self()) 122 | Batcher.enqueue(batcher, :test_pid, :test_reason) 123 | 124 | # Force a sweep 125 | send(batcher, :sweep) 126 | 127 | # Assert that we received the expected summary 128 | assert_receive {:dead, _, [{:test_pid, :test_reason}]} 129 | end 130 | 131 | test "sweep should deliver the summary in the same order it received them" do 132 | batcher = Batcher.get(self()) 133 | 134 | # Enqueue a full chunk of unique certificates 135 | Batcher.enqueue(batcher, :test_pid_1, :test_reason_1) 136 | Batcher.enqueue(batcher, :test_pid_2, :test_reason_2) 137 | 138 | # Force a sweep 139 | send(batcher, :sweep) 140 | 141 | # Assert that we received the expected summary in the right order (1, 2) 142 | assert_receive {:dead, _, [{:test_pid_1, :test_reason_1}, {:test_pid_2, :test_reason_2}]} 143 | end 144 | 145 | test "sweep will only deliver the requested chunk size" do 146 | batcher = Batcher.get(self()) 147 | 148 | # Enqueue two chunks worth of certificates 149 | Batcher.enqueue(batcher, :test_pid_1, :test_reason_1) 150 | Batcher.enqueue(batcher, :test_pid_2, :test_reason_2) 151 | Batcher.enqueue(batcher, :test_pid_3, :test_reason_3) 152 | Batcher.enqueue(batcher, :test_pid_4, :test_reason_4) 153 | 154 | # Force a sweep 155 | send(batcher, :sweep) 156 | 157 | # Assert that we received the first chunk in order (1, 2) 158 | assert_receive {:dead, _, [{:test_pid_1, :test_reason_1}, {:test_pid_2, :test_reason_2}]} 159 | 160 | # Force an additional sweep 161 | send(batcher, :sweep) 162 | 163 | # Assert that we received the second chunk in order (3, 4) 164 | assert_receive {:dead, _, [{:test_pid_3, :test_reason_3}, {:test_pid_4, :test_reason_4}]} 165 | end 166 | end 167 | 168 | describe "Handling Subscriber Down" do 169 | test "batcher cleans up subscriptions" do 170 | # Spawn a subscriber that we can kill later 171 | subscriber = spawn(fn -> Process.sleep(:infinity) end) 172 | 173 | # Start a batcher for the subscribers 174 | Batcher.get(subscriber) 175 | 176 | # Insert some subscriptions into the subscriber table 177 | :ets.insert(Tables.subscribers(), [ 178 | {{:test_pid_a, subscriber}}, 179 | {{:test_pid_b, 
subscriber}}, 180 | {{:test_pid_a, :other_subscriber}}, 181 | {{:test_pid_c, :other_subscriber}} 182 | ]) 183 | 184 | # Kill the subscriber 185 | Process.exit(subscriber, :kill) 186 | 187 | # Assert that the subscribers table gets cleaned up 188 | assert Helper.wait_until(fn -> 189 | Tables.subscribers() 190 | |> :ets.tab2list() 191 | |> length() == 2 192 | end) 193 | 194 | # Assert that the rows that remain are the expected ones 195 | assert :ets.member(Tables.subscribers(), {:test_pid_a, :other_subscriber}) 196 | assert :ets.member(Tables.subscribers(), {:test_pid_c, :other_subscriber}) 197 | end 198 | end 199 | end 200 | -------------------------------------------------------------------------------- /test/proxy/proxy_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Test do 2 | @moduledoc """ 3 | Tests for the ZenMonitor.Proxy module 4 | """ 5 | use ExUnit.Case 6 | 7 | alias ZenMonitor.Proxy 8 | alias ZenMonitor.Proxy.{Batcher, Tables} 9 | alias ZenMonitor.Test.Support.Subscriber 10 | 11 | # Batchers stop when their subscriber goes DOWN, this tag tells ExUnit to suppress stops reports 12 | @moduletag :capture_log 13 | 14 | setup do 15 | # Speed up the Batcher so its interval is much faster than the default timeout for 16 | # assert_receive / refute_receive 17 | original_sweep_interval = Batcher.sweep_interval() 18 | Batcher.sweep_interval(10) 19 | start_supervised(ZenMonitor.Supervisor) 20 | 21 | on_exit(fn -> 22 | # Restore original setting 23 | Batcher.sweep_interval(original_sweep_interval) 24 | end) 25 | 26 | {:ok, proxy: Process.whereis(ZenMonitor.Proxy)} 27 | end 28 | 29 | def pids(count) do 30 | Enum.map(1..count, fn _ -> spawn(fn -> Process.sleep(:infinity) end) end) 31 | end 32 | 33 | def row_count(table) do 34 | :ets.tab2list(table) |> length() 35 | end 36 | 37 | def monitor_count(pid) do 38 | Process.info(pid, :monitors) |> elem(1) |> length 39 | end 40 | 41 | describe "Ping" do 42 | test "returns :pong" do 43 | assert :pong = Proxy.ping() 44 | end 45 | end 46 | 47 | describe "Process" do 48 | test "no monitors before subscription", ctx do 49 | assert {:monitors, []} = Process.info(ctx.proxy, :monitors) 50 | end 51 | 52 | test "subscriptions add entries to the subscribers table", ctx do 53 | subscriber = self() 54 | targets = pids(3) 55 | instructions = Enum.map(targets, &{:subscribe, &1}) 56 | 57 | # Send the subscribe instructions 58 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 59 | 60 | # Assert that three entries get written 61 | assert Helper.wait_until(fn -> 62 | row_count(Tables.subscribers()) == 3 63 | end) 64 | 65 | # Assert each entry individually 66 | [t1, t2, t3] = targets 67 | assert :ets.member(Tables.subscribers(), {t1, subscriber}) 68 | assert :ets.member(Tables.subscribers(), {t2, subscriber}) 69 | assert :ets.member(Tables.subscribers(), {t3, subscriber}) 70 | end 71 | 72 | test "multiple subscriptions to the same pid do not get additional subscriber rows", ctx do 73 | subscriber = self() 74 | [target] = pids(1) 75 | instructions = [{:subscribe, target}] 76 | 77 | # Create the initial subscription 78 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 79 | 80 | # Assert that the entry gets written to the subscriber table 81 | assert Helper.wait_until(fn -> 82 | row_count(Tables.subscribers()) == 1 83 | end) 84 | 85 | # Assert that it's the row we expect 86 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 87 | 88 | # Perform a duplicate 
subscription 89 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 90 | 91 | # Assert that no new entry gets written 92 | assert Helper.wait_until(fn -> 93 | row_count(Tables.subscribers()) != 2 94 | end) 95 | end 96 | 97 | test "subscriptions result in processes being monitored", ctx do 98 | subscriber = self() 99 | targets = pids(3) 100 | instructions = Enum.map(targets, &{:subscribe, &1}) 101 | 102 | # Create the Subscriptions 103 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 104 | 105 | # Wait for three monitors to show up 106 | assert Helper.wait_until(fn -> 107 | monitor_count(ctx.proxy) == 3 108 | end) 109 | 110 | {:monitors, monitors} = Process.info(ctx.proxy, :monitors) 111 | pids = Keyword.get_values(monitors, :process) |> Enum.sort() 112 | targets = Enum.sort(targets) 113 | 114 | assert pids == targets 115 | end 116 | 117 | test "duplicate subscriptions do not result in multiple monitors", ctx do 118 | subscriber = self() 119 | [target] = pids(1) 120 | instructions = [{:subscribe, target}] 121 | 122 | # Create an initial subscription 123 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 124 | 125 | # Wait for the monitor to be established 126 | assert Helper.wait_until(fn -> 127 | monitor_count(ctx.proxy) == 1 128 | end) 129 | 130 | # Create a duplicate subscription 131 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 132 | 133 | # Assert that no new monitors are created 134 | assert Helper.wait_until(fn -> 135 | monitor_count(ctx.proxy) != 2 136 | end) 137 | end 138 | 139 | test "unsubscribe removes the subscriber", ctx do 140 | subscriber = self() 141 | [target] = pids(1) 142 | subscribe_instructions = [{:subscribe, target}] 143 | unsubscribe_instructions = [{:unsubscribe, target}] 144 | 145 | # Create an initial subscription 146 | GenServer.cast(ctx.proxy, {:process, subscriber, subscribe_instructions}) 147 | 148 | # Wait for the subscriber row to be written 149 | assert Helper.wait_until(fn -> 150 | row_count(Tables.subscribers()) == 1 151 | end) 152 | 153 | # Unsubscribe 154 | GenServer.cast(ctx.proxy, {:process, subscriber, unsubscribe_instructions}) 155 | 156 | # Make sure the subscriber is removed 157 | assert Helper.wait_until(fn -> 158 | row_count(Tables.subscribers()) == 0 159 | end) 160 | end 161 | 162 | test "instruction order is respected (terminal subscribed)", ctx do 163 | subscriber = self() 164 | [target] = pids(1) 165 | instructions = [{:unsubscribe, target}, {:subscribe, target}] 166 | 167 | # Process the instructions 168 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 169 | 170 | # Confirm that the subscriber exists 171 | assert Helper.wait_until(fn -> 172 | row_count(Tables.subscribers()) == 1 173 | end) 174 | 175 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 176 | end 177 | 178 | test "instruction order is respected (terminal unsubscribed)", ctx do 179 | subscriber = self() 180 | [target] = pids(1) 181 | instructions = [{:subscribe, target}, {:unsubscribe, target}] 182 | 183 | # Process the instructions 184 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 185 | 186 | # Confirm that no subscription exists 187 | assert Helper.wait_until(fn -> 188 | row_count(Tables.subscribers()) == 0 189 | end) 190 | end 191 | 192 | test "unsubscribe is isolated to the unsubscriber", ctx do 193 | subscriber = Subscriber.start(self()) 194 | [target] = pids(1) 195 | subscribe_instructions = [{:subscribe, target}] 196 | unsubscribe_instructions = 
[{:unsubscribe, target}] 197 | 198 | # Subscribe both parties to the target 199 | GenServer.cast(ctx.proxy, {:process, self(), subscribe_instructions}) 200 | GenServer.cast(ctx.proxy, {:process, subscriber, subscribe_instructions}) 201 | 202 | # Assert that the subscribers get written to the table 203 | assert Helper.wait_until(fn -> 204 | row_count(Tables.subscribers()) == 2 205 | end) 206 | 207 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 208 | assert :ets.member(Tables.subscribers(), {target, self()}) 209 | 210 | # Unsubscribe on of the parties 211 | GenServer.cast(ctx.proxy, {:process, self(), unsubscribe_instructions}) 212 | 213 | # Assert that the correct row was removed 214 | assert Helper.wait_until(fn -> 215 | row_count(Tables.subscribers()) == 1 216 | end) 217 | 218 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 219 | refute :ets.member(Tables.subscribers(), {target, self()}) 220 | end 221 | 222 | test "unsubscribe is isolated to the target", ctx do 223 | subscriber = self() 224 | [target, other] = pids(2) 225 | 226 | instructions = [ 227 | {:subscribe, target}, 228 | {:unsubscribe, target}, 229 | {:subscribe, other} 230 | ] 231 | 232 | # Process the instructions 233 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 234 | 235 | # Assert that only the other target exists in the table 236 | assert Helper.wait_until(fn -> 237 | row_count(Tables.subscribers()) == 1 238 | end) 239 | 240 | assert :ets.member(Tables.subscribers(), {other, subscriber}) 241 | refute :ets.member(Tables.subscribers(), {target, subscriber}) 242 | end 243 | 244 | test "ERTS monitors persist after unsubscribe", ctx do 245 | subscriber = self() 246 | [target] = pids(1) 247 | subscribe_instructions = [{:subscribe, target}] 248 | unsubscribe_instructions = [{:unsubscribe, target}] 249 | 250 | # Create the initial subscription 251 | GenServer.cast(ctx.proxy, {:process, subscriber, subscribe_instructions}) 252 | 253 | # Assert that the subscriber row is written 254 | assert Helper.wait_until(fn -> 255 | row_count(Tables.subscribers()) == 1 256 | end) 257 | 258 | # Assert that the monitor is established 259 | assert Helper.wait_until(fn -> 260 | monitor_count(ctx.proxy) == 1 261 | end) 262 | 263 | # Unsubscribe 264 | GenServer.cast(ctx.proxy, {:process, subscriber, unsubscribe_instructions}) 265 | 266 | # Assert that the subscriber row was cleaned up 267 | assert Helper.wait_until(fn -> 268 | row_count(Tables.subscribers()) == 0 269 | end) 270 | 271 | # Assert that the monitor persists 272 | assert Helper.wait_until(fn -> 273 | monitor_count(ctx.proxy) == 1 274 | end) 275 | end 276 | end 277 | 278 | describe "DOWN handling" do 279 | test "removes subscribers for the down pid", ctx do 280 | subscriber = Subscriber.start(self()) 281 | [target, other] = pids(2) 282 | instructions = [{:subscribe, target}, {:subscribe, other}] 283 | 284 | # Subscribe both parties to the same target and another process that will be kept alive 285 | GenServer.cast(ctx.proxy, {:process, self(), instructions}) 286 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 287 | 288 | # Assert that the subscribers get written to the table 289 | assert Helper.wait_until(fn -> 290 | row_count(Tables.subscribers()) == 4 291 | end) 292 | 293 | # Kill the target 294 | Process.exit(target, :kill) 295 | 296 | # Assert delivery of messages 297 | assert_receive {:dead, _, [{^target, _}]} 298 | assert_receive {:forward, {:dead, _, [{^target, _}]}} 299 | 300 | # Make sure the subscriber rows 
were cleared out 301 | assert row_count(Tables.subscribers()) == 2 302 | subscribers = :ets.match(Tables.subscribers(), {{other, :"$1"}}) 303 | assert [self()] in subscribers 304 | assert [subscriber] in subscribers 305 | end 306 | 307 | test "truncates reasons", ctx do 308 | subscriber = Subscriber.start(self()) 309 | [target, other] = pids(2) 310 | instructions = [{:subscribe, target}, {:subscribe, other}] 311 | 312 | # Subscribe both parties to the same target and another process that will be kept alive 313 | GenServer.cast(ctx.proxy, {:process, self(), instructions}) 314 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 315 | 316 | # Assert that the subscribers get written to the table 317 | assert Helper.wait_until(fn -> 318 | row_count(Tables.subscribers()) == 4 319 | end) 320 | 321 | reason = {:this, :is, :an, {:especially, {:deeply, {:nested, {:tuple}}}}} 322 | # Kill the target 323 | Process.exit(target, reason) 324 | 325 | # Assert delivery of messages 326 | assert_receive {:dead, _, [{^target, reason}]} 327 | assert_receive {:forward, {:dead, _, [{^target, _}]}} 328 | 329 | assert {:this, :is, :an, {:especially, {:truncated, :truncated}}} == reason 330 | end 331 | 332 | test "truncates state", ctx do 333 | defmodule Crasher do 334 | use GenServer 335 | 336 | def start do 337 | state = for i <- 1..10000, into: %{}, do: {i, i * 2} 338 | GenServer.start(__MODULE__, state) 339 | end 340 | 341 | def init(args) do 342 | {:ok, args} 343 | end 344 | 345 | def crash(pid), do: GenServer.call(pid, :crash) 346 | 347 | def handle_call(nil, _from, state), do: {:reply, :ok, state} 348 | end 349 | 350 | {:ok, crasher} = Crasher.start() 351 | instructions = [{:subscribe, crasher}] 352 | 353 | GenServer.cast(ctx.proxy, {:process, self(), instructions}) 354 | # Assert that the subscribers get written to the table 355 | assert Helper.wait_until(fn -> 356 | row_count(Tables.subscribers()) == 1 357 | end) 358 | 359 | spawn(Crasher, :crash, [crasher]) 360 | 361 | assert_receive {:dead, _, [{^crasher, reason}]}, 500 362 | assert {:function_clause, frames} = reason 363 | 364 | for frame <- frames do 365 | # this generates a big stack, everything should be truncated. 
366 | assert [:truncated] = frame |> Tuple.to_list() |> Enum.uniq() 367 | end 368 | end 369 | end 370 | end 371 | -------------------------------------------------------------------------------- /test/stress_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Stress.Test do 2 | use ExUnit.Case 3 | 4 | alias ZenMonitor.Local.Connector 5 | 6 | @fast_interval 10 7 | @slow_interval 100 8 | 9 | @small_chunk 10 10 | @big_chunk 100_000 11 | 12 | setup do 13 | # Make the Batcher and Dispatcher dispatch at a controlled rate 14 | tune(node(), :batcher, :slow) 15 | tune(node(), :dispatcher, :slow) 16 | 17 | # Make the Connector flush everything very quickly 18 | tune(node(), :connector, :fast) 19 | 20 | start_supervised(ZenMonitor.Supervisor) 21 | {:ok, compatible, nil} = ChildNode.start_link(:zen_monitor, :Compatible) 22 | 23 | on_exit(fn -> 24 | Node.monitor(compatible, true) 25 | 26 | receive do 27 | {:nodedown, ^compatible} -> :ok 28 | end 29 | end) 30 | 31 | # Make the remote batcher flush at a controlled rate 32 | tune(compatible, :batcher, :slow) 33 | 34 | {:ok, down: :down@down, compatible: compatible, remotes: [compatible]} 35 | end 36 | 37 | def tune(remote, :batcher, :fast) do 38 | :rpc.call(remote, Application, :put_env, [ 39 | :zen_monitor, 40 | :batcher_sweep_interval, 41 | @fast_interval 42 | ]) 43 | 44 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :batcher_chunk_size, @big_chunk]) 45 | end 46 | 47 | def tune(remote, :batcher, :slow) do 48 | :rpc.call(remote, Application, :put_env, [ 49 | :zen_monitor, 50 | :batcher_sweep_interval, 51 | @slow_interval 52 | ]) 53 | 54 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :batcher_chunk_size, @small_chunk]) 55 | end 56 | 57 | def tune(remote, :connector, :fast) do 58 | :rpc.call(remote, Application, :put_env, [ 59 | :zen_monitor, 60 | :connector_sweep_interval, 61 | @fast_interval 62 | ]) 63 | 64 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :connector_chunk_size, @big_chunk]) 65 | end 66 | 67 | def tune(remote, :connector, :slow) do 68 | :rpc.call(remote, Application, :put_env, [ 69 | :zen_monitor, 70 | :connector_sweep_interval, 71 | @slow_interval 72 | ]) 73 | 74 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :connector_chunk_size, @small_chunk]) 75 | end 76 | 77 | def tune(remote, :dispatcher, :fast) do 78 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_interval, @fast_interval]) 79 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_amount, @big_chunk]) 80 | end 81 | 82 | def tune(remote, :dispatcher, :slow) do 83 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_interval, @slow_interval]) 84 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_amount, @small_chunk]) 85 | end 86 | 87 | def start_processes(remote, amount) do 88 | Enum.map(1..amount, fn _ -> 89 | Node.spawn(remote, Process, :sleep, [:infinity]) 90 | end) 91 | end 92 | 93 | def stop_processes(targets) do 94 | spawn(fn -> 95 | Enum.each(targets, &Process.exit(&1, :kill)) 96 | end) 97 | end 98 | 99 | def flush_messages() do 100 | send(self(), :flush) 101 | 102 | receive_until_flush([]) 103 | end 104 | 105 | def receive_until_flush(acc) do 106 | receive do 107 | msg -> 108 | if match?(:flush, msg) do 109 | Enum.reverse(acc) 110 | else 111 | receive_until_flush([msg | acc]) 112 | end 113 | after 114 | 0 -> 115 | raise "Flush not found!"
116 | end 117 | end 118 | 119 | describe "Massive remote failure" do 120 | test "local environment configured correctly" do 121 | assert @slow_interval == Application.get_env(:zen_monitor, :batcher_sweep_interval) 122 | assert @small_chunk == Application.get_env(:zen_monitor, :batcher_chunk_size) 123 | 124 | assert @slow_interval == Application.get_env(:zen_monitor, :demand_interval) 125 | assert @small_chunk == Application.get_env(:zen_monitor, :demand_amount) 126 | 127 | assert @fast_interval == Application.get_env(:zen_monitor, :connector_sweep_interval) 128 | assert @big_chunk == Application.get_env(:zen_monitor, :connector_chunk_size) 129 | end 130 | 131 | test "remote environment configured correctly", ctx do 132 | assert @slow_interval == 133 | :rpc.call(ctx.compatible, Application, :get_env, [ 134 | :zen_monitor, 135 | :batcher_sweep_interval 136 | ]) 137 | 138 | assert @small_chunk == 139 | :rpc.call(ctx.compatible, Application, :get_env, [ 140 | :zen_monitor, 141 | :batcher_chunk_size 142 | ]) 143 | end 144 | 145 | test "down messages are throttled", ctx do 146 | # Start a lot of remote processes 147 | remote_pids = start_processes(ctx.compatible, 100_000) 148 | 149 | # Monitor everything 150 | assert :ok = Enum.each(remote_pids, &ZenMonitor.monitor/1) 151 | 152 | # Make sure the connector flushes the monitors over to the remote 153 | connector = Connector.get(ctx.compatible) 154 | 155 | assert Helper.wait_until(fn -> 156 | :sys.get_state(connector).length == 0 157 | end) 158 | 159 | # Assert that the message queue is empty 160 | assert {:message_queue_len, 0} = Process.info(self(), :message_queue_len) 161 | 162 | # Choose some processes to kill 163 | targets = 164 | remote_pids 165 | |> Enum.shuffle() 166 | |> Enum.slice(0, 10_000) 167 | 168 | # Start stopping all the targets 169 | stop_processes(targets) 170 | 171 | # Wait for 10 intervals 172 | Process.sleep(@slow_interval * 10) 173 | 174 | # Get the message queue 175 | messages = flush_messages() 176 | 177 | # Check that we got an appropriate number of messages 178 | flush_length = length(messages) 179 | assert @small_chunk * 5 <= flush_length 180 | assert flush_length <= @small_chunk * 15 181 | 182 | # Check each message is a :DOWN for a stopped process 183 | for message <- messages do 184 | assert {:DOWN, _, :process, received_pid, {:zen_monitor, _}} = message 185 | assert received_pid in targets 186 | end 187 | end 188 | 189 | test "does not crash ZenMonitor", ctx do 190 | # Save the current ZenMonitor pids 191 | connector = Connector.get(ctx.compatible) 192 | local = Process.whereis(ZenMonitor.Local) 193 | proxy = :rpc.call(ctx.compatible, Process, :whereis, [ZenMonitor.Proxy]) 194 | batcher = :rpc.call(ctx.compatible, ZenMonitor.Proxy.Batcher, :get, [connector]) 195 | 196 | # Start a lot of remote processes 197 | remote_pids = start_processes(ctx.compatible, 100_000) 198 | 199 | # Monitor everything 200 | assert :ok = Enum.each(remote_pids, &ZenMonitor.monitor/1) 201 | 202 | # Make sure the connector flushes the monitors over to the remote 203 | assert Helper.wait_until(fn -> 204 | :sys.get_state(connector).length == 0 205 | end) 206 | 207 | # Kill all remote processes 208 | stopper = stop_processes(remote_pids) 209 | 210 | # Wait for the stopper to finish its job 211 | assert Helper.wait_until(fn -> 212 | not Process.alive?(stopper) 213 | end) 214 | 215 | # Make sure that nothing crashed 216 | assert Process.alive?(local) 217 | assert Process.alive?(connector) 218 | assert :rpc.call(ctx.compatible, Process, :alive?,
[proxy]) 219 | assert :rpc.call(ctx.compatible, Process, :alive?, [batcher]) 220 | end 221 | end 222 | end 223 | -------------------------------------------------------------------------------- /test/support/child_node.ex: -------------------------------------------------------------------------------- 1 | defmodule ChildNode do 2 | @moduledoc """ 3 | ChildNode provides facilities for starting another Erlang node on the current machine. 4 | 5 | This module enhances and abstracts the Erlang `:slave` module. After calling `:slave.start_link` to 6 | make sure the child node is running, it ensures that Elixir is started, after which it will run 7 | any function passed in as the `:on_start` param. This function must be compiled and loaded on 8 | both nodes. 9 | 10 | After that, control is handed back to the caller who can use the `:rpc` module to invoke 11 | functions remotely. 12 | 13 | The child node's process is linked to the caller's process, so if the caller dies, so will the 14 | child node. 15 | 16 | If additional logging is required, set the `enable_sasl` option to `true`. 17 | """ 18 | 19 | @type param :: {:enable_sasl, boolean} | {:on_start, (() -> any)} 20 | @type params :: [param] 21 | 22 | defmodule Runner do 23 | @moduledoc """ 24 | When the new node starts up, we often want to set up a supervision tree by calling 25 | a function with `:rpc.call`. However, when the call ends, all the linked processes 26 | in the rpc call will die. This runner encapsulates them and doesn't link to its caller, 27 | so that any processes started by `Runner` will continue to live after the `:rpc` call. 28 | """ 29 | use GenServer 30 | 31 | def start(mod, fun, args) do 32 | GenServer.start(__MODULE__, [mod, fun, args]) 33 | end 34 | 35 | def start(init_fn) when is_function(init_fn) do 36 | GenServer.start(__MODULE__, [init_fn]) 37 | end 38 | 39 | def init([mod, fun, args]) do 40 | rv = apply(mod, fun, args) 41 | {:ok, rv} 42 | end 43 | 44 | def init([init_fn]) do 45 | {:ok, init_fn} 46 | end 47 | 48 | def get(runner_pid) do 49 | GenServer.call(runner_pid, :get) 50 | end 51 | 52 | def do_init(runner_pid, args) do 53 | GenServer.call(runner_pid, {:do_init, args}) 54 | end 55 | 56 | def handle_call({:do_init, args}, _from, init_fn) do 57 | {:reply, init_fn.(args), init_fn} 58 | end 59 | 60 | def handle_call(:get, _from, v) do 61 | {:reply, v, v} 62 | end 63 | end 64 | 65 | @spec start_link(Application.app() | [Application.app()], atom, params, timeout) :: {:ok, node, any} | {:error, any} 66 | def start_link(app_to_start, node_name, params \\ [], timeout \\ 5_000) do 67 | unless Node.alive?() do 68 | {:ok, _} = Node.start(:"local@0.0.0.0") 69 | end 70 | 71 | code_paths = Enum.join(:code.get_path(), " ") 72 | 73 | default_node_start_args = [ 74 | "-setcookie #{Node.get_cookie()}", 75 | "-pa #{code_paths}", 76 | "-connect_all false" 77 | ] 78 | 79 | node_start_args = 80 | if params[:enable_sasl] do 81 | default_node_start_args ++ ["-logger handle_sasl_reports true"] 82 | else 83 | default_node_start_args 84 | end 85 | |> Enum.join(" ") 86 | |> String.to_charlist() 87 | 88 | node_name = to_node_name(node_name) 89 | {:ok, node_name} = :slave.start_link('0.0.0.0', node_name, node_start_args) 90 | {:ok, _} = :rpc.call(node_name, :application, :ensure_all_started, [:elixir]) 91 | 92 | on_start = params[:on_start] 93 | rpc_args = [node_name, app_to_start, on_start, self()] 94 | 95 | case :rpc.call(node_name, __MODULE__, :on_start, rpc_args, timeout) do 96 | {:ok, start_fn_results} -> 97 | {:ok, node_name, start_fn_results} 98 | 99 | {:badrpc, :timeout} -> 100 |
{:error, :timeout} 101 | end 102 | end 103 | 104 | def on_start(node_name, app_to_start, start_callback, _caller) do 105 | case app_to_start do 106 | apps when is_list(apps) -> 107 | for app <- apps do 108 | {:ok, _} = Application.ensure_all_started(app) 109 | end 110 | 111 | app when is_atom(app) -> 112 | {:ok, _started_apps} = Application.ensure_all_started(app) 113 | end 114 | 115 | start_fn_results = 116 | case start_callback do 117 | callback when is_function(callback) -> 118 | {:ok, runner_pid} = Runner.start(callback) 119 | Runner.do_init(runner_pid, node_name) 120 | 121 | {m, f, a} -> 122 | {:ok, runner_pid} = Runner.start(m, f, a) 123 | Runner.get(runner_pid) 124 | 125 | nil -> 126 | nil 127 | end 128 | 129 | {:ok, start_fn_results} 130 | end 131 | 132 | @doc "Runs the MFA in a process on the remote node" 133 | @spec run(node, module(), atom(), [any]) :: any 134 | def run(node, m, f, a) do 135 | {:ok, runner_pid} = :rpc.call(node, Runner, :start, [m, f, a]) 136 | :rpc.call(node, Runner, :get, [runner_pid]) 137 | end 138 | 139 | defp to_node_name(node_name) when is_atom(node_name) do 140 | node_name 141 | |> Atom.to_string() 142 | |> String.split(".") 143 | |> sanitize_node_name 144 | end 145 | 146 | defp sanitize_node_name([node_name]) do 147 | String.to_atom(node_name) 148 | end 149 | 150 | defp sanitize_node_name(node_name) when is_list(node_name) do 151 | node_name 152 | |> List.last() 153 | |> Macro.underscore() 154 | |> String.downcase() 155 | |> String.to_atom() 156 | end 157 | end 158 | -------------------------------------------------------------------------------- /test/support/observable_gen.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Test.Support.ObservableGen do 2 | @moduledoc """ 3 | ObservableGen is a test spy that can observe all calls to call/3 and cast/2 and forward them to 4 | a spy process with an {:observe, :call | :cast, *args} 5 | 6 | It is used in ZenMonitor tests to verify that the proper communication is happening between 7 | various components. 
8 | """ 9 | use Agent 10 | 11 | def start_link(spy) do 12 | Agent.start_link(fn -> spy end, name: __MODULE__) 13 | end 14 | 15 | def call(destination, message, timeout \\ 5000) do 16 | Agent.get(__MODULE__, fn spy -> 17 | send(spy, {:observe, :call, destination, message, timeout}) 18 | end) 19 | 20 | GenServer.call(destination, message, timeout) 21 | end 22 | 23 | def cast(destination, message) do 24 | Agent.get(__MODULE__, fn spy -> 25 | send(spy, {:observe, :cast, destination, message}) 26 | end) 27 | 28 | GenServer.cast(destination, message) 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/support/subscriber.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Test.Support.Subscriber do 2 | def start(spy) do 3 | spawn(__MODULE__, :forward, [spy]) 4 | end 5 | 6 | def forward(spy) do 7 | receive do 8 | message -> send(spy, {:forward, message}) 9 | end 10 | 11 | forward(spy) 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | defmodule Helper do 2 | alias ZenMonitor.{Local, Proxy} 3 | import ExUnit.Assertions 4 | 5 | def await_monitors_established(subscriber \\ nil, refs, target) do 6 | subscriber = subscriber || self() 7 | Enum.each(refs, &await_monitor_established(subscriber, &1, target)) 8 | end 9 | 10 | def await_monitor_established(subscriber \\ nil, ref, target) do 11 | subscriber = subscriber || self() 12 | 13 | assert wait_until(fn -> 14 | local_monitor_established?(subscriber, ref, target) 15 | end), 16 | "Local Monitor #{inspect(ref)}: #{inspect(subscriber)} -> #{inspect(target)} did not get established" 17 | 18 | assert wait_until(fn -> 19 | proxy_monitor_established?(target) 20 | end), 21 | "Proxy Monitor #{inspect(ref)}: #{inspect(subscriber)} -> #{inspect(target)} did not get established" 22 | end 23 | 24 | def await_monitors_cleared(subscriber \\ nil, refs, target) do 25 | subscriber = subscriber || self() 26 | Enum.each(refs, &await_monitor_cleared(subscriber, &1, target)) 27 | end 28 | 29 | def await_monitor_cleared(subscriber \\ nil, ref, target) do 30 | subscriber = subscriber || self() 31 | 32 | assert wait_until(fn -> 33 | !local_monitor_established?(subscriber, ref, target) 34 | end), 35 | "Local Monitor #{inspect(ref)}: #{inspect(subscriber)} -> #{inspect(target)} did not get cleared" 36 | end 37 | 38 | def local_monitor_established?(subscriber \\ nil, ref, target) do 39 | subscriber = subscriber || self() 40 | 41 | monitors = Local.Connector.monitors(target, subscriber) 42 | 43 | ref in monitors 44 | end 45 | 46 | def proxy_monitor_established?(target) do 47 | subscriber = Local.Connector.get(target) 48 | target_node = node(target) 49 | table = Proxy.Tables.subscribers() 50 | 51 | row = 52 | if target_node == Node.self() do 53 | :ets.lookup(table, {target, subscriber}) 54 | else 55 | args = [table, {target, subscriber}] 56 | :rpc.call(target_node, :ets, :lookup, args) 57 | end 58 | 59 | !Enum.empty?(row) 60 | end 61 | 62 | @doc """ 63 | Helper that executes a function until it returns true 64 | 65 | Useful for operations that will eventually complete, instead of sleeping to allow an async 66 | operation to complete, wait_until will call the function in a loop up to the specified number of 67 | attempts with the specified delay between attempts. 
68 | """ 69 | @spec wait_until(fun :: (() -> boolean), attempts :: non_neg_integer, delay :: pos_integer) :: 70 | boolean 71 | def wait_until(fun, attempts \\ 50, delay \\ 100) 72 | 73 | def wait_until(_, 0, _), do: false 74 | 75 | def wait_until(fun, attempts, delay) do 76 | try do 77 | case fun.() do 78 | true -> 79 | true 80 | 81 | _ -> 82 | Process.sleep(delay) 83 | wait_until(fun, attempts - 1, delay) 84 | end 85 | rescue 86 | MatchError -> 87 | Process.sleep(delay) 88 | wait_until(fun, attempts - 1, delay) 89 | end 90 | end 91 | end 92 | 93 | Application.ensure_all_started(:instruments) 94 | 95 | ExUnit.start() 96 | -------------------------------------------------------------------------------- /test/truncator_test.exs: -------------------------------------------------------------------------------- 1 | defmodule TruncatorTest do 2 | use ExUnit.Case 3 | alias ZenMonitor.Truncator 4 | 5 | describe "scalars should pass through" do 6 | test "atoms" do 7 | assert :test_atom == Truncator.truncate(:test_atom) 8 | end 9 | 10 | test "floats" do 11 | assert 1.2 == Truncator.truncate(1.2) 12 | end 13 | 14 | test "integers" do 15 | assert 1 == Truncator.truncate(1) 16 | end 17 | 18 | test "strings" do 19 | assert "hello" == Truncator.truncate("hello") 20 | end 21 | 22 | test "pids" do 23 | pid = self() 24 | assert pid == Truncator.truncate(pid) 25 | end 26 | end 27 | 28 | describe "top level shutdown messages should pass through" do 29 | test "long list that would normally be truncated" do 30 | long_list = [:a, :b, :c, :d, :e, :f, :g] 31 | assert {:shutdown, ^long_list} = Truncator.truncate({:shutdown, long_list}) 32 | end 33 | 34 | test "only at top level, nested shutdown tuples should be truncated" do 35 | long_list = [:a, :b, :c, :d, :e, :f, :g] 36 | assert {:foo, {:shutdown, :truncated}} = Truncator.truncate({:foo, {:shutdown, long_list}}) 37 | end 38 | end 39 | 40 | describe "bistring truncation" do 41 | test "less than limit should pass through" do 42 | input = "test-string" 43 | assert input == Truncator.truncate(input) 44 | end 45 | 46 | test "equal to limit should pass through" do 47 | input = String.duplicate("a", 1024) 48 | assert input == Truncator.truncate(input) 49 | end 50 | 51 | test "greater than limit should be truncated" do 52 | assert <<_::binary-size(1021), "...">> = Truncator.truncate(String.duplicate("a", 2048)) 53 | end 54 | end 55 | 56 | describe "list truncation" do 57 | test "lists of size less than 5 should pass through" do 58 | assert [] == Truncator.truncate([]) 59 | assert [1] == Truncator.truncate([1]) 60 | assert [1, 2] == Truncator.truncate([1, 2]) 61 | assert [1, 2, 3] == Truncator.truncate([1, 2, 3]) 62 | assert [1, 2, 3, 4] == Truncator.truncate([1, 2, 3, 4]) 63 | end 64 | 65 | test "lists of size 5 should be truncated" do 66 | assert :truncated == Truncator.truncate([1, 2, 3, 4, 5]) 67 | end 68 | 69 | test "lists of size greater than 5 should be truncated" do 70 | assert :truncated == Truncator.truncate([1, 2, 3, 4, 5, 6]) 71 | end 72 | end 73 | 74 | describe "tuple truncation" do 75 | test "tuples of size less than 5 should pass through" do 76 | assert {} == Truncator.truncate({}) 77 | assert {1} == Truncator.truncate({1}) 78 | assert {1, 2} == Truncator.truncate({1, 2}) 79 | assert {1, 2, 3} == Truncator.truncate({1, 2, 3}) 80 | assert {1, 2, 3, 4} == Truncator.truncate({1, 2, 3, 4}) 81 | end 82 | 83 | test "tuples of size 5 should be truncated" do 84 | assert :truncated = Truncator.truncate({1, 2, 3, 4, 5}) 85 | end 86 | 87 | test "tuples of size 
greater than 5 should be truncated" do 88 | assert :truncated = Truncator.truncate({1, 2, 3, 4, 5, 6}) 89 | end 90 | end 91 | 92 | describe "map truncation" do 93 | test "maps of size less than 5 should pass through" do 94 | assert %{a: 1} == Truncator.truncate(%{a: 1}) 95 | assert %{a: 1, b: 2} == Truncator.truncate(%{a: 1, b: 2}) 96 | assert %{a: 1, b: 2, c: 3} == Truncator.truncate(%{a: 1, b: 2, c: 3}) 97 | assert %{a: 1, b: 2, c: 3, d: 4} == Truncator.truncate(%{a: 1, b: 2, c: 3, d: 4}) 98 | end 99 | 100 | test "maps of size 5 should be truncated" do 101 | assert :truncated == Truncator.truncate(%{a: 1, b: 2, c: 3, d: 4, e: 5}) 102 | end 103 | 104 | test "maps of size greater than 5 should be truncated" do 105 | assert :truncated == Truncator.truncate(%{a: 1, b: 2, c: 3, d: 4, e: 5, f: 6}) 106 | end 107 | end 108 | 109 | describe "struct truncation" do 110 | defmodule OneFieldStruct do 111 | defstruct a: 1 112 | end 113 | 114 | defmodule TwoFieldStruct do 115 | defstruct a: 1, b: 2 116 | end 117 | 118 | defmodule ThreeFieldStruct do 119 | defstruct a: 1, b: 2, c: 3 120 | end 121 | 122 | defmodule FourFieldStruct do 123 | defstruct a: 1, b: 2, c: 3, d: 4 124 | end 125 | 126 | defmodule FiveFieldStruct do 127 | defstruct a: 1, b: 2, c: 3, d: 4, e: 5 128 | end 129 | 130 | defmodule SixFieldStruct do 131 | defstruct a: 1, b: 2, c: 3, d: 4, e: 5, f: 6 132 | end 133 | 134 | test "structs of size less than 5 should pass through" do 135 | one = %OneFieldStruct{} 136 | two = %TwoFieldStruct{} 137 | three = %ThreeFieldStruct{} 138 | four = %FourFieldStruct{} 139 | 140 | assert one == Truncator.truncate(one) 141 | assert two == Truncator.truncate(two) 142 | assert three == Truncator.truncate(three) 143 | assert four == Truncator.truncate(four) 144 | end 145 | 146 | test "structs of size 5 should be truncated" do 147 | assert :truncated == Truncator.truncate(%FiveFieldStruct{}) 148 | end 149 | 150 | test "structs of size greater than 5 should be truncated" do 151 | assert :truncated == Truncator.truncate(%SixFieldStruct{}) 152 | end 153 | end 154 | 155 | describe "struct robustness" do 156 | test "small unknown struct stays as-is" do 157 | unknown_struct = %{ 158 | :__struct__ => NotARealModule, 159 | a: :b, 160 | c: :d 161 | } 162 | 163 | assert unknown_struct == Truncator.truncate(unknown_struct) 164 | end 165 | 166 | test "large unknown struct should be truncated" do 167 | unknown_struct = %{ 168 | :__struct__ => NotARealModule, 169 | a: :b, 170 | c: :d, 171 | e: :f, 172 | g: :h, 173 | i: :j 174 | } 175 | 176 | assert :truncated == Truncator.truncate(unknown_struct) 177 | end 178 | end 179 | 180 | describe "limited nesting" do 181 | defmodule Nested do 182 | defstruct map: %{}, 183 | list: [], 184 | tuple: {}, 185 | struct: nil 186 | end 187 | 188 | test "it should prevent deeply nested lists" do 189 | nested = [:a, [:b, [:c, [:d, [:e, [:f]]]]]] 190 | 191 | assert :truncated == Truncator.truncate(nested, 0) 192 | assert [:truncated, :truncated] == Truncator.truncate(nested, 1) 193 | assert [:a, [:truncated, :truncated]] == Truncator.truncate(nested, 2) 194 | assert [:a, [:b, [:truncated, :truncated]]] == Truncator.truncate(nested, 3) 195 | assert [:a, [:b, [:c, [:truncated, :truncated]]]] == Truncator.truncate(nested, 4) 196 | end 197 | 198 | test "it should prevent deeply nested maps" do 199 | nested = %{a: %{b: %{c: %{d: %{}}}}} 200 | 201 | assert :truncated == Truncator.truncate(nested, 0) 202 | assert %{a: :truncated} == Truncator.truncate(nested, 1) 203 | assert %{a: %{b: :truncated}} ==
Truncator.truncate(nested, 2) 204 | assert %{a: %{b: %{c: :truncated}}} == Truncator.truncate(nested, 3) 205 | assert %{a: %{b: %{c: %{d: :truncated}}}} == Truncator.truncate(nested, 4) 206 | assert nested == Truncator.truncate(nested, 5) 207 | end 208 | 209 | test "it should prevent deeply nested tuples" do 210 | nested = {:a, {:b, {:c, {:d, {}}}}} 211 | 212 | assert :truncated == Truncator.truncate(nested, 0) 213 | assert {:truncated, :truncated} == Truncator.truncate(nested, 1) 214 | assert {:a, {:truncated, :truncated}} == Truncator.truncate(nested, 2) 215 | assert {:a, {:b, {:truncated, :truncated}}} == Truncator.truncate(nested, 3) 216 | assert {:a, {:b, {:c, {:truncated, :truncated}}}} == Truncator.truncate(nested, 4) 217 | assert {:a, {:b, {:c, {:d, {}}}}} == Truncator.truncate(nested, 5) 218 | end 219 | 220 | test "it should prevent deeply nested structs" do 221 | assert %Nested{map: :truncated} = Truncator.truncate(%Nested{map: %{a: 1}}, 1) 222 | 223 | assert %Nested{map: %{a: :truncated}} = Truncator.truncate(%Nested{map: %{a: %{b: 2}}}, 2) 224 | 225 | assert %Nested{list: :truncated} = Truncator.truncate(%Nested{list: [1, [2, [3]]]}, 1) 226 | 227 | assert %Nested{list: [:truncated, :truncated]} = 228 | Truncator.truncate(%Nested{list: [1, [2, [3]]]}, 2) 229 | 230 | assert %Nested{struct: :truncated} = 231 | Truncator.truncate(%Nested{struct: MapSet.new([1, 2, 3])}, 1) 232 | 233 | assert %Nested{struct: %MapSet{map: :truncated}} = 234 | Truncator.truncate(%Nested{struct: MapSet.new([1, 2, 3])}, 2) 235 | 236 | assert %Nested{tuple: :truncated} = 237 | Truncator.truncate(%Nested{tuple: {:a, {:b, {:c, {}}}}}, 1) 238 | 239 | assert %Nested{tuple: {:truncated, :truncated}} = 240 | Truncator.truncate(%Nested{tuple: {:a, {:b, {:c, {}}}}}, 2) 241 | 242 | assert %Nested{tuple: {:a, {:truncated, :truncated}}} = 243 | Truncator.truncate(%Nested{tuple: {:a, {:b, {:c, {}}}}}, 3) 244 | end 245 | end 246 | end 247 | --------------------------------------------------------------------------------
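Taken together, the truncator tests above pin down the rules `ZenMonitor.Truncator` applies to exit reasons before they are handed to subscribers (the behavior exercised by the proxy's "truncates reasons" and "truncates state" tests earlier): scalars and a top-level `{:shutdown, _}` tuple pass through, oversized binaries are clipped, any list, tuple, map, or struct with five or more entries collapses to `:truncated`, and nesting is cut off at a depth limit. A brief illustrative sketch follows; the expected results are inferred only from the assertions above, not from any additional documentation:

```elixir
alias ZenMonitor.Truncator

long_list = [:a, :b, :c, :d, :e, :f, :g]

# A top-level {:shutdown, _} reason passes through untouched.
Truncator.truncate({:shutdown, long_list})
#=> {:shutdown, [:a, :b, :c, :d, :e, :f, :g]}

# Anywhere else, a collection with five or more entries collapses.
Truncator.truncate({:error, long_list})
#=> {:error, :truncated}

# The optional second argument bounds how deep the truncator descends.
Truncator.truncate(%{a: %{b: %{c: :boom}}}, 2)
#=> %{a: %{b: :truncated}}
```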