├── .formatter.exs ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.exs ├── dev.exs ├── prod.exs └── test.exs ├── lib ├── zen_monitor.ex └── zen_monitor │ ├── application.ex │ ├── local.ex │ ├── local │ ├── connector.ex │ ├── dispatcher.ex │ ├── supervisor.ex │ └── tables.ex │ ├── metrics.ex │ ├── proxy.ex │ ├── proxy │ ├── batcher.ex │ ├── supervisor.ex │ └── tables.ex │ ├── supervisor.ex │ └── truncator.ex ├── mix.exs ├── mix.lock └── test ├── black_box_test.exs ├── local ├── connector_test.exs ├── dispatcher_test.exs └── local_test.exs ├── proxy ├── batcher_test.exs └── proxy_test.exs ├── stress_test.exs ├── support ├── child_node.ex ├── observable_gen.ex └── subscriber.ex ├── test_helper.exs └── truncator_test.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | [ 2 | inputs: [ 3 | "lib/**/*.{ex,exs}", 4 | "test/**/*.{ex,exs}", 5 | "config/**/*.exs", 6 | "mix.exs" 7 | ], 8 | ] 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | name: Build and test 12 | runs-on: ubuntu-20.04 13 | strategy: 14 | matrix: 15 | include: 16 | - elixir-version: 1.7.4 17 | otp-version: 20.3 18 | - elixir-version: 1.7.4 19 | otp-version: 21.3 20 | - elixir-version: 1.11.4 21 | otp-version: 21.3 22 | - elixir-version: 1.11.4 23 | otp-version: 24.3 24 | - elixir-version: 1.12.3 25 | otp-version: 24.3 26 | - elixir-version: 1.13.3 27 | otp-version: 24.3 28 | - elixir-version: 1.13.3 29 | otp-version: 25.0 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Set up Elixir 33 | uses: erlef/setup-beam@v1 34 | with: 35 | elixir-version: ${{ matrix.elixir-version }} 36 | otp-version: ${{ matrix.otp-version }} 37 | - name: Restore dependencies cache 38 | uses: actions/cache@v2 39 | with: 40 | path: deps 41 | key: ${{ runner.os }}-${{ matrix.elixir-version }}-${{ matrix.otp-version}}-mix-${{ hashFiles('**/mix.lock') }} 42 | restore-keys: ${{ runner.os }}-${{ matrix.elixir-version}}-${{ matrix.otp-version }}-mix- 43 | - name: Start EPMD 44 | run: epmd -daemon 45 | - name: Install dependencies 46 | run: mix deps.get 47 | - name: Run tests 48 | run: mix test 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 
20 | *.ez 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Discord 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZenMonitor 2 | 3 | [![CI](https://github.com/discord/zen_monitor/workflows/CI/badge.svg)](https://github.com/discord/zen_monitor/actions) 4 | [![Hex.pm Version](http://img.shields.io/hexpm/v/zen_monitor.svg?style=flat)](https://hex.pm/packages/zen_monitor) 5 | [![Hex.pm License](http://img.shields.io/hexpm/l/zen_monitor.svg?style=flat)](https://hex.pm/packages/zen_monitor) 6 | [![HexDocs](https://img.shields.io/badge/HexDocs-Yes-blue)](https://hexdocs.pm/zen_monitor) 7 | 8 | ZenMonitor allows for the efficient monitoring of remote processes with minimal use of ERTS 9 | Distribution. 10 | 11 | ## Installation 12 | 13 | Add `ZenMonitor` to your dependencies 14 | 15 | ```elixir 16 | def deps do 17 | [ 18 | {:zen_monitor, "~> 2.1.0"} 19 | ] 20 | end 21 | ``` 22 | 23 | ## Using ZenMonitor 24 | 25 | ZenMonitor strives to be a drop-in replacement for `Process.monitor/1`. To those ends, the 26 | programming interface and all the complexities of how it carries out its task are simplified by a 27 | simple unified programming interface. All the functions that the caller needs to use have 28 | convenient delegates available in the top-level `ZenMonitor` module. The interface is detailed 29 | below. 30 | 31 | ### ZenMonitor.monitor/1 32 | 33 | This is a drop-in replacement for `Process.monitor/1` when it comes to processes. It is 34 | compatible with the various ways that `Process.monitor/1` can establish monitors and will accept 35 | one of a `pid`, a `name` which is the `atom` that a local process is registered under, or a tuple 36 | of `{name, node}` for a registered process on a remote node. These are defined as the 37 | `ZenMonitor.destination` type. 38 | 39 | `ZenMonitor.monitor/1` returns a standard reference that can be used to `demonitor` and can be 40 | matched against the reference provided in the `:DOWN` message. 
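A minimal sketch of the full round trip (the `:worker` name and `:"app@remote"` node are
hypothetical placeholders):

```elixir
ref = ZenMonitor.monitor({:worker, :"app@remote"})

receive do
  {:DOWN, ^ref, :process, _pid, {:zen_monitor, reason}} ->
    IO.inspect(reason, label: "monitored process went down")
end
```

Note that the reason arrives wrapped in a `{:zen_monitor, reason}` tuple, as described under
"Handling Down Messages" below.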
41 | 
42 | Similar to `Process.monitor/1`, the caller is allowed to monitor the same process multiple times;
43 | each monitor will be provided with a unique reference, and all monitors will fire `:DOWN` messages
44 | when the monitored process goes down. Even though the caller can establish multiple monitors,
45 | ZenMonitor is designed to handle this efficiently; the only cost is an additional ETS row on the
46 | local node and additional processing time at fan-out.
47 | 
48 | ### ZenMonitor.demonitor/2
49 | 
50 | This is a mostly drop-in replacement for `Process.demonitor/2` when it comes to processes. The
51 | first argument is the reference returned by `ZenMonitor.monitor/1`. It accepts a list of option
52 | atoms, but only honors the `:flush` option at this time. Passing the `:info` option is allowed
53 | but has no effect; this function always returns `true`.
54 | 
55 | ### ZenMonitor.compatibility/1
56 | 
57 | When operating in a mixed environment where some nodes are ZenMonitor compatible and some are not,
58 | it may be necessary to check the compatibility of a remote node. `ZenMonitor.compatibility/1`
59 | accepts any `ZenMonitor.destination` and will report back one of `:compatible` or `:incompatible`
60 | for the remote's cached compatibility status.
61 | 
62 | All remotes start off as `:incompatible` until a positively acknowledged connection is
63 | established. See the `ZenMonitor.connect/1` function for more information on connecting nodes.
64 | 
65 | ### ZenMonitor.compatibility_for_node/1
66 | 
67 | Performs the same operation as `ZenMonitor.compatibility/1` but it accepts a node atom instead of
68 | a `ZenMonitor.destination`.
69 | 
70 | ### ZenMonitor.connect/1
71 | 
72 | Attempts a positive connection with the provided remote node. Connections are established by
73 | using the configured gen module's `call` function to send a `:ping` message to the process registered
74 | under the atom `ZenMonitor.Proxy` on the remote. If this process responds with a `:pong` atom
75 | then the connection is positively established and the node is marked as `:compatible`. Any other
76 | response or error condition (timeout / noproc / etc.) will be considered negative acknowledgement.
77 | 
78 | `ZenMonitor.connect/1` is actually a delegate for `ZenMonitor.Local.Connector.connect/1`; see the
79 | documentation there for more information about how connect behaves.
80 | 
81 | ### Handling Down Messages
82 | 
83 | Any `:DOWN` message receivers (most commonly `GenServer.handle_info/2` callbacks) that match on
84 | the reason should be updated to include an outer `{:zen_monitor, original_match}` wrapper.
85 | 
86 | ```elixir
87 | def handle_info({:DOWN, ref, :process, pid, :specific_reason}, state) do
88 |   ...
89 | end
90 | ```
91 | 
92 | This should be updated to the following.
93 | 
94 | ```elixir
95 | def handle_info({:DOWN, ref, :process, pid, {:zen_monitor, :specific_reason}}, state) do
96 |   ...
97 | end
98 | ```
99 | 
100 | ## Why?
101 | 
102 | `ZenMonitor` was developed at [Discord](https://discordapp.com) to improve the stability of our
103 | real-time communications infrastructure. `ZenMonitor` improves stability in a couple of
104 | different ways.
105 | 
106 | ### Traffic Calming
107 | 
108 | When a process is being monitored by a large number of remote processes, that process going down
109 | can cause both the node hosting the downed process and the node hosting the monitoring processes
110 | to be suddenly flooded with a large amount of work.
This is commonly referred to as a 111 | thundering herd and can overwhelm either node depending on the situation. 112 | 113 | ZenMonitor relies on interval batching and `GenStage` to help calm the deluge into a throttled 114 | stream of `:DOWN` messages that may take more wall clock time to process but has more predictable 115 | scheduler utilization and network consumption. 116 | 117 | ### Message Interspersing 118 | 119 | In the inverse scenario, a single process monitoring a large number of remote processes, a 120 | systemic failure of a large number of monitored processes can result in blocking the message 121 | queue. This can cause other messages being sent to the process to backup behind the `:DOWN` 122 | messages. 123 | 124 | Here's what a message queue might look like if 100,000 monitors fired due to node failure. 125 | 126 | ``` 127 | +------------------------------------------------+ 128 | | {:DOWN, ref, :process, pid_1, :nodedown} | 129 | +------------------------------------------------+ 130 | | {:DOWN, ref, :process, pid_2, :nodedown} | 131 | +------------------------------------------------+ 132 | ... snip 99,996 messages ... 133 | +------------------------------------------------+ 134 | | {:DOWN, ref, :process, pid_99_999, :nodedown} | 135 | +------------------------------------------------+ 136 | | {:DOWN, ref, :process, pid_100_000, :nodedown} | 137 | +------------------------------------------------+ 138 | | :work | 139 | +------------------------------------------------+ 140 | | :work | 141 | +------------------------------------------------+ 142 | | :work | 143 | +------------------------------------------------+ 144 | ... etc ... 145 | ``` 146 | 147 | The process has to process the 100,000 `:DOWN` messages before it can get back to doing work, if 148 | the processing of a `:DOWN` message is non-trivial then this could result in the process 149 | effectively appearing unresponsive to callers expecting it to do `:work`. 150 | 151 | `ZenMonitor.Local.Dispatcher` provides a configurable batch sweeping system that dispatches a 152 | fixed demand_amount of `:DOWN` messages every demand_interval (See the documentation for 153 | `ZenMonitor.Local.Dispatcher` for configuration and defaults). Using `ZenMonitor` the message 154 | queue would look like this. 155 | 156 | ``` 157 | +------------------------------------------------+ 158 | | {:DOWN, ref, :process, pid_1, :nodedown} | 159 | +------------------------------------------------+ 160 | ... snip 4,998 messages ... 161 | +------------------------------------------------+ 162 | | {:DOWN, ref, :process, pid_5000, :nodedown} | 163 | +------------------------------------------------+ 164 | | :work | 165 | +------------------------------------------------+ 166 | ... snip messages during demand_interval ... 167 | +------------------------------------------------+ 168 | | :work | 169 | +------------------------------------------------+ 170 | | {:DOWN, ref, :process, pid_5001, :nodedown} | 171 | +------------------------------------------------+ 172 | ... snip 4,998 messages ... 173 | +------------------------------------------------+ 174 | | {:DOWN, ref, :process, pid_10_000, :nodedown} | 175 | +------------------------------------------------+ 176 | | :work | 177 | +------------------------------------------------+ 178 | ... snip messages during demand_interval ... 179 | +------------------------------------------------+ 180 | | :work | 181 | +------------------------------------------------+ 182 | ... etc ... 
183 | ```
184 | 
185 | This means that the process can continue processing work messages while working through more
186 | manageable batches of `:DOWN` messages; this improves the effective responsiveness of the process.
187 | 
188 | ### Message Truncation
189 | 
190 | `:DOWN` messages include a `reason` field that can include large stack traces and GenServer state
191 | dumps. Large `reason`s generally don't pose an issue, but in a scenario where thousands of
192 | processes are monitoring a process that generates a large `reason`, the cumulative effect of
193 | duplicating the large `reason` to each monitoring process can consume all available memory on a
194 | node.
195 | 
196 | When a `:DOWN` message is received for dispatch to remote subscribers, the first step is to
197 | truncate the message using `ZenMonitor.Truncator`; see the module documentation for more
198 | information about how truncation is performed and what configuration options are supported.
199 | 
200 | This prevents the scenario where a single process with a large stack trace or large state gets
201 | amplified on the receiving node and consumes a large amount of memory.
202 | 
203 | ## Design
204 | 
205 | ZenMonitor is constructed of two cooperating systems, the _Local ZenMonitor System_ and the
206 | _Proxy ZenMonitor System_. When a process wishes to monitor a remote process, it should inform
207 | the _Local ZenMonitor System_, which will efficiently dispatch the monitoring request to the remote
208 | node's _Proxy ZenMonitor System_.
209 | 
210 | ### Local ZenMonitor System
211 | 
212 | The _Local ZenMonitor System_ is composed of a few processes; these are managed by the
213 | `ZenMonitor.Local.Supervisor`. The processes that comprise the _Local ZenMonitor System_ are
214 | described in detail in the following section.
215 | 
216 | #### ZenMonitor.Local
217 | 
218 | ZenMonitor.Local is responsible for accepting monitoring and demonitoring requests from local
219 | processes. It will send these requests to the Connector processes for efficient transmission
220 | to the responsible ZenMonitor.Proxy processes.
221 | 
222 | When a monitored process dies, the ZenMonitor.Proxy will send this information in a summary
223 | message to the ZenMonitor.Local.Connector process, which will use it to send down_dispatches to
224 | ZenMonitor.Local for eventual delivery by the ZenMonitor.Local.Dispatcher.
225 | 
226 | ZenMonitor.Local is also responsible for monitoring the local interested process and performing
227 | clean-up if the local interested process crashes for any reason; this prevents the Local
228 | ZenMonitor System from leaking memory.
229 | 
230 | #### ZenMonitor.Local.Tables
231 | 
232 | This is a simple process that is responsible for owning shared ETS tables used by various parts of
233 | the Local ZenMonitor System.
234 | 
235 | It maintains two tables, `ZenMonitor.Local.Tables.Nodes` and `ZenMonitor.Local.Tables.References`.
236 | These tables are public and are normally written to and read from by the ZenMonitor.Local and
237 | ZenMonitor.Local.Connector processes.
238 | 
239 | #### ZenMonitor.Local.Connector
240 | 
241 | ZenMonitor.Local.Connector is responsible for batching monitoring requests into summary requests
242 | for the remote ZenMonitor.Proxy. The Connector handles the actual distribution connection to the
243 | remote ZenMonitor.Proxy, including dealing with incompatible and down nodes.
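The connection handling described here is what backs the top-level `ZenMonitor.connect/1` and
`ZenMonitor.compatibility/1` delegates. A minimal sketch of using them together (the node and
process names are hypothetical placeholders):

```elixir
case ZenMonitor.connect(:"app@remote") do
  :compatible ->
    # The remote is running a compatible ZenMonitor.Proxy; monitors will be delivered
    ZenMonitor.monitor({:worker, :"app@remote"})

  :incompatible ->
    # The remote is unreachable or not running ZenMonitor
    :ignore
end
```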
244 | 
245 | When processes go down on the remote node, the Proxy ZenMonitor System will report summaries of
246 | these down processes to the corresponding ZenMonitor.Local.Connector.
247 | 
248 | There will be one ZenMonitor.Local.Connector per remote node with monitored processes.
249 | 
250 | #### ZenMonitor.Local.Dispatcher
251 | 
252 | When a remote node or remote processes fail, messages will be enqueued for delivery. The
253 | ZenMonitor.Local.Dispatcher is responsible for processing these enqueued messages at a steady and
254 | controlled rate.
255 | 
256 | ### Proxy ZenMonitor System
257 | 
258 | The _Proxy ZenMonitor System_ is composed of a few processes; these are managed by the
259 | `ZenMonitor.Proxy.Supervisor`. The processes that comprise the _Proxy ZenMonitor System_ are
260 | described in detail in the following section.
261 | 
262 | #### ZenMonitor.Proxy
263 | 
264 | `ZenMonitor.Proxy` is responsible for handling subscription requests from the
265 | _Local ZenMonitor System_ and for maintaining the ERTS Process Monitors on the processes local to
266 | the remote node.
267 | 
268 | `ZenMonitor.Proxy` is designed to be efficient with local monitors and will guarantee that for any
269 | local process there is, at most, one ERTS monitor, no matter how many remote processes and remote
270 | nodes are interested in monitoring that process.
271 | 
272 | When a local process goes down, `ZenMonitor.Proxy` will enqueue a new death certificate to the
273 | `ZenMonitor.Proxy.Batcher` processes that correspond to the interested remotes.
274 | 
275 | #### ZenMonitor.Proxy.Tables
276 | 
277 | This is a simple process that is responsible for owning shared ETS tables used by various parts of
278 | the _Proxy ZenMonitor System_.
279 | 
280 | It maintains a single table, `ZenMonitor.Proxy.Tables.Subscribers`. This table is used by both
281 | the `ZenMonitor.Proxy` and `ZenMonitor.Proxy.Batcher` processes.
282 | 
283 | 
284 | #### ZenMonitor.Proxy.Batcher
285 | 
286 | This process has two primary responsibilities: collecting and summarizing death certificates, and
287 | monitoring the remote `ZenMonitor.Local.Connector` that it serves.
288 | 
289 | For every remote `ZenMonitor.Local.Connector` that is interested in monitoring processes on this
290 | node, a corresponding `ZenMonitor.Proxy.Batcher` is spawned that will collect and ultimately
291 | deliver death certificates. The `ZenMonitor.Proxy.Batcher` will also monitor the remote
292 | `ZenMonitor.Local.Connector` and clean up after it if it goes down for any reason.
293 | 
294 | ## Running a Compatible Node
295 | 
296 | ZenMonitor ships with an Application, `ZenMonitor.Application`, which will start the overall
297 | supervisor, `ZenMonitor.Supervisor`. This creates a supervision tree as outlined below.
298 | 299 | ``` 300 | ------------------------- 301 | +----| ZenMonitor.Local.Tables | 302 | | ------------------------- 303 | | 304 | | ------------------ 305 | +----| ZenMontior.Local | 306 | ----------------------------- | ------------------ 307 | +----| ZenMonitor.Local.Supervisor |----| 308 | | ----------------------------- | ------------- ---------------------------- 309 | | +----| GenRegistry |--N--| ZenMonitor.Local.Connector | 310 | | | ------------- ---------------------------- 311 | | | 312 | | | ----------------------------- 313 | | +----| ZenMonitor.Local.Dispatcher | 314 | | ----------------------------- 315 | ----------------------- | 316 | | ZenMonitor.Supervisor |----| 317 | ----------------------- | ------------------------- 318 | | +----| ZenMonitor.Proxy.Tables | 319 | | | ------------------------- 320 | | | 321 | | ----------------------------- | ------------------ 322 | +----| ZenMonitor.Proxy.Supervisor |----+----| ZenMonitor.Proxy | 323 | ----------------------------- | ------------------ 324 | | 325 | | ------------- -------------------------- 326 | +----| GenRegistry |--M--| ZenMonitor.Proxy.Batcher | 327 | ------------- -------------------------- 328 | ``` 329 | 330 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :zen_monitor, 4 | gen_module: GenServer, 5 | connector_sweep_interval: 100, 6 | batcher_sweep_interval: 100, 7 | demand_interval: 100, 8 | demand_amount: 1000, 9 | max_binary_size: 1024, 10 | truncation_depth: 3 11 | 12 | import_config "#{Mix.env()}.exs" 13 | -------------------------------------------------------------------------------- /config/dev.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | -------------------------------------------------------------------------------- /config/prod.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | -------------------------------------------------------------------------------- /config/test.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :zen_monitor, 4 | connector_sweep_interval: 10, 5 | batcher_sweep_interval: 10, 6 | demand_interval: 10, 7 | demand_amount: 1000 8 | 9 | config :logger, :console, 10 | format: "$time [$level] $levelpad | $metadata | $message\n", 11 | metadata: [:module, :function, :line] 12 | -------------------------------------------------------------------------------- /lib/zen_monitor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor do 2 | @moduledoc """ 3 | ZenMonitor provides efficient monitoring of remote processes and controlled dissemination of 4 | any resulting `:DOWN` messages. 5 | 6 | This module provides a convenient client interface which aims to be a drop in replacement for 7 | `Process.monitor/1` and `Process.demonitor/2` 8 | 9 | # Known differences between ZenMonitor and Process 10 | 11 | - `ZenMonitor.demonitor/2` has the same signature as Process.demonitor/2 but does not respect 12 | the `:info` option. 13 | 14 | - ZenMonitor aims to be efficient over distribution, one of the main strategies for achieving 15 | this is relying mainly on local monitors and then batching up all changes over a time period 16 | to be sent as a single message. 
This design means that additional latency is added to the 17 | delivery of down messages in pursuit of the goal. Where `Process.monitor/1` on a remote 18 | process will provide a :DOWN message as soon as possible, `ZenMonitor.monitor/1` on a remote 19 | process will actually have a number of batching periods to go through before the message 20 | arrives at the monitoring process, here are all the points that add latency. 21 | 22 | 1. When the monitor is enqueued it has to wait until the next sweep happens in the 23 | `ZenMonitor.Local.Connector` until it will be delivered to the `ZenMonitor.Proxy`. 24 | 1. The monitor arrives at the `ZenMonitor.Proxy`, the process crashes and the ERTS `:DOWN` 25 | message is delivered. This will be translated into a death_certificate and sent to a 26 | `ZenMonitor.Proxy.Batcher` for delivery. It will have to wait until the next sweep 27 | happens for it to be sent back to the `ZenMonitor.Local.Connector` for fan-out. 28 | 1. The dead summary including the death_certificate arrives at the 29 | `ZenMonitor.Local.Connector` and a down_dispatch is created for it and enqueued with the 30 | `ZenMonitor.Local`. 31 | 1. The down_dispatch waits in a queue until the `ZenMonitor.Local.Dispatcher` generates 32 | more demand. 33 | 1. Once demand is generated, `ZenMonitor.Local` will hand off the down_dispatch for actual 34 | delivery by `ZenMonitor.Local.Dispatcher`. 35 | 36 | * Steps 1 and 3 employ a strategy of batch sizing to prevent the message from growing too 37 | large. The batch size is controlled by application configuration and is alterable at boot 38 | and runtime. This means though that Steps 1 and 3 can be delayed by N intervals 39 | where `N = ceil(items_ahead_of_event / chunk_size)` 40 | * Step 4 employs a similar batching strategy, a down_dispatch will wait in queue for up to N 41 | intervals where `N = ceil(items_ahead_of_dispatch / chunk_size)` 42 | 43 | - `ZenMonitor` decorates the reason of the `:DOWN` message. If a remote process goes down 44 | because of `original_reason`, this will get decorated as `{:zen_monitor, original_reason}` 45 | when delivered by ZenMonitor. This allows the receiver to differentiate `:DOWN` messages 46 | originating from `ZenMonitor.monitor/1` and those originating from `Process.monitor/1`. 47 | This is necessary when operating in mixed mode. It is the responsibility of the receiver to 48 | unwrap this reason if it requires the `original_reason` for some additional handling of the 49 | `:DOWN` message. 50 | """ 51 | 52 | @gen_module GenServer 53 | 54 | @typedoc """ 55 | `ZenMonitor.destination` are all the types that can be monitored. 
56 | 57 | - `pid()` either local or remote 58 | - `{name, node}` represents a named process on the given node 59 | - `name :: atom()` is a named process on the local node 60 | """ 61 | @type destination :: pid() | ({name :: atom, node :: node()}) | (name :: atom()) 62 | 63 | ## Delegates 64 | 65 | @doc """ 66 | Delegate to `ZenMonitor.Local.compatibility/1` 67 | """ 68 | defdelegate compatibility(target), to: ZenMonitor.Local 69 | 70 | @doc """ 71 | Delegate to `ZenMonitor.Local.compatibility_for_node/1` 72 | """ 73 | defdelegate compatibility_for_node(remote), to: ZenMonitor.Local 74 | 75 | @doc """ 76 | Delegate to `ZenMonitor.Local.Connector.connect/1` 77 | """ 78 | defdelegate connect(remote), to: ZenMonitor.Local.Connector 79 | 80 | @doc """ 81 | Delegate to `ZenMonitor.Local.demonitor/2` 82 | """ 83 | defdelegate demonitor(ref, options \\ []), to: ZenMonitor.Local 84 | 85 | @doc """ 86 | Delegate to `ZenMonitor.Local.monitor/1` 87 | """ 88 | defdelegate monitor(target), to: ZenMonitor.Local 89 | 90 | ## Client 91 | 92 | @doc """ 93 | Get the module to use for gen calls from the Application Environment 94 | 95 | This module only needs to support `GenServer.call/3` and `GenServer.cast/2` functionality, see 96 | ZenMonitor's `@gen_module` for the default value 97 | 98 | This can be controlled at boot and runtime with the `{:zen_monitor, :gen_module}` setting, see 99 | `ZenMonitor.gen_module/1` for runtime convenience functionality. 100 | """ 101 | @spec gen_module() :: atom 102 | def gen_module do 103 | Application.get_env(:zen_monitor, :gen_module, @gen_module) 104 | end 105 | 106 | @doc """ 107 | Put the module to use for gen calls into the Application Environment 108 | 109 | This is a simple convenience function for overwriting the `{:zen_monitor, :gen_module}` setting 110 | at runtime. 111 | """ 112 | @spec gen_module(value :: atom) :: :ok 113 | def gen_module(value) do 114 | Application.put_env(:zen_monitor, :gen_module, value) 115 | end 116 | 117 | @doc """ 118 | Get the current monotonic time in milliseconds 119 | 120 | This is a helper because `System.monotonic_time(:milliseconds)` is long and error-prone to 121 | type in multiple call sites. 122 | 123 | See `System.monotonic_time/1` for more information. 124 | """ 125 | @spec now() :: integer 126 | def now do 127 | System.monotonic_time(:millisecond) 128 | end 129 | 130 | @doc """ 131 | Find the node for a destination. 132 | """ 133 | @spec find_node(target :: destination) :: node() 134 | def find_node(pid) when is_pid(pid), do: node(pid) 135 | def find_node({_, node}), do: node 136 | def find_node(_), do: Node.self() 137 | end 138 | -------------------------------------------------------------------------------- /lib/zen_monitor/application.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Application do 2 | @moduledoc """ 3 | OTP Application that acts as the entry point for ZenMonitor. 4 | 5 | This Application will start all necessary processes for a node to be a compatible ZenMonitor 6 | node and to communicate with other compatible ZenMonitor nodes. 7 | 8 | See `ZenMonitor.Supervisor` for more information. 
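If ZenMonitor is not run as an OTP application, a minimal sketch of supervising it manually is to
place `ZenMonitor.Supervisor` under your own supervisor, which mirrors what `start/2` below does:

    children = [ZenMonitor.Supervisor]
    Supervisor.start_link(children, strategy: :one_for_one)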
9 | """ 10 | use Application 11 | 12 | alias ZenMonitor.Metrics 13 | 14 | def start(_type, _args) do 15 | children = [ 16 | ZenMonitor.Supervisor 17 | ] 18 | 19 | Metrics.register() 20 | 21 | Supervisor.start_link(children, strategy: :one_for_one) 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/zen_monitor/local.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local do 2 | @moduledoc """ 3 | ZenMonitor.Local 4 | 5 | Most of the actual logic of monitoring and fan-out is handled by `ZenMonitor.Local.Connector`, 6 | see that module for more information. 7 | 8 | `ZenMonitor.Local` is responsible for monitoring the subscribing local processes and cleaning up 9 | monitors if they crash. 10 | """ 11 | use GenStage 12 | use Instruments.CustomFunctions, prefix: "zen_monitor.local" 13 | alias ZenMonitor.Local.{Connector, Tables} 14 | 15 | @typedoc """ 16 | Effective compatibility of a remote node 17 | """ 18 | @type compatibility :: :compatible | :incompatible 19 | 20 | @typedoc """ 21 | Represents a future down dispatch for a given pid to be delivered by 22 | `ZenMonitor.Local.Dispatcher` 23 | """ 24 | @type down_dispatch :: {pid, {:DOWN, reference, :process, pid, {:zen_monitor, any}}} 25 | 26 | @subscribers_table Module.concat(__MODULE__, "Subscribers") 27 | @hibernation_threshold 1_000 28 | 29 | defmodule State do 30 | @moduledoc """ 31 | Maintains the internal state for ZenMonitor.Local 32 | 33 | - `subscribers` is an ETS table that tracks local subscribers to prevent multiple monitors 34 | - `batch` is the queue of messages awaiting delivery to ZenMonitor.Local.Dispatcher 35 | - `length` is the current length of the batch queue (calculating queue length is an O(n) 36 | operation, it is simple to track it as elements are added / removed) 37 | - `queue_emptied` is the number of times the queue has been emptied. 
Once this number 38 | exceeds the hibernation_threshold (see `hibernation_threshold/0`) the process will 39 | hibernate 40 | """ 41 | 42 | @type t :: %__MODULE__{ 43 | subscribers: :ets.tid(), 44 | length: integer, 45 | queue_emptied: integer, 46 | batch: :queue.queue() 47 | } 48 | defstruct [ 49 | :subscribers, 50 | length: 0, 51 | queue_emptied: 0, 52 | batch: :queue.new() 53 | ] 54 | end 55 | 56 | ## Delegates 57 | 58 | defdelegate compatibility_for_node(remote), to: ZenMonitor.Local.Connector, as: :compatibility 59 | 60 | ## Client 61 | 62 | def start_link(_opts \\ []) do 63 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 64 | end 65 | 66 | @doc """ 67 | Begin monitoring the given process 68 | 69 | Has the same semantics as `Process.monitor/1`, DOWN messages will be delivered 70 | at a pace controlled by the :zen_monitor, :demand_interval and 71 | :zen_monitor, :demand_amount environment variables 72 | """ 73 | @spec monitor(target :: ZenMonitor.destination()) :: reference 74 | def monitor(target) do 75 | increment("monitor") 76 | ref = make_ref() 77 | me = self() 78 | 79 | # Write the reference out 80 | :ets.insert(Tables.references(), {{me, ref}, target}) 81 | 82 | # Enqueue the monitor into the Connector for async monitor 83 | Connector.monitor(target, ref, me) 84 | 85 | # Perform reciprocal monitoring (if needed) 86 | unless :ets.member(@subscribers_table, me) do 87 | GenStage.cast(__MODULE__, {:monitor_subscriber, me}) 88 | end 89 | 90 | # Return the reference to the caller 91 | ref 92 | end 93 | 94 | @doc """ 95 | Stop monitoring a process by monitor reference 96 | 97 | Has the same semantics as `Process.demonitor/2` (although you can pass the `:info` option, it 98 | has no effect and is not honored, `:flush` is honored) 99 | To demonitor a process you should pass in the reference returned from 100 | `ZenMonitor.Local.monitor/1` for the given process 101 | """ 102 | @spec demonitor(ref :: reference, options :: [:flush]) :: true 103 | def demonitor(ref, options \\ []) when is_reference(ref) do 104 | increment("demonitor") 105 | me = self() 106 | 107 | # First consume the reference 108 | case :ets.take(Tables.references(), {me, ref}) do 109 | [] -> 110 | # Unknown reference, maybe it's been dispatched, consume any :DOWN messages in the inbox 111 | # if :flush is provided. Dispatch atomically consumes the reference, which is why we only 112 | # need to scan the inbox if we don't find a reference. 113 | if :flush in options do 114 | receive do 115 | {:DOWN, ^ref, _, _, _} -> nil 116 | after 117 | 0 -> 118 | nil 119 | end 120 | end 121 | 122 | :ok 123 | 124 | [{{^me, ^ref}, pid}] -> 125 | # Instruct the Connector to demonitor the monitor 126 | Connector.demonitor(pid, ref) 127 | end 128 | 129 | true 130 | end 131 | 132 | @doc """ 133 | Check the compatiblity of the remote node that owns the provided destination 134 | 135 | This is a simple convenience function that looksup the node for the destination and then calls 136 | `ZenMonitor.Local.compatiblity_for_node/1` 137 | """ 138 | @spec compatibility(target :: ZenMonitor.destination()) :: compatibility 139 | def compatibility(target) do 140 | target 141 | |> ZenMonitor.find_node() 142 | |> compatibility_for_node() 143 | end 144 | 145 | @doc """ 146 | Asynchronously enqueue a list of down dispatches for delivery by the Dispatcher 147 | 148 | If called with the empty list, cast will be suppressed. 
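For illustration, each element of `messages` is a `down_dispatch` of the shape below (the pids
and reference are placeholders produced elsewhere by ZenMonitor):

    {subscriber_pid, {:DOWN, ref, :process, remote_pid, {:zen_monitor, reason}}}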
149 | """ 150 | @spec enqueue(messages :: [down_dispatch]) :: :ok 151 | def enqueue([]), do: :ok 152 | 153 | def enqueue(messages) do 154 | GenStage.cast(__MODULE__, {:enqueue, messages}) 155 | end 156 | 157 | @doc """ 158 | Synchronously checks the length of the ZenMonitor.Local's internal batch 159 | """ 160 | @spec batch_length() :: integer() 161 | def batch_length do 162 | GenStage.call(__MODULE__, :batch_length) 163 | end 164 | 165 | @doc """ 166 | Gets the hibernation threshold from the Application Environment 167 | 168 | Every time the demand empties the queue a counter is incremented. When this counter exceeds the 169 | hibernation threshold the ZenMonitor.Local process will be sent into hibernation. See 170 | ZenMonitor.Local's @hibernation_threshold for the default value 171 | 172 | This can be controlled at boot and runtime with the {:zen_monitor, :hibernation_threshold} 173 | setting, see ZenMonitor.Local.hibernation_threshold/1 for runtime convenience functionality. 174 | """ 175 | @spec hibernation_threshold() :: integer 176 | def hibernation_threshold do 177 | Application.get_env(:zen_monitor, :hibernation_threshold, @hibernation_threshold) 178 | end 179 | 180 | @doc """ 181 | Puts the hibernation threshold into the Application Environment 182 | 183 | This is a simple convenience function for overwriting the 184 | {:zen_monitor, :hibernation_threshold} setting at runtime. 185 | """ 186 | @spec hibernation_threshold(value :: integer) :: :ok 187 | def hibernation_threshold(value) do 188 | Application.put_env(:zen_monitor, :hibernation_threshold, value) 189 | end 190 | 191 | ## Server 192 | 193 | def init(_opts) do 194 | Process.flag(:message_queue_data, :off_heap) 195 | 196 | subscribers = 197 | :ets.new(@subscribers_table, [:protected, :named_table, :set, read_concurrency: true]) 198 | 199 | {:producer, %State{subscribers: subscribers}} 200 | end 201 | 202 | @doc """ 203 | Handles demand from `ZenMonitor.Local.Dispatcher` 204 | 205 | ZenMonitor.Local maintains a queue of pending messages to be sent to local processes, the actual 206 | dispatch of which are throttled by ZenMonitor.Local.Dispatcher. When 207 | ZenMonitor.Local.Dispatcher requests more messages to dispatch, this handler will collect up to 208 | the requested amount from the batch queue to satisfy the demand. 209 | """ 210 | def handle_demand(demand, %State{length: length} = state) do 211 | if length <= demand do 212 | empty_queue(state) 213 | else 214 | chunk_queue(demand, state) 215 | end 216 | end 217 | 218 | # Handle a local subscriber going down 219 | # When a process establishes a remote monitor, ZenMonitor.Local establishes a reciprocal monitor, 220 | # see monitor/1 and handle_cast({:monitor_subscriber, ...}) for more information. 221 | # If the subscriber crashes, all of the ETS records maintained by ZenMonitor.Local and the various 222 | # ZenMonitor.Local.Connectors is no longer needed and will be cleaned up by this handler. 
223 | def handle_info( 224 | {:DOWN, _ref, :process, subscriber, _reason}, 225 | %State{subscribers: subscribers} = state 226 | ) do 227 | for [ref, remote_pid] <- :ets.match(Tables.references(), {{subscriber, :"$1"}, :"$2"}) do 228 | # Remove the reference 229 | :ets.delete(Tables.references(), {subscriber, ref}) 230 | 231 | # Instruct the Connector to demonitor 232 | Connector.demonitor(remote_pid, ref) 233 | end 234 | 235 | # Remove the subscriber from the subscribers table 236 | :ets.delete(subscribers, subscriber) 237 | 238 | {:noreply, [], state} 239 | end 240 | 241 | # Handles recipricol subscriber monitoring 242 | # When a process establishes a remote monitor, ZenMonitor.Local will establish a reciprocal 243 | # monitor on the subscriber. This is done so that appropriate cleanup can happen if the 244 | # subscriber goes down. 245 | # This handler guarantees that a local subscriber will only ever have one active reciprocal 246 | # monitor at a time by tracking the subscribers in an ETS table. 247 | def handle_cast({:monitor_subscriber, subscriber}, %State{subscribers: subscribers} = state) do 248 | if :ets.insert_new(subscribers, {subscriber}) do 249 | Process.monitor(subscriber) 250 | end 251 | 252 | {:noreply, [], state} 253 | end 254 | 255 | # Handles enqueuing messages for eventual dispatch 256 | # ZenMonitor.Local.Connector is responsible for generating down dispatches and enqueuing them with 257 | # ZenMonitor.Local. ZenMonitor.Local takes these messages and places them into the 258 | # batch queue to be delivered to ZenMonitor.Local.Dispatcher as demanded. 259 | def handle_cast({:enqueue, messages}, %State{batch: batch, length: length} = state) do 260 | {batch, new_length} = 261 | messages 262 | |> Enum.reduce({batch, length}, fn item, {acc, len} -> 263 | {:queue.in(item, acc), len + 1} 264 | end) 265 | 266 | increment("enqueue", new_length - length) 267 | 268 | {:noreply, [], %State{state | batch: batch, length: new_length}} 269 | end 270 | 271 | # Handles batch length checks 272 | # Returns the current length of the batch 273 | def handle_call(:batch_length, _from, %State{length: length} = state) do 274 | {:reply, length, [], state} 275 | end 276 | 277 | ## Private 278 | 279 | @spec empty_queue(state :: State.t()) :: 280 | {:noreply, [down_dispatch], State.t()} 281 | | {:noreply, [down_dispatch], State.t(), :hibernate} 282 | defp empty_queue(%State{queue_emptied: queue_emptied, batch: batch} = state) do 283 | new_queue_emptied = queue_emptied + 1 284 | response = :queue.to_list(batch) 285 | 286 | if new_queue_emptied >= hibernation_threshold() do 287 | {:noreply, response, %State{state | batch: :queue.new(), length: 0, queue_emptied: 0}, 288 | :hibernate} 289 | else 290 | {:noreply, response, 291 | %State{state | batch: :queue.new(), length: 0, queue_emptied: new_queue_emptied}} 292 | end 293 | end 294 | 295 | @spec chunk_queue(size :: integer(), state :: State.t()) :: 296 | {:noreply, [down_dispatch], State.t()} 297 | defp chunk_queue(size, %State{batch: batch, length: length} = state) do 298 | {messages, new_batch} = :queue.split(size, batch) 299 | {:noreply, :queue.to_list(messages), %State{state | batch: new_batch, length: length - size}} 300 | end 301 | end 302 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/connector.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Connector do 2 | @moduledoc """ 3 | `ZenMonitor.Local.Connector` performs a 
variety of duties. For every remote node that the local node
4 | is interested in monitoring processes on, there will be a dedicated `ZenMonitor.Local.Connector`.
5 | This collection of Connectors is managed by a `GenRegistry` registered under the
6 | `ZenMonitor.Local.Connector` atom.
7 | 
8 | # Connecting and Monitoring the remote `ZenMonitor.Proxy`
9 | 
10 | Connectors, as their name suggests, connect to the `ZenMonitor.Proxy` on the remote node that they
11 | are responsible for. They do this using standard ERTS Distribution, by invoking the remote
12 | Proxy's ping command. A remote is considered compatible if the ping command returns the `:pong`
13 | atom; otherwise it will be marked incompatible.
14 | 
15 | Connectors manage their remote node's status in the global node status cache, and provide
16 | facilities for efficient querying of remote status; see `compatibility/1` and
17 | `cached_compatibility/1`.
18 | 
19 | # Batching and Updating the remote `ZenMonitor.Proxy`
20 | 
21 | When a local process wishes to monitor a remote process, the Connector will be informed of this
22 | fact with a call to `monitor/3`. The Connector is responsible for maintaining a local record of
23 | this monitor for future fan-out and for efficiently batching up these requests to be delivered
24 | to the remote ZenMonitor.Proxy.
25 | 
26 | # Fan-out of Dead Summaries
27 | 
28 | Periodically, the `ZenMonitor.Proxy` (technically the `ZenMonitor.Proxy.Batcher`) on the remote
29 | node will send a "Dead Summary". This is a message from the remote that informs the Connector
30 | of all the processes the Connector has monitored that have gone down since the last summary.
31 | 
32 | The Connector uses its local records to generate a batch of _down dispatches_. These are
33 | messages that look identical to the messages provided by `Process.monitor/1` when a process goes
34 | down. It is sometimes necessary for the original monitoring process to be able to discern
35 | whether the `:DOWN` message originated from ERTS or from ZenMonitor; to aid this, ZenMonitor
36 | will wrap the original reason in a tuple of `{:zen_monitor, original_reason}`.
37 | 
38 | The fan-out messages are sent to `ZenMonitor.Local` for eventual delivery via
39 | `ZenMonitor.Local.Dispatcher`; see those modules for more information.
40 | 
41 | # Fan-out of nodedown / ZenMonitor.Proxy down
42 | 
43 | The Connector is also responsible for monitoring the remote node and dealing with nodedown (or
44 | the node becoming incompatible, either due to the `ZenMonitor.Proxy` crashing or a code change).
45 | 
46 | If the Connector detects that the remote it is responsible for is down or no longer compatible,
47 | it will fire every established monitor with `{:zen_monitor, :nodedown}`. It uses the same
48 | mechanism as for Dead Summaries; see `ZenMonitor.Local` and `ZenMonitor.Local.Dispatcher` for
49 | more information.
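For illustration, a subscriber whose monitor was established through `monitor/3` ultimately
receives a message of the shape below, where `reason` is the original exit reason or `:nodedown`
when the remote is lost (the reference and pid are placeholders):

    {:DOWN, ref, :process, remote_pid, {:zen_monitor, reason}}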
50 | """ 51 | use GenServer 52 | use Instruments.CustomFunctions, prefix: "zen_monitor.local.connector" 53 | 54 | alias ZenMonitor.Local 55 | alias ZenMonitor.Local.Tables 56 | 57 | @base_penalty 1_000 58 | @maximum_penalty 60_000 59 | @max_attempt :math.ceil(:math.log2(@maximum_penalty)) 60 | @chunk_size 5000 61 | @sweep_interval 100 62 | 63 | @type t :: __MODULE__ 64 | @type compatibility :: :compatible | :incompatible 65 | @type cached_compatibility :: compatibility | :miss | {:expired, integer} | :unavailable 66 | @type death_certificate :: {pid, reason :: any} 67 | @type down_dispatch :: {pid, {:DOWN, reference, :process, pid, {:zen_monitor, any}}} 68 | 69 | defmodule State do 70 | @moduledoc """ 71 | Maintains the internal state for the Connector 72 | 73 | - `monitors` is an ETS table for keeping track of monitors for the purpose of fan-out. 74 | - `remote_node_monitored` is a flag used to track whether or not the remote node has been 75 | monitored 76 | - `remote_proxy_ref` is the monitoring reference of the remote node's ZenMonitor.Proxy 77 | - `remote` is the remote node for which the Connector is responsible. 78 | - `batch` is the queue of instructions pending until the next sweep. 79 | - `length` is the current length of the batch queue (calculating queue length is an O(n) 80 | operation, it is simple to track it as elements are added / removed) 81 | """ 82 | @type t :: %__MODULE__{ 83 | monitors: :ets.tab(), 84 | remote_node_monitored: boolean(), 85 | remote_proxy_ref: reference() | nil, 86 | remote: node(), 87 | length: integer(), 88 | batch: :queue.queue() 89 | } 90 | 91 | defstruct [ 92 | :monitors, 93 | :remote, 94 | :remote_proxy_ref, 95 | remote_node_monitored: false, 96 | length: 0, 97 | batch: :queue.new() 98 | ] 99 | end 100 | 101 | ## Client 102 | 103 | def start_link(remote) do 104 | GenServer.start_link(__MODULE__, remote) 105 | end 106 | 107 | @doc """ 108 | Get a connector from the registry by destination 109 | """ 110 | @spec get(target :: ZenMonitor.destination()) :: pid() 111 | def get(target) do 112 | target 113 | |> ZenMonitor.find_node() 114 | |> get_for_node() 115 | end 116 | 117 | @doc """ 118 | Get a connector from the registry by remote node 119 | """ 120 | @spec get_for_node(remote :: node()) :: pid() 121 | def get_for_node(remote) when is_atom(remote) do 122 | case GenRegistry.lookup(__MODULE__, remote) do 123 | {:ok, connector} -> 124 | connector 125 | 126 | {:error, :not_found} -> 127 | {:ok, connector} = GenRegistry.lookup_or_start(__MODULE__, remote, [remote]) 128 | connector 129 | end 130 | end 131 | 132 | @doc """ 133 | Asynchronously monitors a pid. 134 | """ 135 | @spec monitor(target :: ZenMonitor.destination(), ref :: reference(), subscriber :: pid()) :: 136 | :ok 137 | def monitor(target, ref, subscriber) do 138 | target 139 | |> get() 140 | |> GenServer.cast({:monitor, target, ref, subscriber}) 141 | end 142 | 143 | @doc """ 144 | Retrieves all the monitors established between the target and the subscriber 145 | """ 146 | @spec monitors(target :: ZenMonitor.destination(), subscriber :: pid()) :: [reference()] 147 | def monitors(target, subscriber) do 148 | target 149 | |> get() 150 | |> GenServer.call({:monitors, target, subscriber}) 151 | end 152 | 153 | @doc """ 154 | Asynchronously demonitors a pid. 
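For illustration (assuming `remote_pid` and `ref` were previously passed to `monitor/3`):

    ZenMonitor.Local.Connector.demonitor(remote_pid, ref)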
155 | """ 156 | @spec demonitor(target :: ZenMonitor.destination(), ref :: reference()) :: :ok 157 | def demonitor(target, ref) do 158 | target 159 | |> get() 160 | |> GenServer.cast({:demonitor, target, ref}) 161 | end 162 | 163 | @doc """ 164 | Determine the effective compatibility of a remote node 165 | 166 | This will attempt a fast client-side lookup in the ETS table. Only a positive `:compatible` 167 | record will result in `:compatible`, otherwise the effective compatibility is `:incompatible` 168 | """ 169 | @spec compatibility(remote :: node()) :: compatibility 170 | def compatibility(remote) do 171 | case cached_compatibility(remote) do 172 | :compatible -> 173 | :compatible 174 | 175 | _ -> 176 | :incompatible 177 | end 178 | end 179 | 180 | @doc """ 181 | Check the cached compatibility status for a remote node 182 | 183 | This will only perform a fast client-side lookup in the ETS table. If an authoritative entry is 184 | found it will be returned (either `:compatible`, `:incompatible`, or `:unavailable`). If no 185 | entry is found then `:miss` is returned. If an expired entry is found then 186 | `{:expired, attempts}` is returned. 187 | """ 188 | @spec cached_compatibility(remote :: node()) :: cached_compatibility 189 | def cached_compatibility(remote) do 190 | case :ets.lookup(Tables.nodes(), remote) do 191 | [] -> 192 | :miss 193 | 194 | [{^remote, :compatible}] -> 195 | :compatible 196 | 197 | [{^remote, {:incompatible, enforce_until, attempt}}] -> 198 | if enforce_until < ZenMonitor.now() do 199 | {:expired, attempt} 200 | else 201 | :incompatible 202 | end 203 | 204 | [{^remote, :unavailable}] -> 205 | :unavailable 206 | end 207 | end 208 | 209 | @doc """ 210 | Connect to the provided remote 211 | 212 | This function will not consult the cache before calling into the GenServer, the GenServer will 213 | consult with the cache before attempting to connect, this allows for many callers to connect 214 | with the server guaranteeing that only one attempt will actually perform network work. 215 | 216 | If the compatibility of a remote host is needed instead, callers should use the 217 | `compatibility/1` or `cached_compatibility/1` functions. `compatibility/1` will provide the 218 | effective compatibility, `cached_compatibility/1` is mainly used internally but can provide more 219 | detailed information about the cache status of the remote. Neither of these methods, 220 | `compatibility/1` nor `cached_compatibility/1`, will perform network work or call into the 221 | GenServer. 222 | """ 223 | @spec connect(remote :: node()) :: compatibility 224 | def connect(remote) do 225 | remote 226 | |> get_for_node() 227 | |> GenServer.call(:connect) 228 | end 229 | 230 | @doc """ 231 | Gets the sweep interval from the Application Environment 232 | 233 | The sweep interval is the number of milliseconds to wait between sweeps, see 234 | ZenMonitor.Local.Connector's @sweep_interval for the default value 235 | 236 | This can be controlled at boot and runtime with the {:zen_monitor, :connector_sweep_interval} 237 | setting, see `ZenMonitor.Local.Connector.sweep_interval/1` for runtime convenience 238 | functionality. 
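For example, the interval can be inspected and changed at runtime:

    ZenMonitor.Local.Connector.sweep_interval()
    #=> 100 (with the default configuration)

    ZenMonitor.Local.Connector.sweep_interval(50)
    #=> :ok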
239 | """ 240 | @spec sweep_interval() :: integer 241 | def sweep_interval do 242 | Application.get_env(:zen_monitor, :connector_sweep_interval, @sweep_interval) 243 | end 244 | 245 | @doc """ 246 | Puts the sweep interval into the Application Environment 247 | 248 | This is a simple convenience function for overwriting the 249 | {:zen_monitor, :connector_sweep_interval} setting at runtime. 250 | """ 251 | @spec sweep_interval(value :: integer) :: :ok 252 | def sweep_interval(value) do 253 | Application.put_env(:zen_monitor, :connector_sweep_interval, value) 254 | end 255 | 256 | @doc """ 257 | Gets the chunk size from the Application Environment 258 | 259 | The chunk size is the maximum number of subscriptions that will be sent during each sweep, see 260 | ZenMonitor.Local.Connector's @chunk_size for the default value 261 | 262 | This can be controlled at boot and runtime with the {:zen_monitor, :connector_chunk_size} 263 | setting, see `ZenMonitor.Local.Connector.chunk_size/1` for runtime convenience functionality. 264 | """ 265 | @spec chunk_size() :: integer 266 | def chunk_size do 267 | Application.get_env(:zen_monitor, :connector_chunk_size, @chunk_size) 268 | end 269 | 270 | @doc """ 271 | Puts the chunk size into the Application Environment 272 | 273 | This is a simple convenience function for overwriting the {:zen_monitor, :connector_chunk_size} 274 | setting at runtime. 275 | """ 276 | @spec chunk_size(value :: integer) :: :ok 277 | def chunk_size(value) do 278 | Application.put_env(:zen_monitor, :connector_chunk_size, value) 279 | end 280 | 281 | ## Server 282 | 283 | def init(remote) do 284 | schedule_sweep() 285 | monitors = :ets.new(:monitors, [:private, :ordered_set]) 286 | {:ok, %State{remote: remote, monitors: monitors}} 287 | end 288 | 289 | # Synchronous connect handler 290 | # Attempts to connect to the remote, this handler does check the cache before connecting to avoid 291 | # a thundering herd. 292 | def handle_call(:connect, _from, %State{} = state) do 293 | {result, state} = do_compatibility(state) 294 | {:reply, result, state} 295 | end 296 | 297 | # Returns all the monitors between a target and a subscriber 298 | def handle_call({:monitors, target, subscriber}, _from, %State{} = state) do 299 | size = :ets.info(state.monitors, :size) 300 | 301 | monitors = 302 | if size == 0 do 303 | # Don't bother doing the match on an empty table 304 | [] 305 | else 306 | case :ets.match(state.monitors, {{target, :"$1"}, subscriber}, size) do 307 | :"$end_of_table" -> 308 | # Match failed 309 | [] 310 | 311 | {monitors, _} -> 312 | # Unwrap the references 313 | List.flatten(monitors) 314 | end 315 | end 316 | 317 | {:reply, monitors, state} 318 | end 319 | 320 | # Handles establishing a new monitor 321 | # 1. Records the monitor into the internal ETS table 322 | # 2. If this is the first monitor for the pid, adds it to the queue for subsequent dispatch to 323 | # the ZenMonitor.Proxy during the next sweep. 324 | def handle_cast( 325 | {:monitor, target, ref, subscriber}, 326 | %State{batch: batch, length: length, monitors: monitors} = state 327 | ) do 328 | # Check if we should subscribe to this target (this check has to happen before we insert the 329 | # new monitor otherwise the new monitor will always be found and we will never enqueue 330 | # anything) 331 | should_subscribe? 
= unknown_target?(monitors, target) 332 | 333 | # Always add it to the monitor table 334 | :ets.insert(monitors, {{target, ref}, subscriber}) 335 | 336 | # Enqueue the subscribe instruction if it isn't already monitored 337 | new_state = 338 | if should_subscribe? do 339 | increment("enqueue", 1, tags: ["op:subscribe"]) 340 | %State{state | batch: :queue.in({:subscribe, target}, batch), length: length + 1} 341 | else 342 | state 343 | end 344 | 345 | {:noreply, new_state} 346 | end 347 | 348 | # Handles demonitoring a reference for a given pid 349 | # Cleans up the internal ETS record if it exists 350 | def handle_cast( 351 | {:demonitor, target, ref}, 352 | %State{batch: batch, length: length, monitors: monitors} = state 353 | ) do 354 | # Remove it from the monitors table 355 | :ets.delete(monitors, {target, ref}) 356 | 357 | # If that was the last monitor for the target, we should unsubscribe. Unlike monitor we have 358 | # to perform this check after the delete or else the row we are deleting will always make the 359 | # target known. 360 | should_unsubscribe? = unknown_target?(monitors, target) 361 | 362 | # Enqueue the unsubscribe instruction if the target no longer exists 363 | state = 364 | if should_unsubscribe? do 365 | increment("enqueue", 1, tags: ["op:unsubscribe"]) 366 | %State{state | batch: :queue.in({:unsubscribe, target}, batch), length: length + 1} 367 | else 368 | state 369 | end 370 | 371 | {:noreply, state} 372 | end 373 | 374 | # Handles nodedown for the Connector's remote 375 | # When the remote node goes down, every monitor maintained by the Connector should fire 376 | def handle_info({:nodedown, remote}, %State{remote: remote} = state) do 377 | # Mark this node as unavailable 378 | {:incompatible, state} = do_mark_unavailable(state) 379 | 380 | # Mark the remote node as unmonitored (any monitors that existed were just consumed) 381 | state = %State{state | remote_node_monitored: false} 382 | 383 | # Dispatch down to everyone 384 | {:noreply, do_down(state)} 385 | end 386 | 387 | # Handles when the proxy crashes because of noconnection 388 | # This reason indicates that we have lost connection with the remote node, mark it as unavailable. 389 | def handle_info({:DOWN, ref, :process, _, :noconnection}, %State{remote_proxy_ref: ref} = state) do 390 | # Mark this node as unavailable 391 | {:incompatible, state} = do_mark_unavailable(state) 392 | 393 | # Clear the remote_proxy_ref 394 | state = %State{state | remote_proxy_ref: nil} 395 | 396 | # Dispatch down to everyone 397 | {:noreply, do_down(state)} 398 | end 399 | 400 | # Handles when the proxy crashes for any other reason 401 | # Penalize the remote as incompatible and let the normal remote recovery take care of it. 402 | def handle_info({:DOWN, ref, :process, _, _}, %State{remote_proxy_ref: ref} = state) do 403 | # Mark this node as incompatible 404 | {:incompatible, state} = do_mark_incompatible(state, 1) 405 | 406 | # Clear the remote_proxy_ref 407 | state = %State{state | remote_proxy_ref: nil} 408 | 409 | # Dispatch down to everyone 410 | {:noreply, do_down(state)} 411 | end 412 | 413 | # Handle the dead summary from the remote 414 | # Periodically the remote node will send us a summary of everything that has died that we have 415 | # monitored. 
416 | # Connector will find and consume all the matching monitors and enqueue the appropriate messages 417 | # for each monitor with ZenMonitor.Local 418 | def handle_info( 419 | {:dead, remote, death_certificates}, 420 | %State{remote: remote, monitors: monitors} = state 421 | ) do 422 | death_certificates 423 | |> messages_for_death_certificates(monitors) 424 | |> Local.enqueue() 425 | 426 | {:noreply, state} 427 | end 428 | 429 | # Handle the periodic sweep 430 | # If the remote is compatible this will create a subscription summary up to chunk_size of all the 431 | # pids that need monitoring since the last sweep. This will be sent to the remote for monitoring. 432 | # If the remote is incompatible, all pids since the last sweep will have their monitors fire with 433 | # `{:zen_monitor, :nodedown}` 434 | def handle_info(:sweep, %State{} = state) do 435 | new_state = 436 | case do_compatibility(state) do 437 | {:compatible, state} -> 438 | do_sweep(state) 439 | 440 | {:incompatible, state} -> 441 | do_down(state) 442 | end 443 | 444 | schedule_sweep() 445 | {:noreply, new_state} 446 | end 447 | 448 | def handle_info(_, %State{} = state) do 449 | increment("unhandled_info") 450 | {:noreply, state} 451 | end 452 | 453 | ## Private 454 | 455 | @spec do_compatibility(state :: State.t()) :: {compatibility, State.t()} 456 | defp do_compatibility(%State{remote: remote} = state) do 457 | case cached_compatibility(remote) do 458 | :miss -> 459 | do_connect(state, 1) 460 | 461 | {:expired, attempt} -> 462 | do_connect(state, attempt + 1) 463 | 464 | :unavailable -> 465 | do_connect(state, 1) 466 | 467 | hit -> 468 | {hit, state} 469 | end 470 | end 471 | 472 | @spec do_connect(State.t(), attempt :: integer) :: {compatibility, State.t()} 473 | defp do_connect(%State{remote: remote} = state, attempt) do 474 | try do 475 | with {:known_node, true} <- {:known_node, known_node?(remote)}, 476 | {:ping, :pong} <- 477 | {:ping, ZenMonitor.gen_module().call({ZenMonitor.Proxy, remote}, :ping)} do 478 | do_mark_compatible(state) 479 | else 480 | {:known_node, false} -> 481 | do_mark_unavailable(state) 482 | 483 | {:ping, _} -> 484 | do_mark_incompatible(state, attempt) 485 | end 486 | catch 487 | :exit, {{:nodedown, _node}, _} -> 488 | do_mark_unavailable(state) 489 | 490 | :exit, _ -> 491 | do_mark_incompatible(state, attempt) 492 | end 493 | end 494 | 495 | @spec do_sweep(state :: State.t()) :: State.t() 496 | defp do_sweep(%State{batch: batch, length: length} = state) do 497 | {summary, overflow, new_length} = chunk(batch, length) 498 | increment("sweep", length - new_length) 499 | do_subscribe(state, summary) 500 | %State{state | batch: overflow, length: new_length} 501 | end 502 | 503 | @spec chunk(batch :: :queue.queue(), length :: integer) :: {[pid], :queue.queue(), integer} 504 | defp chunk(batch, length) do 505 | size = chunk_size() 506 | 507 | if length <= size do 508 | {:queue.to_list(batch), :queue.new(), 0} 509 | else 510 | {summary, overflow} = :queue.split(size, batch) 511 | {:queue.to_list(summary), overflow, length - size} 512 | end 513 | end 514 | 515 | @spec do_subscribe(state :: State.t(), summary :: []) :: :ok 516 | defp do_subscribe(%State{}, []), do: :ok 517 | 518 | defp do_subscribe(%State{remote: remote}, summary) do 519 | ZenMonitor.gen_module().cast({ZenMonitor.Proxy, remote}, {:process, self(), summary}) 520 | end 521 | 522 | @spec do_down(state :: State.t()) :: State.t() 523 | defp do_down(%State{monitors: monitors} = state) do 524 | # Generate messages for every monitor 525 | 
messages = 526 | for [{{pid, ref}, subscriber}] <- :ets.match(monitors, :"$1") do 527 | {subscriber, {:DOWN, ref, :process, pid, {:zen_monitor, :nodedown}}} 528 | end 529 | 530 | # Clear the monitors table 531 | :ets.delete_all_objects(monitors) 532 | 533 | # Enqueue the messages with ZenMonitor.Local 534 | Local.enqueue(messages) 535 | 536 | # Return a new empty state 537 | %State{state | batch: :queue.new(), length: 0} 538 | end 539 | 540 | @spec do_mark_compatible(State.t()) :: {:compatible, State.t()} 541 | defp do_mark_compatible(%State{remote: remote} = state) do 542 | state = 543 | state 544 | |> monitor_remote_node() 545 | |> monitor_remote_proxy() 546 | 547 | :ets.insert(Tables.nodes(), {remote, :compatible}) 548 | {:compatible, state} 549 | end 550 | 551 | @spec do_mark_incompatible(State.t(), attempt :: integer) :: {:incompatible, State.t()} 552 | defp do_mark_incompatible(%State{remote: remote} = state, attempt) do 553 | state = monitor_remote_node(state) 554 | 555 | :ets.insert( 556 | Tables.nodes(), 557 | {remote, {:incompatible, ZenMonitor.now() + penalty(attempt), attempt}} 558 | ) 559 | 560 | {:incompatible, state} 561 | end 562 | 563 | @spec do_mark_unavailable(State.t()) :: {:incompatible, State.t()} 564 | defp do_mark_unavailable(%State{remote: remote} = state) do 565 | :ets.insert(Tables.nodes(), {remote, :unavailable}) 566 | {:incompatible, state} 567 | end 568 | 569 | @spec monitor_remote_node(State.t()) :: State.t() 570 | defp monitor_remote_node(%State{remote_node_monitored: true} = state), do: state 571 | 572 | defp monitor_remote_node(%State{remote_node_monitored: false, remote: remote} = state) do 573 | Node.monitor(remote, true) 574 | %State{state | remote_node_monitored: true} 575 | end 576 | 577 | @spec monitor_remote_proxy(State.t()) :: State.t() 578 | defp monitor_remote_proxy(%State{remote_proxy_ref: nil, remote: remote} = state) do 579 | %State{state | remote_proxy_ref: Process.monitor({ZenMonitor.Proxy, remote})} 580 | end 581 | 582 | defp monitor_remote_proxy(%State{} = state), do: state 583 | 584 | @spec messages_for_death_certificates( 585 | death_certificates :: [death_certificate], 586 | monitors :: :ets.tab() 587 | ) :: [down_dispatch] 588 | defp messages_for_death_certificates(death_certificates, monitors) do 589 | do_messages_for_death_certificates(death_certificates, monitors, []) 590 | end 591 | 592 | @spec do_messages_for_death_certificates( 593 | death_certificates :: [death_certificate], 594 | monitors :: :ets.tab(), 595 | acc :: [down_dispatch] 596 | ) :: [down_dispatch] 597 | defp do_messages_for_death_certificates([], _monitors, acc), do: Enum.reverse(acc) 598 | 599 | defp do_messages_for_death_certificates([{pid, reason} | rest], monitors, acc) do 600 | acc = 601 | monitors 602 | |> :ets.match({{pid, :"$1"}, :"$2"}) 603 | |> Enum.reduce(acc, fn [ref, subscriber], acc -> 604 | # Consume the monitor 605 | :ets.delete(monitors, {pid, ref}) 606 | 607 | # Add the new message into the accumulator 608 | [{subscriber, {:DOWN, ref, :process, pid, {:zen_monitor, reason}}} | acc] 609 | end) 610 | 611 | do_messages_for_death_certificates(rest, monitors, acc) 612 | end 613 | 614 | @spec known_node?(remote :: node()) :: boolean() 615 | defp known_node?(remote) do 616 | remote == Node.self() or remote in Node.list() 617 | end 618 | 619 | @spec penalty(attempt :: integer) :: integer 620 | defp penalty(attempt) do 621 | min(@maximum_penalty, @base_penalty * round(:math.pow(2, min(attempt, @max_attempt)))) 622 | end 623 | 624 | @spec 
unknown_target?(monitors :: :ets.tid(), target :: pid) :: boolean 625 | defp unknown_target?(monitors, target) do 626 | # ETS does not make for the most readable code, here's what the following line does. 627 | # Perform a match on the internal monitors table looking for keys that start with 628 | # {target, ...} 629 | # Since we are just interested to see if there are any, but don't care about the content, we 630 | # set the other fields to :_ to ignore them. 631 | # The target is known if there are _any_ results, so we apply a limit to the match of just 1 632 | # result. 633 | # This means that we either get back a tuple of {[[]]], continuation} or :"$end_of_table" 634 | # :"$end_of_table" implies that the match for a single item found nothing, therefore the 635 | # target does not exist and is unknown 636 | :ets.match(monitors, {{target, :_}, :_}, 1) == :"$end_of_table" 637 | end 638 | 639 | @spec schedule_sweep() :: reference 640 | defp schedule_sweep do 641 | Process.send_after(self(), :sweep, sweep_interval()) 642 | end 643 | end 644 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/dispatcher.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Dispatcher do 2 | @moduledoc """ 3 | `ZenMonitor.Local.Dispatcher` is a GenStage Consumer responsible for throttled delivery of down 4 | messages. 5 | 6 | `ZenMonitor.Local` acts as a GenStage Producer, it stores all of the down messages that need to 7 | be dispatched based off of what has been enqueued by the `ZenMonitor.Local.Connector`. 8 | 9 | The Dispatcher will deliver these messages throttled by a maximum rate which is controlled by 10 | the {:zen_monitor, :demand_interval} and {:zen_monitor, :demand_amount} settings. 11 | 12 | To calculate the maximum number of messages processed per second you can use the following 13 | formula: 14 | 15 | maximum_mps = (demand_amount) * (1000 / demand_interval) 16 | 17 | For example, if the demand_amount is 1000, and demand_interval is 100 (milliseconds) the maximum 18 | messages per second are: 19 | 20 | maximum_mps = (1000) * (1000 / 100) 21 | -> (1000) * 10 22 | -> 10_000 23 | 24 | For convenience a `ZenMonitor.Local.Dispatcher.maximum_mps/0` is provided that will perform this 25 | calculation. 26 | """ 27 | use GenStage 28 | use Instruments.CustomFunctions, prefix: "zen_monitor.local.dispatcher" 29 | 30 | alias ZenMonitor.Local.Tables 31 | 32 | @demand_interval 100 33 | @demand_amount 1000 34 | 35 | ## Client 36 | 37 | def start_link(_opts \\ []) do 38 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 39 | end 40 | 41 | @doc """ 42 | Gets the demand interval from the Application Environment 43 | 44 | The demand interval is the number of milliseconds to wait between demanding more events from the 45 | GenStage Producer (`ZenMonitor.Local`) 46 | 47 | This can be controlled at boot and runtime with the {:zen_monitor, :demand_interval} setting, 48 | see `ZenMonitor.Local.Dispatcher.demand_interval/1` for runtime convenience functionality. 
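For illustration only, a sketch of reading and then lowering the interval at runtime (the values shown are arbitrary examples, assuming the default configuration):

    # Read the current demand interval (100 ms by default)
    ZenMonitor.Local.Dispatcher.demand_interval()

    # Lower it so the Dispatcher demands events more frequently
    ZenMonitor.Local.Dispatcher.demand_interval(50)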
49 | """ 50 | @spec demand_interval() :: integer 51 | def demand_interval do 52 | Application.get_env(:zen_monitor, :demand_interval, @demand_interval) 53 | end 54 | 55 | @doc """ 56 | Puts the demand interval into the Application Environment 57 | 58 | This is a simple convenience function for overwrite the {:zen_monitor, :demand_interval} setting 59 | at runtime 60 | """ 61 | @spec demand_interval(value :: integer) :: :ok 62 | def demand_interval(value) do 63 | Application.put_env(:zen_monitor, :demand_interval, value) 64 | end 65 | 66 | @doc """ 67 | Gets the demand amount from the Application Environment 68 | 69 | The demand amount is the number of events tor request from the GenStage Producer 70 | (`ZenMonitor.Local`) every demand interval 71 | 72 | This can be controlled at boot and runtime with the {:zen_monitor, :demand_amount} setting, see 73 | `ZenMonitor.Local.Dispatcher.demand_amount/1` for runtime convenience functionality. 74 | """ 75 | @spec demand_amount() :: integer 76 | def demand_amount do 77 | Application.get_env(:zen_monitor, :demand_amount, @demand_amount) 78 | end 79 | 80 | @doc """ 81 | Puts the demand amount into the Application Environment 82 | 83 | This is a simple convenience function for overwriting the {:zen_monitor, :demand_amount} setting 84 | at runtime. 85 | """ 86 | @spec demand_amount(value :: integer) :: :ok 87 | def demand_amount(value) do 88 | Application.put_env(:zen_monitor, :demand_amount, value) 89 | end 90 | 91 | @doc """ 92 | Calculate the current maximum messages per second 93 | 94 | This is a convenience function to help operators understand the current throughput of the 95 | Dispatcher. 96 | """ 97 | @spec maximum_mps() :: float 98 | def maximum_mps do 99 | demand_amount() * (1000 / demand_interval()) 100 | end 101 | 102 | ## Server 103 | 104 | def init(_opts) do 105 | Process.flag(:message_queue_data, :off_heap) 106 | {:consumer, nil, subscribe_to: [{ZenMonitor.Local, min_demand: 1}]} 107 | end 108 | 109 | @doc """ 110 | Handles the events for dispatch 111 | 112 | Dispatch is a simple two step procedure followed for each message to be dispatched. 113 | 114 | 1. Check if the message is still valid. Messages can become invalid if the monitor was 115 | demonitored after the message was enqueued. 116 | 117 | 2a. If valid: forward the message to the subscriber 118 | 2b. If invalid: skip message 119 | 120 | Event dispatch will calculate an "unfulfilled" demand based off the number of messages skipped 121 | and demand that the producer provide additional events so that MPS is maintained and prevent the 122 | Dispatcher from being starved because of invalid messages. 123 | """ 124 | def handle_events(events, _from, producer) do 125 | delivered = length(events) 126 | increment("events.delivered", delivered) 127 | 128 | messages = 129 | for {subscriber, {:DOWN, ref, :process, _, _} = message} <- events, 130 | still_monitored?(subscriber, ref) do 131 | send(subscriber, message) 132 | end 133 | 134 | # Ensure that filtering does not starve out the Dispatcher 135 | 136 | # Calculate the effective demand by taking the smaller of the current demand_amount and the 137 | # length of events delivered. 
138 | effective_demand = min(delivered, demand_amount()) 139 | processed = length(messages) 140 | increment("events.processed", processed) 141 | 142 | # The unfulfilled demand is the difference between the effective demand and the actual events 143 | unfulfilled = effective_demand - processed 144 | 145 | # Ask the producer to fulfill the unfulfilled demand (if this number is 0 or negative, the 146 | # ask helper will handle that for us and not ask for anything) 147 | ask(producer, unfulfilled) 148 | 149 | {:noreply, [], producer} 150 | end 151 | 152 | @doc """ 153 | Handles the callback for the subscription being established with the producer. 154 | 155 | This is the start of the demand loop, once the producer confirms subscription, the initial call 156 | to schedule_demand/0 happens. 157 | """ 158 | def handle_subscribe(:producer, _, from, _state) do 159 | schedule_demand() 160 | {:manual, from} 161 | end 162 | 163 | @doc """ 164 | Handles the periodic generate_demand message 165 | 166 | Asks the producer for demand_amount of events then schedules the next demand generation. 167 | """ 168 | def handle_info(:generate_demand, producer) do 169 | ask(producer, demand_amount()) 170 | schedule_demand() 171 | 172 | {:noreply, [], producer} 173 | end 174 | 175 | ## Private 176 | 177 | @spec ask(producer :: pid, amount :: integer) :: :ok 178 | defp ask(_producer, amount) when amount <= 0, do: :ok 179 | 180 | defp ask(producer, amount) do 181 | GenStage.ask(producer, amount) 182 | end 183 | 184 | @spec still_monitored?(subscriber :: pid, ref :: reference) :: boolean 185 | defp still_monitored?(subscriber, ref) do 186 | :ets.take(Tables.references(), {subscriber, ref}) != [] 187 | end 188 | 189 | @spec schedule_demand() :: reference 190 | defp schedule_demand do 191 | Process.send_after(self(), :generate_demand, demand_interval()) 192 | end 193 | end 194 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Supervisor do 2 | @moduledoc """ 3 | Supervisor for the `ZenMonitor.Local` components. 4 | 5 | See `ZenMonitor.Local`, `ZenMonitor.Local.Tables`, `ZenMonitor.Local.Connector`, and 6 | `ZenMonitor.Local.Dispatcher` for more information about the supervised processes. 7 | 8 | There are many `ZenMonitor.Local.Connector` processes, which are managed by a `GenRegistry`. 9 | These are keyed by the remote node the Connector is responsible for. 10 | 11 | This supervisor uses the `:rest_for_one` strategy, so the order of the children is important and 12 | should not be altered. 13 | """ 14 | use Supervisor 15 | 16 | def start_link(_opts \\ []) do 17 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 18 | end 19 | 20 | def init(_opts) do 21 | children = [ 22 | ZenMonitor.Local.Tables, 23 | ZenMonitor.Local, 24 | GenRegistry.Spec.child_spec(ZenMonitor.Local.Connector), 25 | ZenMonitor.Local.Dispatcher 26 | ] 27 | 28 | Supervisor.init(children, strategy: :rest_for_one) 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/zen_monitor/local/tables.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Tables do 2 | @moduledoc """ 3 | `ZenMonitor.Local.Tables` owns tables that are shared between multiple `ZenMonitor.Local` 4 | components. 5 | 6 | See `nodes/0` and `references/0` for more information. 
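As a rough sketch of how other components read these shared tables (illustrative only; the real match specs live in the callers):

    # Compatibility cache entry for a remote node, e.g. [{remote, :compatible}]
    :ets.lookup(ZenMonitor.Local.Tables.nodes(), remote)

    # Authoritative monitor entry keyed by {subscriber, reference}
    :ets.lookup(ZenMonitor.Local.Tables.references(), {subscriber, ref})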
7 | """ 8 | use GenServer 9 | 10 | @node_table Module.concat(__MODULE__, "Nodes") 11 | @reference_table Module.concat(__MODULE__, "References") 12 | 13 | ## Client 14 | 15 | def start_link(_opts \\ []) do 16 | GenServer.start_link(__MODULE__, [], name: __MODULE__) 17 | end 18 | 19 | @doc """ 20 | Nodes holds cached information about remote node compatibility 21 | 22 | This information is stored in one of the following structures: 23 | 24 | For compatible nodes 25 | { remote_node, :compatible } 26 | ^---key---^ ^--value--^ 27 | 28 | For incompatible nodes 29 | { remote_node, {:incompatible, enforce_until, attempts} } 30 | ^---key---^ ^---------------value-----------------^ 31 | 32 | `enforce_until` is the time (as reported by System.monotonic_time(:milliseconds)) after which 33 | this cache entry should no longer be enforced. 34 | 35 | `attempts` is the number of consecutive connect attempts that have failed, this value is useful 36 | for calculating geometric backoff values 37 | """ 38 | @spec nodes() :: :ets.tab() 39 | def nodes do 40 | @node_table 41 | end 42 | 43 | @doc """ 44 | References holds the set of authoritative monitor references 45 | 46 | These references are stored in this structure: 47 | 48 | { {subscriber_pid, monitor_reference}, {remote_node, remote_pid} } 49 | ^-------------key-----------------^ ^----------value--------^ 50 | 51 | There is a compound key of {subscriber_pid, monitor_reference} this allows for lookup of a given 52 | reference (if the subscriber is known, by convention it will be the calling process, self()) or 53 | the retrieval of all active monitors for a subscriber. 54 | """ 55 | @spec references() :: :ets.tab() 56 | def references do 57 | @reference_table 58 | end 59 | 60 | ## Server 61 | 62 | def init(_opts) do 63 | @node_table = :ets.new(@node_table, [:public, :named_table, :set, write_concurrency: true]) 64 | 65 | @reference_table = 66 | :ets.new(@reference_table, [:public, :named_table, :ordered_set, write_concurrency: true]) 67 | 68 | {:ok, nil} 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/zen_monitor/metrics.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Metrics do 2 | @moduledoc """ 3 | Metrics helper for monitoring the ZenMonitor system. 4 | """ 5 | alias Instruments.Probe 6 | 7 | @doc """ 8 | Registers various probes for the ZenMonitor System. 9 | 10 | - ERTS message_queue_len for the `ZenMonitor.Local` and `ZenMonitor.Proxy` processes. 
11 | - Internal Batch Queue length for `ZenMonitor.Local` (dispatches to be delivered) 12 | - ETS table size for References (number of monitors) 13 | - ETS table size for Subscribers (number of monitored local processes * interested remotes) 14 | 15 | """ 16 | @spec register() :: :ok 17 | def register do 18 | Probe.define!( 19 | "zen_monitor.local.message_queue_len", 20 | :gauge, 21 | mfa: {__MODULE__, :message_queue_len, [ZenMonitor.Local]} 22 | ) 23 | 24 | Probe.define!( 25 | "zen_monitor.proxy.message_queue_len", 26 | :gauge, 27 | mfa: {__MODULE__, :message_queue_len, [ZenMonitor.Proxy]} 28 | ) 29 | 30 | Probe.define!( 31 | "zen_monitor.local.batch_length", 32 | :gauge, 33 | mfa: {ZenMonitor.Local, :batch_length, []} 34 | ) 35 | 36 | Probe.define!( 37 | "zen_monitor.local.ets.references.size", 38 | :gauge, 39 | mfa: {__MODULE__, :table_size, [ZenMonitor.Local.Tables.references()]} 40 | ) 41 | 42 | Probe.define!( 43 | "zen_monitor.proxy.ets.subscribers.size", 44 | :gauge, 45 | mfa: {__MODULE__, :table_size, [ZenMonitor.Proxy.Tables.subscribers()]} 46 | ) 47 | 48 | :ok 49 | end 50 | 51 | @doc """ 52 | Given a pid or a registered name, this will return the message_queue_len as reported by 53 | `Process.info/2` 54 | """ 55 | @spec message_queue_len(target :: nil | pid() | atom()) :: nil | integer() 56 | def message_queue_len(nil), do: nil 57 | 58 | def message_queue_len(target) when is_pid(target) do 59 | case Process.info(target, :message_queue_len) do 60 | {:message_queue_len, len} -> len 61 | _ -> nil 62 | end 63 | end 64 | 65 | def message_queue_len(target) when is_atom(target) do 66 | target 67 | |> Process.whereis() 68 | |> message_queue_len() 69 | end 70 | 71 | @doc """ 72 | Given a table identifier, returns the size as reported by `:ets.info/2` 73 | """ 74 | @spec table_size(:ets.tid()) :: nil | integer() 75 | def table_size(tid) do 76 | case :ets.info(tid, :size) do 77 | :undefined -> nil 78 | size -> size 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy do 2 | @moduledoc """ 3 | ZenMonitor.Proxy monitors local processes and proxies their down messages to interested 4 | ZenMonitor.Locals on remote nodes for fanout. 5 | """ 6 | use GenServer 7 | 8 | alias ZenMonitor.Truncator 9 | alias ZenMonitor.Proxy.{Batcher, Tables} 10 | 11 | @typedoc """ 12 | Defines the valid operations that can be processed 13 | """ 14 | @type operation :: :subscribe | :unsubscribe 15 | 16 | @typedoc """ 17 | An instruction is a valid operation upon a given destination 18 | """ 19 | @type instruction :: {operation, ZenMonitor.destination()} 20 | 21 | @typedoc """ 22 | A string of instructions with the same operation can be collapsed into a partition for more 23 | efficient processing. 24 | """ 25 | @type partition :: {operation, [ZenMonitor.destination()]} 26 | 27 | defmodule State do 28 | @moduledoc """ 29 | Maintains the internal state for ZenMonitor.Proxy 30 | 31 | `monitors` is an ETS table with all the pids that the Proxy is currently monitoring 32 | """ 33 | @type t :: %__MODULE__{ 34 | monitors: :ets.tid() 35 | } 36 | defstruct [ 37 | :monitors 38 | ] 39 | end 40 | 41 | ## Client 42 | 43 | def start_link(args) do 44 | GenServer.start_link(__MODULE__, args, name: __MODULE__) 45 | end 46 | 47 | @doc """ 48 | Ping is a diagnostic function to check that the proxy is running. 
49 | 50 | It is mainly used by ZenMonitor.Local.Connectors to check if ZenMonitor.Proxy is available 51 | and running on a remote node 52 | """ 53 | @spec ping() :: :pong 54 | def ping() do 55 | GenServer.call(__MODULE__, :ping) 56 | end 57 | 58 | ## Server 59 | 60 | def init(_args) do 61 | Process.flag(:message_queue_data, :off_heap) 62 | {:ok, %State{monitors: :ets.new(:monitors, [:private, :set])}} 63 | end 64 | 65 | def handle_call(:ping, _from, %State{} = state) do 66 | {:reply, :pong, state} 67 | end 68 | 69 | def handle_cast({:subscribe, subscriber, targets}, %State{} = state) do 70 | process_operation(:subscribe, subscriber, targets, state) 71 | {:noreply, state} 72 | end 73 | 74 | def handle_cast({:process, subscriber, instructions}, %State{} = state) do 75 | # Create the most efficient instruction partitions 76 | for {operation, targets} <- partition_instructions(instructions) do 77 | process_operation(operation, subscriber, targets, state) 78 | end 79 | 80 | {:noreply, state} 81 | end 82 | 83 | def handle_info({:DOWN, _, :process, pid, reason}, %State{monitors: monitors} = state) do 84 | # Reasons can include stack traces and other dangerous items, truncate them. 85 | truncated_reason = Truncator.truncate(reason) 86 | 87 | # Enqueue the death certificates with the interested subscriber's batchers 88 | for [subscriber] <- :ets.match(Tables.subscribers(), {{pid, :"$1"}}) do 89 | # Delete the subscription 90 | :ets.delete(Tables.subscribers(), {pid, subscriber}) 91 | 92 | # Enqueue the death certificate with the Batcher 93 | subscriber 94 | |> Batcher.get() 95 | |> Batcher.enqueue(pid, truncated_reason) 96 | end 97 | 98 | # Clear the monitor 99 | :ets.delete(monitors, pid) 100 | 101 | {:noreply, state} 102 | end 103 | 104 | ## Private 105 | 106 | @spec process_operation( 107 | operation, 108 | subscriber :: pid(), 109 | targets :: [ZenMonitor.destination()], 110 | State.t() 111 | ) :: :ok 112 | defp process_operation(:subscribe, subscriber, targets, %State{monitors: monitors}) do 113 | # Record that the subscriber is interested in the targets 114 | :ets.insert(Tables.subscribers(), Enum.map(targets, &{{&1, subscriber}})) 115 | 116 | # Record and monitor each of the pids, filtering out already monitored pids 117 | for target <- targets, 118 | :ets.insert_new(monitors, {target}) do 119 | Process.monitor(target) 120 | end 121 | 122 | :ok 123 | end 124 | 125 | defp process_operation(:unsubscribe, subscriber, targets, _state) do 126 | # Remove the subscriptions from the subscribers table 127 | for target <- targets do 128 | :ets.delete(Tables.subscribers(), {target, subscriber}) 129 | end 130 | 131 | :ok 132 | end 133 | 134 | @spec partition_instructions([instruction]) :: [partition] 135 | defp partition_instructions(instructions) do 136 | do_partition_instructions(instructions, []) 137 | end 138 | 139 | @spec do_partition_instructions([instruction], [partition]) :: [partition] 140 | defp do_partition_instructions([], acc) do 141 | # There are no more instructions to process, the accumulator now has all the partitions, but 142 | # in reverse order, reverse and return it 143 | Enum.reverse(acc) 144 | end 145 | 146 | defp do_partition_instructions([{op, target} | rest], acc) do 147 | # Inspect the first instruction in the instruction list, collect all the targets with that 148 | # operation into a new partition. 
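# As a hypothetical example, the instruction list
#   [{:subscribe, pid1}, {:subscribe, pid2}, {:unsubscribe, pid3}]
# collects here into the partition targets [pid2, pid1] (reverse order is fine, instructions
# with the same operation are commutative) with [{:unsubscribe, pid3}] returned as the remainder.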
149 | {partition, remaining} = do_collect_targets(op, rest, [target]) 150 | 151 | # Recursively process any remaining instructions after prepending in the new partition into 152 | # the accumulator 153 | do_partition_instructions(remaining, [{op, partition} | acc]) 154 | end 155 | 156 | @spec do_collect_targets(operation, [instruction], [ZenMonitor.destination()]) :: 157 | {[ZenMonitor.destination()], [instruction]} 158 | defp do_collect_targets(_op, [], acc) do 159 | # There are no more instructions to process, return the accumulator. Note that since 160 | # instructions of the same operation are commutative there is no need to reverse the 161 | # accumulator even though the targets are in reverse order 162 | {acc, []} 163 | end 164 | 165 | defp do_collect_targets(op, [{op, target} | rest], acc) do 166 | # The next instruction matches the current operation, prepend the target into the accumulator 167 | # and recursively process the rest of the instructions 168 | do_collect_targets(op, rest, [target | acc]) 169 | end 170 | 171 | defp do_collect_targets(_op, [{_other, _} | _rest] = remainder, acc) do 172 | # The next instruction does not match the current operations. Similar to when there are no 173 | # more instructions to process, the accumulator is returned as-is. The remaining instructions 174 | # (including the current instruction that didn't match) are returned for further processing. 175 | {acc, remainder} 176 | end 177 | end 178 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy/batcher.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Batcher do 2 | 3 | @moduledoc """ 4 | `ZenMonitor.Proxy.Batcher` is responsible for collecting death_certificates from 5 | `ZenMonitor.Proxy` destined for the Batcher's subscriber (normally the subscriber is a 6 | `ZenMonitor.Local.Connector`) 7 | 8 | Periodically it will sweep and send all of the death_certificates it has collected since the 9 | last sweep to the subscriber for processing. 10 | """ 11 | use GenServer 12 | use Instruments.CustomFunctions, prefix: "zen_monitor.proxy.batcher" 13 | 14 | alias ZenMonitor.Proxy.Tables 15 | 16 | @chunk_size 5000 17 | @lookup_timeout 30_000 18 | @sweep_interval 100 19 | 20 | defmodule State do 21 | @moduledoc """ 22 | Maintains the internal state for the Batcher 23 | 24 | - `subscriber` is the process that death_certificates should be delivered to 25 | - `batch` is the queue of death_certificates pending until the next sweep. 
26 | - `length` is the current length of the batch queue (calculating queue length is an O(n) 27 | operation, is is simple to track it as elements are added / removed) 28 | """ 29 | 30 | @type t :: %__MODULE__{ 31 | subscriber: pid, 32 | batch: :queue.queue(), 33 | length: integer 34 | } 35 | defstruct [ 36 | :subscriber, 37 | batch: :queue.new(), 38 | length: 0 39 | ] 40 | end 41 | 42 | ## Client 43 | 44 | def start_link(subscriber) do 45 | GenServer.start_link(__MODULE__, subscriber) 46 | end 47 | 48 | @doc """ 49 | Get a batcher for a given subscriber 50 | """ 51 | @spec get(subscriber :: pid) :: pid 52 | def get(subscriber) do 53 | case GenRegistry.lookup(__MODULE__, subscriber) do 54 | {:ok, batcher} -> 55 | batcher 56 | 57 | {:error, :not_found} -> 58 | {:ok, batcher} = GenRegistry.lookup_or_start(__MODULE__, subscriber, [subscriber], lookup_timeout()) 59 | batcher 60 | end 61 | end 62 | 63 | @doc """ 64 | Enqueues a new death certificate into the batcher 65 | """ 66 | @spec enqueue(batcher :: pid, pid, reason :: any) :: :ok 67 | def enqueue(batcher, pid, reason) do 68 | GenServer.cast(batcher, {:enqueue, pid, reason}) 69 | end 70 | 71 | @doc """ 72 | Gets the sweep interval from the Application Environment 73 | 74 | The sweep interval is the number of milliseconds to wait between sweeps, see 75 | ZenMonitor.Proxy.Batcher's @sweep_interval for the default value 76 | 77 | This can be controlled at boot and runtime with the {:zen_monitor, :batcher_sweep_interval} 78 | setting, see `ZenMonitor.Proxy.Batcher.sweep_interval/1` for runtime convenience functionality. 79 | """ 80 | @spec sweep_interval() :: integer 81 | def sweep_interval do 82 | Application.get_env(:zen_monitor, :batcher_sweep_interval, @sweep_interval) 83 | end 84 | 85 | @doc """ 86 | Puts the sweep interval into the Application Environment 87 | 88 | This is a simple convenience function to overwrite the {:zen_monitor, :batcher_sweep_interval} 89 | setting at runtime 90 | """ 91 | @spec sweep_interval(value :: integer) :: :ok 92 | def sweep_interval(value) do 93 | Application.put_env(:zen_monitor, :batcher_sweep_interval, value) 94 | end 95 | 96 | @doc """ 97 | Gets the lookup timeout from the Application Environment 98 | 99 | The lookup timeout is the maximum amount of time in milliseconds that the calling process will 100 | wait to lookup or start a Batcher before exiting. 101 | 102 | This can be controlled at boot and runtime with the `{:zen_monitor, :batcher_lookup_timeout}` 103 | setting, see `ZenMonitor.Proxy.Batcher.lookup_timeout/1` for runtime convenience functionality. 104 | """ 105 | @spec lookup_timeout() :: timeout() 106 | def lookup_timeout do 107 | Application.get_env(:zen_monitor, :batcher_lookup_timeout, @lookup_timeout) 108 | end 109 | 110 | @doc """ 111 | Puts the lookup timeout into the Application Environment 112 | 113 | This is a simple convenience function to overwrite the {:zen_monitor, :batcher_lookup_timeout} 114 | setting at runtime. 
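For illustration only (an arbitrary value, not a recommendation):

    ZenMonitor.Proxy.Batcher.lookup_timeout(60_000)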
115 | """ 116 | @spec lookup_timeout(timeout :: timeout()) :: :ok 117 | def lookup_timeout(timeout) do 118 | Application.put_env(:zen_monitor, :batcher_lookup_timeout, timeout) 119 | end 120 | 121 | 122 | @doc """ 123 | Gets the chunk size from the Application Environment 124 | 125 | The chunk size is the maximum number of death certificates that will be sent during each sweep, 126 | see ZenMonitor.Proxy.Batcher's @chunk_size for the default value 127 | 128 | This can be controlled at boot and runtime with the {:zen_monitor, :batcher_chunk_size} 129 | setting, see ZenMonitor.Proxy.Batcher.chunk_size/1 for runtime convenience functionality. 130 | """ 131 | @spec chunk_size() :: integer 132 | def chunk_size do 133 | Application.get_env(:zen_monitor, :batcher_chunk_size, @chunk_size) 134 | end 135 | 136 | @doc """ 137 | Puts the chunk size into the Application Environment 138 | 139 | This is a simple convenience function to overwrite the {:zen_monitor, :batcher_chunk_size} 140 | setting at runtime. 141 | """ 142 | @spec chunk_size(value :: integer) :: :ok 143 | def chunk_size(value) do 144 | Application.put_env(:zen_monitor, :batcher_chunk_size, value) 145 | end 146 | 147 | ## Server 148 | 149 | def init(subscriber) do 150 | Process.monitor(subscriber) 151 | schedule_sweep() 152 | {:ok, %State{subscriber: subscriber}} 153 | end 154 | 155 | @doc """ 156 | Handle enqueuing a new death_certificate 157 | 158 | Simply puts it in the batch queue. 159 | """ 160 | def handle_cast({:enqueue, pid, reason}, %State{batch: batch, length: length} = state) do 161 | increment("enqueue") 162 | {:noreply, %State{state | batch: :queue.in({pid, reason}, batch), length: length + 1}} 163 | end 164 | 165 | # Handle the subscriber crashing 166 | # When the subscriber crashes there is no point in continuing to run, so the Batcher stops. 167 | def handle_info( 168 | {:DOWN, _, :process, subscriber, reason}, 169 | %State{subscriber: subscriber} = state 170 | ) do 171 | # The subscriber process has crashed, clean up the subscribers table 172 | :ets.match_delete(Tables.subscribers(), {{:_, subscriber}}) 173 | {:stop, {:shutdown, {:subscriber_down, reason}}, state} 174 | end 175 | 176 | # Handle sweep 177 | # Every sweep the batcher will send the death_certificates batched up since the last sweep to the 178 | # subscriber. After that it will schedule another sweep. 
179 | def handle_info(:sweep, %State{} = state) do 180 | new_state = do_sweep(state) 181 | schedule_sweep() 182 | {:noreply, new_state} 183 | end 184 | 185 | ## Private 186 | 187 | @spec do_sweep(state :: State.t()) :: State.t() 188 | defp do_sweep(%State{length: 0} = state), do: state 189 | 190 | defp do_sweep(%State{subscriber: subscriber, batch: batch, length: length} = state) do 191 | {summary, overflow, new_length} = chunk(batch, length) 192 | increment("sweep", length - new_length) 193 | Process.send(subscriber, {:dead, node(), :queue.to_list(summary)}, [:noconnect]) 194 | %State{state | batch: overflow, length: new_length} 195 | end 196 | 197 | @spec chunk(batch :: :queue.queue(), length :: integer) :: 198 | {:queue.queue(), :queue.queue(), integer} 199 | defp chunk(batch, length) do 200 | size = chunk_size() 201 | 202 | if length <= size do 203 | {batch, :queue.new(), 0} 204 | else 205 | {summary, overflow} = :queue.split(size, batch) 206 | {summary, overflow, length - size} 207 | end 208 | end 209 | 210 | @spec schedule_sweep() :: reference 211 | defp schedule_sweep do 212 | Process.send_after(self(), :sweep, sweep_interval()) 213 | end 214 | end 215 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Supervisor do 2 | @moduledoc """ 3 | Supervisor for the `ZenMonitor.Proxy` components. 4 | 5 | See `ZenMonitor.Proxy`, `ZenMonitor.Proxy.Tables`, and `ZenMonitor.Proxy.Batcher` for more 6 | information about the supervised processes. 7 | 8 | There are many `ZenMonitor.Proxy.Batcher` processes, which are managed by a `GenRegistry`. 9 | These are keyed by the pid of the `ZenMonitor.Local.Connector` the Batcher is responsible for. 10 | 11 | This supervisor uses the `:rest_for_one` strategy, so the order of the children is important and 12 | should not be altered. 13 | """ 14 | use Supervisor 15 | 16 | def start_link(_opts \\ []) do 17 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 18 | end 19 | 20 | def init(_opts) do 21 | children = [ 22 | ZenMonitor.Proxy.Tables, 23 | ZenMonitor.Proxy, 24 | GenRegistry.Spec.child_spec(ZenMonitor.Proxy.Batcher) 25 | ] 26 | 27 | Supervisor.init(children, strategy: :rest_for_one) 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/zen_monitor/proxy/tables.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Tables do 2 | @moduledoc """ 3 | `ZenMonitor.Proxy.Tables` owns the tables that are shared between multiple `ZenMonitor.Proxy` 4 | components. 5 | 6 | See `subscribers/0` for more information. 7 | """ 8 | use GenServer 9 | 10 | @subscriber_table Module.concat(__MODULE__, "Subscribers") 11 | 12 | ## Client 13 | 14 | def start_link(_opts \\ []) do 15 | GenServer.start_link(__MODULE__, [], name: __MODULE__) 16 | end 17 | 18 | @doc """ 19 | Subscribers holds information about who is subscribed to each pid. 20 | 21 | This information is stored in the following structure: 22 | 23 | { { monitored_pid, subscriber } } 24 | ^-----------key-------------^ 25 | 26 | `monitored_pid` is the local process that is being monitored. 
27 | 28 | `subscriber` is the remote `ZenMonitor.Local.Connector` that is interested in the `monitored_pid` 29 | """ 30 | @spec subscribers() :: :ets.tab() 31 | def subscribers do 32 | @subscriber_table 33 | end 34 | 35 | ## Server 36 | 37 | def init(_opts) do 38 | @subscriber_table = 39 | :ets.new(@subscriber_table, [:public, :named_table, :ordered_set, write_concurrency: true]) 40 | 41 | {:ok, nil} 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/zen_monitor/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Supervisor do 2 | @moduledoc """ 3 | ZenMonitor.Supervisor is a convenience Supervisor that starts the Local and Proxy Supervisors 4 | 5 | See ZenMonitor.Local.Supervisor and ZenMonitor.Proxy.Supervisor for more information. 6 | """ 7 | use Supervisor 8 | 9 | def start_link(_opts \\ []) do 10 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 11 | end 12 | 13 | def init(_opts) do 14 | children = [ 15 | ZenMonitor.Local.Supervisor, 16 | ZenMonitor.Proxy.Supervisor 17 | ] 18 | 19 | Supervisor.init(children, strategy: :one_for_one) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/zen_monitor/truncator.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Truncator do 2 | @moduledoc """ 3 | ZenMonitor.Truncator is used to truncate error messages to prevent error expansion issues. 4 | 5 | ## Error Expansion 6 | 7 | At the core of ZenMonitor is a system that collects local `:DOWN` messages, batches them up and 8 | relays them in bulk. This opens up a failure mode where each `:DOWN` message individually is 9 | deliverable, but the bulk summary grows to an unsupportable size due to the aggregation of large 10 | reason payloads. 11 | 12 | If no truncation is performed then the payload can cause instability on the sender or the 13 | receiver side. 14 | 15 | ## Truncation Behavior 16 | 17 | ZenMonitor will truncate error reasons if they exceed a certain size to prevent Error Expansion 18 | from breaking either the sender or the receiver. 19 | 20 | Truncation is performed recursively on the term up to a maximum depth which can be provided to 21 | the `ZenMonitor.Truncator.truncate/2` function. 22 | 23 | See below for an explanation of how the Truncator treats different values 24 | 25 | ### Pass-Through Values 26 | 27 | There are a number of types that the Truncator will pass through unmodified. 28 | 29 | - Atoms 30 | - Pids 31 | - Numbers 32 | - References 33 | - Ports 34 | - Binaries less than `@max_binary_size` (see the Binary section below for more information) 35 | 36 | ### Binaries 37 | 38 | There is a configurable value `@max_binary_size` any binary encountered over this size will be 39 | truncated to `@max_binary_size - 3` and a trailing '...' will be appended to indicate the value 40 | has been truncated. This guarantees that no binary will appear in the term with size greater 41 | than `@max_binary_size` 42 | 43 | ### Tuples 44 | 45 | 0-tuples through 4-tuples will be passed through with their interior terms recursively 46 | truncated. If a tuple has more than 4 elements, it will be replaced with the `:truncated` atom. 47 | 48 | ### Lists 49 | 50 | Lists with 0 to 4 elements will be passed through with each element recursively truncated. If a 51 | list has more than 4 elements, it will be replaced with the `:truncated` atom. 
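For illustration, assuming the default configuration:

    ZenMonitor.Truncator.truncate([1, 2, 3])
    #=> [1, 2, 3]

    ZenMonitor.Truncator.truncate([1, 2, 3, 4, 5])
    #=> :truncated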
52 | 
53 | ### Maps
54 | 
55 | Maps with a `map_size/1` less than 5 will be passed through with each value recursively
56 | truncated. If a map has a size of 5 or greater then it will be replaced with the `:truncated`
57 | atom.
58 | 
59 | ### Structs
60 | 
61 | Structs are converted into maps and then the map rules are applied; they are then converted back
62 | into structs. The effect is that a Struct with 4 fields or fewer will be retained (with all
63 | values recursively truncated) while Structs with 5 or more fields will be replaced with the
64 | `:truncated` atom.
65 | 
66 | ### Recursion Limit
67 | 
68 | The Truncator will only descend up to the `depth` argument passed into
69 | `ZenMonitor.Truncator.truncate/2`, regardless of the term's value; if the recursion descends deeper
70 | than this value then the `:truncated` atom will be used in place of the original value.
71 | 
72 | ## Configuration
73 | 
74 | `ZenMonitor.Truncator` exposes two different configuration options, and allows for one call-site
75 | override. The configuration options are evaluated at compile time, changing these values at
76 | run-time (through a facility like `Application.put_env/3`) will have no effect.
77 | 
78 | Both configuration options reside under the `:zen_monitor` app key.
79 | 
80 | `:max_binary_size` is the size in bytes over which the Truncator will truncate the binary. The
81 | largest binary returned by the Truncator is defined to be max_binary_size + 3; this is
82 | because when the Truncator truncates a binary it will append `...` to indicate that truncation
83 | has occurred.
84 | 
85 | `:truncation_depth` is the default depth that the Truncator will recursively descend into the
86 | term to be truncated. This is the value used by `ZenMonitor.Truncator.truncate/2` if no second
87 | argument is provided; providing a call-site second argument will override this configuration.
88 | """
89 | 
90 | @max_binary_size Application.get_env(:zen_monitor, :max_binary_size, 1024)
91 | @truncation_binary_size @max_binary_size - 3
92 | @truncation_depth Application.get_env(:zen_monitor, :truncation_depth, 3)
93 | 
94 | @doc """
95 | Truncates a term to a given depth
96 | 
97 | See the module documentation for more information about how truncation works.
98 | """
99 | @spec truncate(term, depth :: pos_integer()) :: term
100 | def truncate(term, depth \\ @truncation_depth) do
101 | do_truncate(term, 0, depth)
102 | end
103 | 
104 | ## Private
105 | 
106 | defp do_truncate({:shutdown, _} = shutdown, 0, _) do
107 | shutdown
108 | end
109 | 
110 | defp do_truncate(_, current, max_depth) when current >= max_depth do
111 | :truncated
112 | end
113 | 
114 | defp do_truncate(atom, _, _) when is_atom(atom), do: atom
115 | 
116 | defp do_truncate(pid, _, _) when is_pid(pid), do: pid
117 | 
118 | defp do_truncate(number, _, _) when is_number(number), do: number
119 | 
120 | defp do_truncate(bin, _, _) when is_binary(bin) and byte_size(bin) <= @max_binary_size, do: bin
121 | 
122 | defp do_truncate(<<first_chunk::binary-size(@truncation_binary_size), _::binary>>, _, _) do
123 | first_chunk <> "..."
124 | end 125 | 126 | defp do_truncate(ref, _, _) when is_reference(ref), do: ref 127 | 128 | defp do_truncate(port, _, _) when is_port(port), do: port 129 | 130 | # Tuples 131 | defp do_truncate({a, b, c, d}, current, max_depth) do 132 | next = current + 1 133 | 134 | {do_truncate(a, next, max_depth), do_truncate(b, next, max_depth), 135 | do_truncate(c, next, max_depth), do_truncate(d, next, max_depth)} 136 | end 137 | 138 | defp do_truncate({a, b, c}, current, max_depth) do 139 | next = current + 1 140 | 141 | {do_truncate(a, next, max_depth), do_truncate(b, next, max_depth), 142 | do_truncate(c, next, max_depth)} 143 | end 144 | 145 | defp do_truncate({a, b}, current, max_depth) do 146 | next = current + 1 147 | {do_truncate(a, next, max_depth), do_truncate(b, next, max_depth)} 148 | end 149 | 150 | defp do_truncate({a}, current, max_depth) do 151 | next = current + 1 152 | {do_truncate(a, next, max_depth)} 153 | end 154 | 155 | defp do_truncate({} = tuple, _, _) do 156 | tuple 157 | end 158 | 159 | # Lists 160 | defp do_truncate([_, _, _, _] = l, current, max_depth) do 161 | do_truncate_list(l, current, max_depth) 162 | end 163 | 164 | defp do_truncate([_, _, _] = l, current, max_depth) do 165 | do_truncate_list(l, current, max_depth) 166 | end 167 | 168 | defp do_truncate([_, _] = l, current, max_depth) do 169 | do_truncate_list(l, current, max_depth) 170 | end 171 | 172 | defp do_truncate([_] = l, current, max_depth) do 173 | do_truncate_list(l, current, max_depth) 174 | end 175 | 176 | defp do_truncate([], _, _) do 177 | [] 178 | end 179 | 180 | # Maps / Structs 181 | defp do_truncate(%struct_module{} = struct, current, max_depth) do 182 | truncated_value = 183 | struct 184 | |> Map.from_struct() 185 | |> do_truncate(current, max_depth) 186 | 187 | if is_map(truncated_value) do 188 | # Don't use Kernel.struct/2 because that crashes if this node 189 | # does not have the code for struct_module. 
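# Illustrative example (hypothetical module): a %RemoteApp.Error{} arriving from another node
# keeps its original struct name this way, even if RemoteApp.Error is not loaded locally.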
190 | Map.put(truncated_value, :__struct__, struct_module) 191 | else 192 | truncated_value 193 | end 194 | end 195 | 196 | defp do_truncate(%{} = m, current, max_depth) when map_size(m) < 5 do 197 | for {k, v} <- m, into: %{} do 198 | {k, do_truncate(v, current + 1, max_depth)} 199 | end 200 | end 201 | 202 | # Catch all 203 | defp do_truncate(_, _, _) do 204 | :truncated 205 | end 206 | 207 | defp do_truncate_list(l, current, max_depth) do 208 | Enum.map(l, &do_truncate(&1, current + 1, max_depth)) 209 | end 210 | end 211 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Mixfile do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :zen_monitor, 7 | name: "ZenMonitor", 8 | version: "2.1.0", 9 | elixir: "~> 1.7", 10 | start_permanent: Mix.env() == :prod, 11 | aliases: aliases(), 12 | deps: deps(), 13 | docs: docs(), 14 | elixirc_paths: elixirc_paths(Mix.env()), 15 | package: package() 16 | ] 17 | end 18 | 19 | def application do 20 | [ 21 | extra_applications: [:logger, :instruments], 22 | mod: {ZenMonitor.Application, []} 23 | ] 24 | end 25 | 26 | defp aliases do 27 | [ 28 | test: "test --no-start" 29 | ] 30 | end 31 | 32 | defp deps do 33 | [ 34 | {:gen_stage, "~> 1.0"}, 35 | {:instruments, "~> 2.1"}, 36 | {:gen_registry, "~> 1.0"}, 37 | {:ex_doc, "~> 0.27.3", only: :dev, runtime: false}, 38 | {:dialyxir, "~> 1.0", only: :dev, runtime: false} 39 | ] 40 | end 41 | 42 | defp docs do 43 | [ 44 | name: "ZenMonitor", 45 | extras: ["README.md"], 46 | main: "readme", 47 | source_url: "https://github.com/discordapp/zen_monitor", 48 | groups_for_modules: [ 49 | "Programmer Interface": [ 50 | ZenMonitor 51 | ], 52 | "Local ZenMonitor System": [ 53 | ZenMonitor.Local, 54 | ZenMonitor.Local.State, 55 | ZenMonitor.Local.Connector, 56 | ZenMonitor.Local.Connector.State, 57 | ZenMonitor.Local.Dispatcher, 58 | ZenMonitor.Local.Tables 59 | ], 60 | "Proxy ZenMonitor System": [ 61 | ZenMonitor.Proxy, 62 | ZenMonitor.Proxy.State, 63 | ZenMonitor.Proxy.Batcher, 64 | ZenMonitor.Proxy.Batcher.State, 65 | ZenMonitor.Proxy.Tables 66 | ], 67 | "Supervisors / OTP / Utilities": [ 68 | ZenMonitor.Application, 69 | ZenMonitor.Supervisor, 70 | ZenMonitor.Local.Supervisor, 71 | ZenMonitor.Proxy.Supervisor, 72 | ZenMonitor.Metrics, 73 | ZenMonitor.Truncator 74 | ] 75 | ] 76 | ] 77 | end 78 | 79 | defp elixirc_paths(:test) do 80 | elixirc_paths(:any) ++ ["test/support"] 81 | end 82 | 83 | defp elixirc_paths(_) do 84 | ["lib"] 85 | end 86 | 87 | defp package() do 88 | [ 89 | name: :zen_monitor, 90 | description: "ZenMonitor provides efficient monitoring of remote processes.", 91 | maintainers: ["Discord Core Infrastructure"], 92 | licenses: ["MIT"], 93 | links: %{ 94 | "GitHub" => "https://github.com/discordapp/zen_monitor" 95 | } 96 | ] 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "dialyxir": {:hex, :dialyxir, "1.0.0", "6a1fa629f7881a9f5aaf3a78f094b2a51a0357c843871b8bc98824e7342d00a5", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "aeb06588145fac14ca08d8061a142d52753dbc2cf7f0d00fc1013f53f8654654"}, 3 | "earmark": {:hex, :earmark, "1.4.10", "bddce5e8ea37712a5bfb01541be8ba57d3b171d3fa4f80a0be9bcf1db417bcaf", [:mix], [{:earmark_parser, ">= 1.4.10", [hex: 
:earmark_parser, repo: "hexpm", optional: false]}], "hexpm", "12dbfa80810478e521d3ffb941ad9fbfcbbd7debe94e1341b4c4a1b2411c1c27"}, 4 | "earmark_parser": {:hex, :earmark_parser, "1.4.29", "149d50dcb3a93d9f3d6f3ecf18c918fb5a2d3c001b5d3305c926cddfbd33355b", [:mix], [], "hexpm", "4902af1b3eb139016aed210888748db8070b8125c2342ce3dcae4f38dcc63503"}, 5 | "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"}, 6 | "ex_doc": {:hex, :ex_doc, "0.27.3", "d09ed7ab590b71123959d9017f6715b54a448d76b43cf909eb0b2e5a78a977b2", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "ee60b329d08195039bfeb25231a208749be4f2274eae42ce38f9be0538a2f2e6"}, 7 | "gen_registry": {:hex, :gen_registry, "1.0.2", "b7175cf940e5d13da5a90d283974e7f9c64d9b87cb4ceb4f2cbacf95e5260215", [:mix], [], "hexpm", "51ebb0556e9469faeb737d9c8d6112df7fbd27c68bdf308e3d3572a231e7f5d8"}, 8 | "gen_stage": {:hex, :gen_stage, "1.1.2", "b1656cd4ba431ed02c5656fe10cb5423820847113a07218da68eae5d6a260c23", [:mix], [], "hexpm", "9e39af23140f704e2b07a3e29d8f05fd21c2aaf4088ff43cb82be4b9e3148d02"}, 9 | "instruments": {:hex, :instruments, "2.1.1", "e6629f71048e963e941263494420720c41554d95b1779ffe3404bf22cecb4efd", [:mix], [{:recon, "~> 2.3.1", [hex: :recon, repo: "hexpm", optional: false]}, {:statix, "~> 1.2.1", [hex: :statix, repo: "hexpm", optional: false]}], "hexpm", "f295ddf3fb09ac37f915cd1b2bd2f4dbcafc2706730237a4bf30aa846a65ada5"}, 10 | "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, 11 | "makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"}, 12 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 13 | "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, 14 | "recon": {:hex, :recon, "2.3.6", "2bcad0cf621fb277cabbb6413159cd3aa30265c2dee42c968697988b30108604", [:rebar3], [], "hexpm", "f55198650a8ec01d3efc04797abe550c7d023e7ff8b509f373cf933032049bd8"}, 15 | "statix": {:hex, :statix, "1.2.1", "4f23c8cc2477ea0de89fed5e34f08c54b0d28b838f7b8f26613155f2221bb31e", [:mix], [], "hexpm", "7f988988fddcce19ae376bb8e47aa5ea5dabf8d4ba78d34d1ae61eb537daf72e"}, 16 | } 17 | -------------------------------------------------------------------------------- /test/black_box_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.BlackBox.Test do 2 | @moduledoc """ 3 
| This test suite treats the ZenMonitor system as a black box and simply asserts that the client 4 | facing behavior is correct. 5 | """ 6 | use ExUnit.Case 7 | 8 | alias ZenMonitor.Local.{Connector, Dispatcher} 9 | alias ZenMonitor.Proxy.Batcher 10 | 11 | setup do 12 | start_supervised(ZenMonitor.Supervisor) 13 | {:ok, down: :down@down, remotes: []} 14 | end 15 | 16 | @doc """ 17 | Reduces the intervals for all the batching parts of ZenMonitor so that the default 18 | assert_receive / refute_receive timeouts are an order of magnitude larger. 19 | """ 20 | def fast_zen_monitor(ctx) do 21 | # Tune the local dispatcher 22 | original_demand_interval = Dispatcher.demand_interval() 23 | Dispatcher.demand_interval(10) 24 | 25 | # Tune the local connector 26 | original_connector_interval = Connector.sweep_interval() 27 | Connector.sweep_interval(10) 28 | 29 | # Tune the remote batchers 30 | original_batch_intervals = 31 | Enum.map([node() | ctx.remotes], fn remote -> 32 | original = :rpc.call(remote, Batcher, :sweep_interval, []) 33 | :rpc.call(remote, Batcher, :sweep_interval, [10]) 34 | {remote, original} 35 | end) 36 | 37 | on_exit(fn -> 38 | # Restore the local settings 39 | Dispatcher.demand_interval(original_demand_interval) 40 | Connector.sweep_interval(original_connector_interval) 41 | 42 | # Restore the remote settings 43 | Enum.each(original_batch_intervals, fn {remote, original} -> 44 | :rpc.call(remote, Batcher, :sweep_interval, [original]) 45 | end) 46 | end) 47 | 48 | :ok 49 | end 50 | 51 | def start_compatible_remote(ctx) do 52 | {:ok, compatible, nil} = ChildNode.start_link(:zen_monitor, :Compatible) 53 | 54 | # Perform an initial connect like discovery would 55 | Node.connect(compatible) 56 | 57 | on_exit(fn -> 58 | Node.monitor(compatible, true) 59 | 60 | receive do 61 | {:nodedown, ^compatible} -> :ok 62 | end 63 | end) 64 | 65 | {:ok, compatible: compatible, remotes: [compatible | ctx.remotes()]} 66 | end 67 | 68 | def start_incompatible_remote(_) do 69 | {:ok, incompatible, nil} = ChildNode.start_link(:elixir, :Incompatible) 70 | 71 | # Perform an initial connect like discovery would 72 | Node.connect(incompatible) 73 | 74 | on_exit(fn -> 75 | Node.monitor(incompatible, true) 76 | 77 | receive do 78 | {:nodedown, ^incompatible} -> :ok 79 | end 80 | end) 81 | 82 | {:ok, incompatible: incompatible} 83 | end 84 | 85 | def start_compatible_processes(ctx) do 86 | compatible_pid = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 87 | compatible_pid_b = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 88 | 89 | {:ok, compatible_pid: compatible_pid, compatible_pid_b: compatible_pid_b} 90 | end 91 | 92 | def start_incompatible_processes(ctx) do 93 | incompatible_pid = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 94 | incompatible_pid_b = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 95 | 96 | {:ok, incompatible_pid: incompatible_pid, incompatible_pid_b: incompatible_pid_b} 97 | end 98 | 99 | def start_local_processes(_) do 100 | local_pid = spawn(fn -> Process.sleep(:infinity) end) 101 | local_pid_b = spawn(fn -> Process.sleep(:infinity) end) 102 | 103 | {:ok, local_pid: local_pid, local_pid_b: local_pid_b} 104 | end 105 | 106 | describe "Monitoring a local process" do 107 | setup [:fast_zen_monitor, :start_local_processes] 108 | 109 | test "monitoring a local process returns a reference", ctx do 110 | ref = ZenMonitor.monitor(ctx.local_pid) 111 | assert is_reference(ref) 112 | end 113 | 114 | test "local process returns a :DOWN message if it 
goes down", ctx do 115 | target = ctx.local_pid() 116 | 117 | # Monitor the local process 118 | ref = ZenMonitor.monitor(target) 119 | Helper.await_monitor_established(ref, target) 120 | 121 | # Kill the local process 122 | Process.exit(target, :kill) 123 | Helper.await_monitor_cleared(ref, target) 124 | 125 | # Assert that we receive the down messages 126 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :killed}} 127 | 128 | # Make sure that we don't receive any additional messages 129 | refute_receive {:DOWN, _, _, _, _} 130 | end 131 | 132 | test "multiple monitors all get fired", ctx do 133 | target = ctx.local_pid() 134 | 135 | # Monitor the local process multiple times 136 | ref_a = ZenMonitor.monitor(target) 137 | ref_b = ZenMonitor.monitor(target) 138 | ref_c = ZenMonitor.monitor(target) 139 | Helper.await_monitors_established([ref_a, ref_b, ref_c], target) 140 | 141 | # Kill the local process 142 | Process.exit(target, :kill) 143 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 144 | 145 | # Assert that we receive down message for each monitor 146 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :killed}} 147 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :killed}} 148 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :killed}} 149 | 150 | # Make sure that we don't receive any additional messages 151 | refute_receive {:DOWN, _, _, _, _} 152 | end 153 | 154 | test "an already down local process returns a :DOWN message", ctx do 155 | target = ctx.local_pid() 156 | 157 | # Kill the local process, before the monitors 158 | Process.exit(target, :kill) 159 | 160 | # Monitor the local process 161 | ref = ZenMonitor.monitor(target) 162 | Helper.await_monitor_cleared(ref, target) 163 | 164 | # Assert that we receive the correct reason 165 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :noproc}} 166 | 167 | # Make sure that we don't receive any additional messages 168 | refute_receive {:DOWN, _, _, _, _} 169 | end 170 | 171 | test "multiple monitors for already down local process returns :DOWN messages", ctx do 172 | target = ctx.local_pid() 173 | 174 | # Kill the local process, before the monitors 175 | Process.exit(target, :kill) 176 | 177 | # Monitor the local process multiple times 178 | ref_a = ZenMonitor.monitor(target) 179 | ref_b = ZenMonitor.monitor(target) 180 | ref_c = ZenMonitor.monitor(target) 181 | 182 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 183 | 184 | # Assert that we receive multiple :DOWN messages with the correct reason 185 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :noproc}} 186 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :noproc}} 187 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :noproc}} 188 | 189 | # Make sure that we don't receive any additional messages 190 | refute_receive {:DOWN, _, _, _, _} 191 | end 192 | 193 | test "mixed monitors established before and after process down", ctx do 194 | target = ctx.local_pid() 195 | 196 | # Establish some monitors before the pid is killed 197 | ref_alive_a = ZenMonitor.monitor(target) 198 | ref_alive_b = ZenMonitor.monitor(target) 199 | Helper.await_monitors_established([ref_alive_a, ref_alive_b], target) 200 | 201 | # Kill the local process 202 | Process.exit(target, :kill) 203 | Helper.await_monitors_cleared([ref_alive_a, ref_alive_b], target) 204 | 205 | # Assert that the initial monitors fire 206 | assert_receive {:DOWN, 
^ref_alive_a, :process, ^target, {:zen_monitor, :killed}} 207 | assert_receive {:DOWN, ^ref_alive_b, :process, ^target, {:zen_monitor, :killed}} 208 | 209 | # Establish some monitors after the pid is killed 210 | ref_dead_a = ZenMonitor.monitor(target) 211 | ref_dead_b = ZenMonitor.monitor(target) 212 | 213 | Helper.await_monitors_cleared([ref_dead_a, ref_dead_b], target) 214 | 215 | # Assert that the new monitors got the expected :DOWN messages with the correct reason 216 | assert_receive {:DOWN, ^ref_dead_a, :process, ^target, {:zen_monitor, :noproc}} 217 | assert_receive {:DOWN, ^ref_dead_b, :process, ^target, {:zen_monitor, :noproc}} 218 | 219 | # Make sure that we don't receive any additional messages 220 | refute_receive {:DOWN, _, _, _, _} 221 | end 222 | 223 | test "multiple down processes all report back as :DOWN", ctx do 224 | target = ctx.local_pid() 225 | other = ctx.local_pid_b() 226 | 227 | # Establish multiple monitors for each process 228 | target_ref_a = ZenMonitor.monitor(target) 229 | target_ref_b = ZenMonitor.monitor(target) 230 | other_ref_a = ZenMonitor.monitor(other) 231 | other_ref_b = ZenMonitor.monitor(other) 232 | 233 | Helper.await_monitors_established([target_ref_a, target_ref_b], target) 234 | Helper.await_monitors_established([other_ref_a, other_ref_b], other) 235 | 236 | # Kill both local processes 237 | Process.exit(target, :kill) 238 | Process.exit(other, :kill) 239 | 240 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 241 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 242 | 243 | # Assert that we receive all the expected :DOWN messages 244 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :killed}} 245 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :killed}} 246 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 247 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 248 | end 249 | 250 | test "multiple already down process all report back as :DOWN", ctx do 251 | target = ctx.local_pid() 252 | other = ctx.local_pid_b() 253 | 254 | # Kill both local processes 255 | Process.exit(target, :kill) 256 | Process.exit(other, :kill) 257 | 258 | # Establish multiple monitors for each process 259 | target_ref_a = ZenMonitor.monitor(target) 260 | target_ref_b = ZenMonitor.monitor(target) 261 | other_ref_a = ZenMonitor.monitor(other) 262 | other_ref_b = ZenMonitor.monitor(other) 263 | 264 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 265 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 266 | 267 | # Assert that we receive all the expected :DOWN messages 268 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 269 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 270 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :noproc}} 271 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :noproc}} 272 | end 273 | 274 | test "mixed down processes all report back as :DOWN", ctx do 275 | target = ctx.local_pid() 276 | other = ctx.local_pid_b() 277 | 278 | # Kill target before establishing any monitors 279 | Process.exit(target, :kill) 280 | 281 | # Establish multiple monitors for each process 282 | target_ref_a = ZenMonitor.monitor(target) 283 | target_ref_b = ZenMonitor.monitor(target) 284 | other_ref_a = ZenMonitor.monitor(other) 285 | other_ref_b = ZenMonitor.monitor(other) 286 
| Helper.await_monitors_established([other_ref_a, other_ref_b], other) 287 | 288 | # Kill other after establishing the monitors 289 | Process.exit(other, :kill) 290 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 291 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 292 | 293 | # Assert that we receive all the expected :DOWN messages 294 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 295 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 296 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 297 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 298 | end 299 | end 300 | 301 | describe "Monitoring a remote process on a compatible node" do 302 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 303 | 304 | test "monitoring a remote process returns a reference", ctx do 305 | ref = ZenMonitor.monitor(ctx.compatible_pid) 306 | assert is_reference(ref) 307 | end 308 | 309 | test "remote process returns a :DOWN message if it goes down", ctx do 310 | target = ctx.compatible_pid() 311 | 312 | # Monitor the remote process 313 | ref = ZenMonitor.monitor(target) 314 | Helper.await_monitor_established(ref, target) 315 | 316 | # Kill the remote process 317 | Process.exit(target, :kill) 318 | Helper.await_monitor_cleared(ref, target) 319 | 320 | # Assert that we receive the down messages 321 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :killed}} 322 | 323 | # Make sure that we don't receive any additional messages 324 | refute_receive {:DOWN, _, _, _, _} 325 | end 326 | 327 | test "multiple monitors all get fired", ctx do 328 | target = ctx.compatible_pid() 329 | 330 | # Monitor the remote process multiple times 331 | ref_a = ZenMonitor.monitor(target) 332 | ref_b = ZenMonitor.monitor(target) 333 | ref_c = ZenMonitor.monitor(target) 334 | Helper.await_monitors_established([ref_a, ref_b, ref_c], target) 335 | 336 | # Kill the remote process 337 | Process.exit(target, :kill) 338 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 339 | 340 | # Assert that we receive down message for each monitor 341 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :killed}} 342 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :killed}} 343 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :killed}} 344 | 345 | # Make sure that we don't receive any additional messages 346 | refute_receive {:DOWN, _, _, _, _} 347 | end 348 | 349 | test "an already down remote process returns a :DOWN message", ctx do 350 | target = ctx.compatible_pid() 351 | 352 | # Kill the remote process, before the monitors 353 | Process.exit(target, :kill) 354 | 355 | # Monitor the remote process 356 | ref = ZenMonitor.monitor(target) 357 | Helper.await_monitor_cleared(ref, target) 358 | # Assert that we receive the correct reason 359 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :noproc}} 360 | 361 | # Make sure that we don't receive any additional messages 362 | refute_receive {:DOWN, _, _, _, _} 363 | end 364 | 365 | test "multiple monitors for already down remote process returns :DOWN messages", ctx do 366 | target = ctx.compatible_pid() 367 | 368 | # Kill the remote process, before the monitors 369 | Process.exit(target, :kill) 370 | 371 | # Monitor the remote process multiple times 372 | ref_a = ZenMonitor.monitor(target) 373 | ref_b =
ZenMonitor.monitor(target) 374 | ref_c = ZenMonitor.monitor(target) 375 | 376 | Helper.await_monitors_cleared([ref_a, ref_b, ref_c], target) 377 | 378 | # Assert that we receive multiple :DOWN messages with the correct reason 379 | assert_receive {:DOWN, ^ref_a, :process, ^target, {:zen_monitor, :noproc}} 380 | assert_receive {:DOWN, ^ref_b, :process, ^target, {:zen_monitor, :noproc}} 381 | assert_receive {:DOWN, ^ref_c, :process, ^target, {:zen_monitor, :noproc}} 382 | 383 | # Make sure that we don't receive any additional messages 384 | refute_receive {:DOWN, _, _, _, _} 385 | end 386 | 387 | test "mixed monitors established before and after process down", ctx do 388 | target = ctx.compatible_pid() 389 | 390 | # Establish some monitors before the pid is killed 391 | ref_alive_a = ZenMonitor.monitor(target) 392 | ref_alive_b = ZenMonitor.monitor(target) 393 | Helper.await_monitors_established([ref_alive_a, ref_alive_b], target) 394 | 395 | # Kill the remote process 396 | Process.exit(target, :kill) 397 | Helper.await_monitors_cleared([ref_alive_a, ref_alive_b], target) 398 | 399 | # Assert that the initial monitors fire 400 | assert_receive {:DOWN, ^ref_alive_a, :process, ^target, {:zen_monitor, :killed}} 401 | assert_receive {:DOWN, ^ref_alive_b, :process, ^target, {:zen_monitor, :killed}} 402 | 403 | # Establish some monitors after the pid is killed 404 | ref_dead_a = ZenMonitor.monitor(target) 405 | ref_dead_b = ZenMonitor.monitor(target) 406 | Helper.await_monitors_cleared([ref_dead_a, ref_dead_b], target) 407 | 408 | # Assert that the new monitors got the expected :DOWN messages with the correct reason 409 | assert_receive {:DOWN, ^ref_dead_a, :process, ^target, {:zen_monitor, :noproc}} 410 | assert_receive {:DOWN, ^ref_dead_b, :process, ^target, {:zen_monitor, :noproc}} 411 | 412 | # Make sure that we don't receive any additional messages 413 | refute_receive {:DOWN, _, _, _, _} 414 | end 415 | 416 | test "multiple down processes all report back as :DOWN", ctx do 417 | target = ctx.compatible_pid() 418 | other = ctx.compatible_pid_b() 419 | 420 | # Establish multiple monitors for each process 421 | target_ref_a = ZenMonitor.monitor(target) 422 | target_ref_b = ZenMonitor.monitor(target) 423 | other_ref_a = ZenMonitor.monitor(other) 424 | other_ref_b = ZenMonitor.monitor(other) 425 | Helper.await_monitors_established([target_ref_a, target_ref_b], target) 426 | Helper.await_monitors_established([other_ref_a, other_ref_b], other) 427 | 428 | # Kill both remote processes 429 | Process.exit(target, :kill) 430 | Process.exit(other, :kill) 431 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 432 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 433 | 434 | # Assert that we receive all the expected :DOWN messages 435 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :killed}} 436 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :killed}} 437 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 438 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 439 | end 440 | 441 | test "multiple already down process all report back as :DOWN", ctx do 442 | target = ctx.compatible_pid() 443 | other = ctx.compatible_pid_b() 444 | 445 | # Kill both remote processes 446 | Process.exit(target, :kill) 447 | Process.exit(other, :kill) 448 | 449 | # Establish multiple monitors for each process 450 | target_ref_a = ZenMonitor.monitor(target) 451 | target_ref_b = 
ZenMonitor.monitor(target) 452 | other_ref_a = ZenMonitor.monitor(other) 453 | other_ref_b = ZenMonitor.monitor(other) 454 | 455 | Helper.await_monitors_cleared([target_ref_a, target_ref_b], target) 456 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 457 | 458 | # Assert that we receive all the expected :DOWN messages 459 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 460 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 461 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :noproc}} 462 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :noproc}} 463 | end 464 | 465 | test "mixed down processes all report back as :DOWN", ctx do 466 | target = ctx.compatible_pid() 467 | other = ctx.compatible_pid_b() 468 | 469 | # Kill target before establishing any monitors 470 | Process.exit(target, :kill) 471 | 472 | # Establish multiple monitors for each process 473 | target_ref_a = ZenMonitor.monitor(target) 474 | target_ref_b = ZenMonitor.monitor(target) 475 | other_ref_a = ZenMonitor.monitor(other) 476 | other_ref_b = ZenMonitor.monitor(other) 477 | Helper.await_monitors_established([other_ref_a, other_ref_b], other) 478 | 479 | # Kill other after establishing the monitors 480 | Process.exit(other, :kill) 481 | Helper.await_monitors_cleared([other_ref_a, other_ref_b], other) 482 | 483 | # Assert that we receive all the expected :DOWN messages 484 | assert_receive {:DOWN, ^target_ref_a, :process, ^target, {:zen_monitor, :noproc}} 485 | assert_receive {:DOWN, ^target_ref_b, :process, ^target, {:zen_monitor, :noproc}} 486 | assert_receive {:DOWN, ^other_ref_a, :process, ^other, {:zen_monitor, :killed}} 487 | assert_receive {:DOWN, ^other_ref_b, :process, ^other, {:zen_monitor, :killed}} 488 | end 489 | 490 | test "all monitored processes report back as :DOWN if the node dies", ctx do 491 | remote = ctx.compatible() 492 | target = ctx.compatible_pid() 493 | other = ctx.compatible_pid_b() 494 | 495 | # Monitor both remote processes 496 | target_ref = ZenMonitor.monitor(target) 497 | other_ref = ZenMonitor.monitor(other) 498 | Helper.await_monitor_established(target_ref, target) 499 | Helper.await_monitor_established(other_ref, other) 500 | 501 | # Stop the remote node 502 | assert :ok = :slave.stop(remote) 503 | Helper.await_monitor_cleared(target_ref, target) 504 | Helper.await_monitor_cleared(other_ref, other) 505 | 506 | # Assert that the :DOWN messages were dispatched with :nodedown 507 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}} 508 | assert_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, :nodedown}} 509 | end 510 | end 511 | 512 | describe "Monitoring a remote process on an incompatible node" do 513 | setup [:start_incompatible_remote, :start_incompatible_processes, :fast_zen_monitor] 514 | 515 | test "monitoring a remote process returns a reference", ctx do 516 | ref = ZenMonitor.monitor(ctx.incompatible_pid) 517 | assert is_reference(ref) 518 | end 519 | 520 | test "monitoring returns down with :nodedown", ctx do 521 | target = ctx.incompatible_pid() 522 | 523 | # Attempt to monitor incompatible node 524 | ref = ZenMonitor.monitor(target) 525 | 526 | # Assert that the :DOWN message with :nodedown is delivered 527 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :nodedown}} 528 | end 529 | 530 | test "monitoring multiple returns multiple downs with :nodedown", ctx do 531 | target =
ctx.incompatible_pid() 532 | other = ctx.incompatible_pid_b() 533 | 534 | # Attempt to monitor all the incompatible processes 535 | target_ref = ZenMonitor.monitor(target) 536 | other_ref = ZenMonitor.monitor(other) 537 | 538 | # Assert that the :DOWN messages with :nodedown are delivered 539 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}} 540 | assert_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, :nodedown}} 541 | end 542 | end 543 | 544 | describe "Monitoring a remote process on a compatible node that becomes incompatible" do 545 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 546 | 547 | test "monitoring a remote process returns a reference", ctx do 548 | ref = ZenMonitor.monitor(ctx.compatible_pid()) 549 | assert is_reference(ref) 550 | end 551 | 552 | test "subscribing to a previously compatible host will cause :nodedown", ctx do 553 | remote = ctx.compatible() 554 | target = ctx.compatible_pid() 555 | other = ctx.compatible_pid_b() 556 | 557 | # Perform an initial monitor 558 | target_ref = ZenMonitor.monitor(target) 559 | Helper.await_monitor_established(target_ref, target) 560 | 561 | # Check that the remote is considered compatible 562 | assert :compatible = ZenMonitor.compatibility_for_node(remote) 563 | 564 | # Make the remote incompatible by killing the ZenMonitor running on it 565 | assert :ok = :rpc.call(remote, Application, :stop, [:zen_monitor]) 566 | 567 | # Perform an additional monitor 568 | other_ref = ZenMonitor.monitor(other) 569 | 570 | Helper.await_monitor_cleared(target_ref, target) 571 | 572 | # Assert that we get notified for both monitored processes 573 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}} 574 | assert_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, :nodedown}} 575 | 576 | # Check that the remote is no longer considered compatible 577 | assert :incompatible = ZenMonitor.compatibility_for_node(remote) 578 | end 579 | end 580 | 581 | describe "Monitoring has process-level multi-tenancy" do 582 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 583 | 584 | test "only the down process sends a :DOWN message", ctx do 585 | target = ctx.compatible_pid() 586 | other = ctx.compatible_pid_b() 587 | 588 | # Monitor both remote processes 589 | target_ref = ZenMonitor.monitor(target) 590 | other_ref = ZenMonitor.monitor(other) 591 | Helper.await_monitor_established(target_ref, target) 592 | Helper.await_monitor_established(other_ref, other) 593 | 594 | # Kill the target process 595 | Process.exit(target, :kill) 596 | Helper.await_monitor_cleared(target_ref, target) 597 | 598 | # Assert that we receive a :DOWN for the target 599 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, _}} 600 | 601 | # Assert that we do not receive a :DOWN for the other process 602 | refute_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, _}} 603 | end 604 | 605 | test "only the already down process sends a :DOWN message", ctx do 606 | target = ctx.compatible_pid() 607 | other = ctx.compatible_pid_b() 608 | 609 | # Kill the target process 610 | Process.exit(target, :kill) 611 | 612 | # Monitor both remote processes 613 | target_ref = ZenMonitor.monitor(target) 614 | other_ref = ZenMonitor.monitor(other) 615 | 616 | # Assert that we receive a :DOWN for the target 617 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, _}} 618 | 619 | # Assert that we do not receive a
:DOWN for the other process 620 | refute_receive {:DOWN, ^other_ref, :process, ^other, {:zen_monitor, _}} 621 | end 622 | end 623 | 624 | describe "Demonitor" do 625 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 626 | 627 | test "prevents :DOWN from being delivered", ctx do 628 | target = ctx.compatible_pid() 629 | 630 | # Monitor the remote process 631 | ref = ZenMonitor.monitor(target) 632 | 633 | # Demonitor the reference 634 | ZenMonitor.demonitor(ref) 635 | 636 | # Kill the process 637 | Process.exit(target, :kill) 638 | 639 | # Assert that nothing was delivered 640 | refute_receive {:DOWN, ^ref, :process, ^target, _} 641 | end 642 | 643 | test ":DOWN sent before demonitor still exists", ctx do 644 | target = ctx.compatible_pid() 645 | 646 | # Monitor the remote process 647 | ref = ZenMonitor.monitor(target) 648 | Helper.await_monitor_established(ref, target) 649 | 650 | # Kill the remote process 651 | Process.exit(target, :kill) 652 | Helper.await_monitor_cleared(ref, target) 653 | 654 | # Demonitor the reference 655 | ZenMonitor.demonitor(ref) 656 | 657 | # Assert that a down message had already been received 658 | assert_received {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}} 659 | end 660 | 661 | test "only affects the demonitored reference", ctx do 662 | target = ctx.compatible_pid() 663 | 664 | # Monitor the remote process twice 665 | ref_to_demonitor = ZenMonitor.monitor(target) 666 | ref_to_keep = ZenMonitor.monitor(target) 667 | Helper.await_monitors_established([ref_to_demonitor, ref_to_keep], target) 668 | 669 | # Demonitor one of the references 670 | ZenMonitor.demonitor(ref_to_demonitor) 671 | 672 | # Kill the remote process 673 | Process.exit(target, :kill) 674 | Helper.await_monitor_cleared(ref_to_keep, target) 675 | 676 | # Assert that the monitor that was not demonitored fired 677 | assert_receive {:DOWN, ^ref_to_keep, :process, ^target, {:zen_monitor, _}} 678 | 679 | # Assert that the demonitored monitor did not fire 680 | refute_receive {:DOWN, ^ref_to_demonitor, :process, ^target, _} 681 | end 682 | end 683 | 684 | describe "Demonitor Flush" do 685 | setup [:start_compatible_remote, :start_compatible_processes, :fast_zen_monitor] 686 | 687 | test "prevents :DOWN from being delivered", ctx do 688 | target = ctx.compatible_pid() 689 | 690 | # Monitor the remote process 691 | ref = ZenMonitor.monitor(target) 692 | Helper.await_monitor_established(ref, target) 693 | 694 | # Demonitor the reference 695 | ZenMonitor.demonitor(ref, [:flush]) 696 | 697 | # Kill the process 698 | Process.exit(target, :kill) 699 | Helper.await_monitor_cleared(ref, target) 700 | 701 | # Assert that nothing was delivered 702 | refute_receive {:DOWN, ^ref, :process, ^target, _} 703 | end 704 | 705 | test ":DOWN sent before demonitor will be consumed by the flush", ctx do 706 | target = ctx.compatible_pid() 707 | 708 | # Monitor the remote process 709 | ref = ZenMonitor.monitor(target) 710 | Helper.await_monitor_established(ref, target) 711 | 712 | # Kill the remote process 713 | Process.exit(target, :kill) 714 | Helper.await_monitor_cleared(ref, target) 715 | 716 | # Demonitor the reference 717 | ZenMonitor.demonitor(ref, [:flush]) 718 | 719 | # Assert that no down message has been received 720 | refute_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}} 721 | end 722 | 723 | test ":flush only removes the flushed reference", ctx do 724 | target = ctx.compatible_pid() 725 | 726 | # Monitor the remote process three times 727 | ref_to_flush =
ZenMonitor.monitor(target) 728 | ref_to_demonitor = ZenMonitor.monitor(target) 729 | ref_to_keep = ZenMonitor.monitor(target) 730 | Helper.await_monitors_established([ref_to_flush, ref_to_demonitor, ref_to_keep], target) 731 | 732 | # Kill the remote process 733 | Process.exit(target, :kill) 734 | Helper.await_monitors_cleared([ref_to_flush, ref_to_demonitor, ref_to_keep], target) 735 | 736 | # Flush one of the references 737 | ZenMonitor.demonitor(ref_to_flush, [:flush]) 738 | 739 | # Demonitor one of the references 740 | ZenMonitor.demonitor(ref_to_demonitor) 741 | 742 | # Assert that the monitor that was not demonitored fired 743 | assert_receive {:DOWN, ^ref_to_keep, :process, ^target, {:zen_monitor, _}} 744 | 745 | # Assert that the demonitored non-flush monitor fired 746 | assert_receive {:DOWN, ^ref_to_demonitor, :process, ^target, {:zen_monitor, _}} 747 | 748 | # Assert that the demonitored and flushed monitor did not fire 749 | refute_receive {:DOWN, ^ref_to_flush, :process, ^target, _} 750 | end 751 | end 752 | 753 | describe "Compatibility For Node" do 754 | setup [:start_compatible_remote, :start_incompatible_remote] 755 | 756 | test "when remote is compatible", ctx do 757 | assert :compatible = ZenMonitor.connect(ctx.compatible) 758 | assert :compatible = ZenMonitor.compatibility_for_node(ctx.compatible) 759 | end 760 | 761 | test "when remote is incompatible", ctx do 762 | assert :incompatible = ZenMonitor.connect(ctx.incompatible) 763 | assert :incompatible = ZenMonitor.compatibility_for_node(ctx.incompatible) 764 | end 765 | 766 | test "when remote is down", ctx do 767 | assert :incompatible = ZenMonitor.connect(ctx.down) 768 | assert :incompatible = ZenMonitor.compatibility_for_node(ctx.down) 769 | end 770 | end 771 | end 772 | -------------------------------------------------------------------------------- /test/local/connector_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Connector.Test do 2 | use ExUnit.Case 3 | 4 | alias ZenMonitor.Local.{Connector, Dispatcher} 5 | alias ZenMonitor.Proxy.Batcher 6 | alias ZenMonitor.Test.Support.ObservableGen 7 | 8 | setup do 9 | {:ok, compatible, nil} = ChildNode.start_link(:zen_monitor, :Compatible) 10 | {:ok, incompatible, nil} = ChildNode.start_link(:elixir, :Incompatible) 11 | 12 | start_supervised(ZenMonitor.Supervisor) 13 | 14 | on_exit(fn -> 15 | Node.monitor(compatible, true) 16 | Node.monitor(incompatible, true) 17 | 18 | receive do 19 | {:nodedown, ^compatible} -> :ok 20 | end 21 | 22 | receive do 23 | {:nodedown, ^incompatible} -> :ok 24 | end 25 | end) 26 | 27 | {:ok, compatible: compatible, incompatible: incompatible, down: :down@down} 28 | end 29 | 30 | def disable_sweep(_) do 31 | # Set sweep interval to 1 minute (effectively disable for this describe block) 32 | original_sweep_interval = Connector.sweep_interval() 33 | Connector.sweep_interval(60_000) 34 | 35 | on_exit(fn -> 36 | Connector.sweep_interval(original_sweep_interval) 37 | end) 38 | 39 | :ok 40 | end 41 | 42 | def reduce_chunk_size(_) do 43 | # Set chunk size to 2 for testing convenience 44 | original_chunk_size = Connector.chunk_size() 45 | Connector.chunk_size(2) 46 | 47 | on_exit(fn -> 48 | Connector.chunk_size(original_chunk_size) 49 | end) 50 | 51 | :ok 52 | end 53 | 54 | def start_remote_process(ctx) do 55 | compatible_pid = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 56 | compatible_pid_b = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 57 | 
compatible_pid_c = Node.spawn(ctx.compatible, Process, :sleep, [:infinity]) 58 | 59 | incompatible_pid = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 60 | incompatible_pid_b = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 61 | incompatible_pid_c = Node.spawn(ctx.incompatible, Process, :sleep, [:infinity]) 62 | 63 | { 64 | :ok, 65 | compatible_pid: compatible_pid, 66 | compatible_pid_b: compatible_pid_b, 67 | compatible_pid_c: compatible_pid_c, 68 | incompatible_pid: incompatible_pid, 69 | incompatible_pid_b: incompatible_pid_b, 70 | incompatible_pid_c: incompatible_pid_c 71 | } 72 | end 73 | 74 | def observe_gen(_) do 75 | # Start up an observer 76 | {:ok, observer} = ObservableGen.start_link(self()) 77 | 78 | # Replace the original rpc_module with the ObservableRPC 79 | original_gen_module = ZenMonitor.gen_module() 80 | ZenMonitor.gen_module(ObservableGen) 81 | 82 | on_exit(fn -> 83 | ZenMonitor.gen_module(original_gen_module) 84 | end) 85 | 86 | {:ok, observer: observer} 87 | end 88 | 89 | def observe_zen_monitor(_) do 90 | Process.unregister(ZenMonitor.Local) 91 | Process.register(self(), ZenMonitor.Local) 92 | :ok 93 | end 94 | 95 | @doc """ 96 | Reduces the intervals for all the batching parts of ZenMonitor so that the default 97 | assert_receive / refute_receive timeouts are an order of magnitude larger. 98 | """ 99 | def fast_zen_monitor(ctx) do 100 | # Tune the local dispatcher 101 | original_demand_interval = Dispatcher.demand_interval() 102 | Dispatcher.demand_interval(10) 103 | 104 | # Tune the local connector 105 | original_connector_interval = Connector.sweep_interval() 106 | Connector.sweep_interval(10) 107 | 108 | # Tune the remote batchers 109 | original_batch_intervals = 110 | Enum.map([node(), ctx.compatible], fn remote -> 111 | original = :rpc.call(remote, Batcher, :sweep_interval, []) 112 | :rpc.call(remote, Batcher, :sweep_interval, [10]) 113 | {remote, original} 114 | end) 115 | 116 | on_exit(fn -> 117 | # Restore the local settings 118 | Dispatcher.demand_interval(original_demand_interval) 119 | Connector.sweep_interval(original_connector_interval) 120 | 121 | # Restore the remote settings 122 | Enum.each(original_batch_intervals, fn {remote, original} -> 123 | :rpc.call(remote, Batcher, :sweep_interval, [original]) 124 | end) 125 | end) 126 | 127 | :ok 128 | end 129 | 130 | describe "Getting a connector" do 131 | test "get connector for compatible remote node", ctx do 132 | connector = Connector.get_for_node(ctx.compatible) 133 | assert Process.alive?(connector) 134 | end 135 | 136 | test "get connector for incompatible remote node", ctx do 137 | connector = Connector.get_for_node(ctx.incompatible) 138 | assert Process.alive?(connector) 139 | end 140 | 141 | test "multiple gets return the same connector", ctx do 142 | connector_a = Connector.get_for_node(ctx.compatible) 143 | connector_b = Connector.get_for_node(ctx.compatible) 144 | 145 | assert connector_a == connector_b 146 | end 147 | 148 | test "new connector after connector is killed", ctx do 149 | original = Connector.get_for_node(ctx.compatible) 150 | assert Process.alive?(original) 151 | 152 | Process.exit(original, :kill) 153 | refute Process.alive?(original) 154 | 155 | replacement = Connector.get_for_node(ctx.compatible) 156 | 157 | replacement = if replacement == original do 158 | Process.sleep(50) 159 | Connector.get_for_node(ctx.compatible) 160 | else 161 | replacement 162 | end 163 | 164 | assert Process.alive?(replacement) 165 | 166 | assert original != replacement 167 | 
end 168 | 169 | test "each remote has its own connector", ctx do 170 | connector_a = Connector.get_for_node(ctx.compatible) 171 | connector_b = Connector.get_for_node(ctx.incompatible) 172 | 173 | assert connector_a != connector_b 174 | end 175 | end 176 | 177 | describe "Performing a connect" do 178 | setup [:observe_gen] 179 | 180 | test "connecting to a compatible remote node", ctx do 181 | compatible = ctx.compatible() 182 | 183 | assert :compatible = Connector.connect(compatible) 184 | assert_receive {:observe, :call, {ZenMonitor.Proxy, ^compatible}, :ping, _} 185 | end 186 | 187 | test "connecting to an incompatible remote node", ctx do 188 | incompatible = ctx.incompatible 189 | 190 | assert :incompatible = Connector.connect(incompatible) 191 | assert_receive {:observe, :call, {ZenMonitor.Proxy, ^incompatible}, :ping, _} 192 | end 193 | 194 | test "connecting to a down remote node", ctx do 195 | down = ctx.down() 196 | 197 | assert :incompatible = Connector.connect(down) 198 | refute_receive {:observe, :call, _, _, _} 199 | end 200 | end 201 | 202 | describe "Connect status caching" do 203 | setup [:disable_sweep] 204 | 205 | test "miss when never connected", ctx do 206 | assert :miss = Connector.cached_compatibility(ctx.compatible) 207 | end 208 | 209 | test "compatible hit after successful connection", ctx do 210 | assert :miss = Connector.cached_compatibility(ctx.compatible) 211 | assert :compatible = Connector.connect(ctx.compatible) 212 | assert :compatible = Connector.cached_compatibility(ctx.compatible) 213 | end 214 | 215 | test "incompatible hit after unsuccessful connection", ctx do 216 | assert :miss = Connector.cached_compatibility(ctx.incompatible) 217 | assert :incompatible = Connector.connect(ctx.incompatible) 218 | assert :incompatible = Connector.cached_compatibility(ctx.incompatible) 219 | end 220 | 221 | test "incompatible cache entries expire", ctx do 222 | assert :miss = Connector.cached_compatibility(ctx.incompatible) 223 | assert :incompatible = Connector.connect(ctx.incompatible) 224 | assert :incompatible = Connector.cached_compatibility(ctx.incompatible) 225 | 226 | assert Helper.wait_until(fn -> 227 | {:expired, _} = Connector.cached_compatibility(ctx.incompatible) 228 | true 229 | end) 230 | end 231 | 232 | test "remote node crash causes an unavailable cache", ctx do 233 | assert :miss = Connector.cached_compatibility(ctx.compatible) 234 | assert :compatible = Connector.connect(ctx.compatible) 235 | assert :ok = :slave.stop(ctx.compatible) 236 | 237 | assert Helper.wait_until(fn -> 238 | Connector.cached_compatibility(ctx.compatible) == :unavailable 239 | end) 240 | end 241 | end 242 | 243 | describe "Compatibility checking" do 244 | test "all nodes start off incompatible", ctx do 245 | assert :incompatible = Connector.compatibility(ctx.compatible) 246 | assert :incompatible = Connector.compatibility(ctx.incompatible) 247 | assert :incompatible = Connector.compatibility(ctx.down) 248 | end 249 | 250 | test "after connecting to a compatible node it becomes compatible", ctx do 251 | remote = ctx.compatible() 252 | 253 | assert :incompatible = Connector.compatibility(remote) 254 | assert :compatible = Connector.connect(remote) 255 | assert :compatible = Connector.compatibility(remote) 256 | end 257 | 258 | test "after connecting to an incompatible node it remains incompatible", ctx do 259 | remote = ctx.incompatible() 260 | 261 | assert :incompatible = Connector.compatibility(remote) 262 | assert :incompatible = Connector.connect(remote) 263 | assert 
:incompatible = Connector.compatibility(remote) 264 | end 265 | 266 | test "after connecting to a down node it remains incompatible", ctx do 267 | remote = ctx.down() 268 | 269 | assert :incompatible = Connector.compatibility(remote) 270 | assert :incompatible = Connector.connect(remote) 271 | assert :incompatible = Connector.compatibility(remote) 272 | end 273 | end 274 | 275 | describe "Monitoring a remote process (local bookkeeping)" do 276 | setup [:disable_sweep, :start_remote_process] 277 | 278 | test "unmonitored pid is added to queue", ctx do 279 | ref = make_ref() 280 | 281 | connector = Connector.get_for_node(ctx.compatible) 282 | initial_state = :sys.get_state(connector) 283 | 284 | assert initial_state.length == 0 285 | assert :queue.len(initial_state.batch) == 0 286 | 287 | Connector.monitor(ctx.compatible_pid, ref, self()) 288 | 289 | # This assertion isn't actually needed, but since monitor is async, this is an easy way to 290 | # check if the operation has completed 291 | assert :compatible = Connector.connect(ctx.compatible) 292 | 293 | updated_state = :sys.get_state(connector) 294 | 295 | expected_pid = ctx.compatible_pid 296 | assert updated_state.length == 1 297 | assert :queue.len(updated_state.batch) == 1 298 | assert {:value, {:subscribe, ^expected_pid}} = :queue.peek(updated_state.batch) 299 | end 300 | 301 | test "already monitored pid is not added to the queue", ctx do 302 | ref_1 = make_ref() 303 | ref_2 = make_ref() 304 | 305 | connector = Connector.get_for_node(ctx.compatible) 306 | initial_state = :sys.get_state(connector) 307 | 308 | assert initial_state.length == 0 309 | assert :queue.len(initial_state.batch) == 0 310 | 311 | # Monitor the same pid twice 312 | Connector.monitor(ctx.compatible_pid, ref_1, self()) 313 | Connector.monitor(ctx.compatible_pid, ref_2, self()) 314 | 315 | # This assertion isn't actually needed, but since monitor is async, this is an easy way to 316 | # check if the operation has completed 317 | assert :compatible = Connector.connect(ctx.compatible) 318 | 319 | updated_state = :sys.get_state(connector) 320 | 321 | expected_pid = ctx.compatible_pid 322 | assert updated_state.length == 1 323 | assert :queue.len(updated_state.batch) == 1 324 | assert {:value, {:subscribe, ^expected_pid}} = :queue.peek(updated_state.batch) 325 | end 326 | end 327 | 328 | describe "Demonitoring a process" do 329 | setup [:observe_zen_monitor, :disable_sweep, :start_remote_process, :fast_zen_monitor] 330 | 331 | test "works on unknown process / ref", ctx do 332 | assert :ok = Connector.demonitor(ctx.compatible_pid, make_ref()) 333 | end 334 | 335 | test "doesn't send down", ctx do 336 | reference = make_ref() 337 | target = ctx.compatible_pid 338 | connector = Connector.get_for_node(ctx.compatible) 339 | 340 | # Monitor the target 341 | Connector.monitor(target, reference, self()) 342 | 343 | # Force a sweep 344 | send(connector, :sweep) 345 | 346 | # Demonitor the target 347 | Connector.demonitor(target, reference) 348 | 349 | # Kill the target 350 | Process.exit(target, :kill) 351 | 352 | # Assert that no dispatches are sent for the target 353 | refute_receive _ 354 | end 355 | 356 | test "is isolated to the demonitored process only", ctx do 357 | subscriber = self() 358 | target_reference = make_ref() 359 | other_reference = make_ref() 360 | target = ctx.compatible_pid 361 | other = ctx.compatible_pid_b 362 | connector = Connector.get_for_node(ctx.compatible) 363 | 364 | # Monitor both processes 365 | Connector.monitor(target, target_reference, 
subscriber) 366 | Connector.monitor(other, other_reference, subscriber) 367 | 368 | # Force a sweep 369 | send(connector, :sweep) 370 | 371 | # Demonitor the target 372 | Connector.demonitor(target, target_reference) 373 | 374 | # Kill both processes 375 | Process.exit(target, :kill) 376 | Process.exit(other, :kill) 377 | 378 | # Assert that a dispatch is enqueued for other only 379 | assert_receive {:"$gen_cast", 380 | {:enqueue, 381 | [ 382 | {^subscriber, 383 | {:DOWN, ^other_reference, :process, ^other, {:zen_monitor, _}}} 384 | ]}} 385 | end 386 | 387 | test "incorrect reference does nothing", ctx do 388 | subscriber = self() 389 | right_reference = make_ref() 390 | wrong_reference = make_ref() 391 | target = ctx.compatible_pid 392 | connector = Connector.get_for_node(ctx.compatible) 393 | 394 | # Monitor the target 395 | Connector.monitor(target, right_reference, subscriber) 396 | 397 | # Force a sweep 398 | send(connector, :sweep) 399 | 400 | # Demonitor but with the right target / wrong reference 401 | Connector.demonitor(target, wrong_reference) 402 | 403 | # Kill the target 404 | Process.exit(target, :kill) 405 | 406 | # Assert that a dispatch is still enqueued 407 | assert_receive {:"$gen_cast", 408 | {:enqueue, 409 | [ 410 | {^subscriber, 411 | {:DOWN, ^right_reference, :process, ^target, {:zen_monitor, _}}} 412 | ]}} 413 | end 414 | 415 | test "incorrect pid does nothing", ctx do 416 | subscriber = self() 417 | reference = make_ref() 418 | right_target = ctx.compatible_pid 419 | wrong_target = ctx.compatible_pid_b 420 | connector = Connector.get_for_node(ctx.compatible) 421 | 422 | # Monitor the target 423 | Connector.monitor(right_target, reference, subscriber) 424 | 425 | # Force a sweep 426 | send(connector, :sweep) 427 | 428 | # Demonitor but with the wrong target / right reference 429 | Connector.demonitor(wrong_target, reference) 430 | 431 | # Kill the target 432 | Process.exit(right_target, :kill) 433 | 434 | # Assert that a dispatch is still enqueued 435 | assert_receive {:"$gen_cast", 436 | {:enqueue, 437 | [ 438 | {^subscriber, 439 | {:DOWN, ^reference, :process, ^right_target, {:zen_monitor, _}}} 440 | ]}} 441 | end 442 | 443 | test "demonitoring the only monitor adds an unsubscribe to the queue", ctx do 444 | subscriber = self() 445 | reference = make_ref() 446 | target = ctx.compatible_pid 447 | connector = Connector.get_for_node(ctx.compatible) 448 | 449 | # Monitor the target 450 | Connector.monitor(target, reference, subscriber) 451 | 452 | # Check the monitor state 453 | monitor_state = :sys.get_state(connector) 454 | assert monitor_state.length == 1 455 | assert :queue.len(monitor_state.batch) == 1 456 | assert {:value, {:subscribe, ^target}} = :queue.peek(monitor_state.batch) 457 | 458 | # Force a sweep 459 | send(connector, :sweep) 460 | 461 | # Demonitor the target 462 | Connector.demonitor(target, reference) 463 | 464 | # Check the demonitor state 465 | demonitor_state = :sys.get_state(connector) 466 | assert demonitor_state.length == 1 467 | assert :queue.len(demonitor_state.batch) == 1 468 | assert {:value, {:unsubscribe, ^target}} = :queue.peek(demonitor_state.batch) 469 | end 470 | 471 | test "demonitoring one of many monitors does not add an unsubscribe to the queue", ctx do 472 | subscriber = self() 473 | reference = make_ref() 474 | other_reference = make_ref() 475 | target = ctx.compatible_pid 476 | connector = Connector.get_for_node(ctx.compatible) 477 | 478 | # Monitor the target multiple times 479 | Connector.monitor(target, reference, 
subscriber) 480 | Connector.monitor(target, other_reference, subscriber) 481 | 482 | # Check the monitor state 483 | monitor_state = :sys.get_state(connector) 484 | assert monitor_state.length == 1 485 | assert :queue.len(monitor_state.batch) == 1 486 | assert {:value, {:subscribe, ^target}} = :queue.peek(monitor_state.batch) 487 | 488 | # Force a sweep 489 | send(connector, :sweep) 490 | 491 | # Demonitor one of the references 492 | Connector.demonitor(target, reference) 493 | 494 | # Check the demonitor state 495 | demonitor_state = :sys.get_state(connector) 496 | assert demonitor_state.length == 0 497 | assert :queue.len(demonitor_state.batch) == 0 498 | end 499 | 500 | test "demonitoring the last of many monitors adds an unsubscribe to the queue", ctx do 501 | subscriber = self() 502 | reference = make_ref() 503 | other_reference = make_ref() 504 | target = ctx.compatible_pid 505 | connector = Connector.get_for_node(ctx.compatible) 506 | 507 | # Monitor the target multiple times 508 | Connector.monitor(target, reference, subscriber) 509 | Connector.monitor(target, other_reference, subscriber) 510 | 511 | # Check the monitor state 512 | monitor_state = :sys.get_state(connector) 513 | assert monitor_state.length == 1 514 | assert :queue.len(monitor_state.batch) == 1 515 | assert {:value, {:subscribe, ^target}} = :queue.peek(monitor_state.batch) 516 | 517 | # Force a sweep 518 | send(connector, :sweep) 519 | 520 | # Demonitor all of the references 521 | Connector.demonitor(target, reference) 522 | Connector.demonitor(target, other_reference) 523 | 524 | # Check the demonitor state 525 | demonitor_state = :sys.get_state(connector) 526 | assert demonitor_state.length == 1 527 | assert :queue.len(demonitor_state.batch) == 1 528 | assert {:value, {:unsubscribe, ^target}} = :queue.peek(demonitor_state.batch) 529 | end 530 | end 531 | 532 | describe "Handles nodedown" do 533 | setup [:fast_zen_monitor, :observe_zen_monitor, :disable_sweep, :start_remote_process] 534 | 535 | test "marks the node as unavailable", ctx do 536 | assert :compatible = Connector.connect(ctx.compatible) 537 | 538 | # Stop the node 539 | :slave.stop(ctx.compatible) 540 | 541 | # Assert that it becomes incompatible 542 | assert Helper.wait_until(fn -> 543 | Connector.cached_compatibility(ctx.compatible) == :unavailable 544 | end) 545 | end 546 | 547 | test "fires all monitors", ctx do 548 | subscriber = self() 549 | target_ref_a_1 = make_ref() 550 | target_ref_a_2 = make_ref() 551 | target_ref_b_1 = make_ref() 552 | target_ref_b_2 = make_ref() 553 | target_ref_c_1 = make_ref() 554 | target_ref_c_2 = make_ref() 555 | target_a = ctx.compatible_pid 556 | target_b = ctx.compatible_pid_b 557 | target_c = ctx.compatible_pid_c 558 | connector = Connector.get_for_node(ctx.compatible) 559 | 560 | # Make some monitors 561 | Connector.monitor(target_a, target_ref_a_1, subscriber) 562 | Connector.monitor(target_a, target_ref_a_2, subscriber) 563 | Connector.monitor(target_b, target_ref_b_1, subscriber) 564 | Connector.monitor(target_b, target_ref_b_2, subscriber) 565 | Connector.monitor(target_c, target_ref_c_1, subscriber) 566 | Connector.monitor(target_c, target_ref_c_2, subscriber) 567 | 568 | # Force a sweep 569 | send(connector, :sweep) 570 | 571 | # Wait for the monitors to establish 572 | Process.sleep(50) 573 | 574 | # Stop the node 575 | :slave.stop(ctx.compatible) 576 | 577 | # Assert that all the expected messages get enqueued 578 | assert_receive {:"$gen_cast", 579 | {:enqueue, 580 | [ 581 | {^subscriber, 582 | {:DOWN, 
^target_ref_a_1, :process, ^target_a, {:zen_monitor, :nodedown}}}, 583 | {^subscriber, 584 | {:DOWN, ^target_ref_a_2, :process, ^target_a, {:zen_monitor, :nodedown}}}, 585 | {^subscriber, 586 | {:DOWN, ^target_ref_b_1, :process, ^target_b, {:zen_monitor, :nodedown}}}, 587 | {^subscriber, 588 | {:DOWN, ^target_ref_b_2, :process, ^target_b, {:zen_monitor, :nodedown}}}, 589 | {^subscriber, 590 | {:DOWN, ^target_ref_c_1, :process, ^target_c, {:zen_monitor, :nodedown}}}, 591 | {^subscriber, 592 | {:DOWN, ^target_ref_c_2, :process, ^target_c, {:zen_monitor, :nodedown}}} 593 | ]}} 594 | end 595 | end 596 | 597 | describe "Handles summaries" do 598 | setup [:fast_zen_monitor, :observe_zen_monitor, :disable_sweep, :start_remote_process] 599 | 600 | test "empty summary does nothing", ctx do 601 | connector = Connector.get_for_node(ctx.compatible) 602 | 603 | # Send the connector an empty list of death certificates 604 | send(connector, {:dead, ctx.compatible, []}) 605 | 606 | # Assert that ZenMonitor.Local doesn't receive an enqueue 607 | refute_receive _ 608 | end 609 | 610 | test "unmonitored pids does nothing", ctx do 611 | connector = Connector.get_for_node(ctx.compatible) 612 | 613 | # Send some unmonitored pids 614 | send( 615 | connector, 616 | {:dead, ctx.compatible, 617 | [ 618 | {ctx.compatible_pid, :test_a}, 619 | {ctx.compatible_pid_b, :test_b}, 620 | {ctx.compatible_pid_c, :test_c} 621 | ]} 622 | ) 623 | 624 | # Assert that ZenMonitor.Local doesn't receive an enqueue 625 | refute_receive _ 626 | end 627 | 628 | test "monitored pids get enqueued", ctx do 629 | subscriber = self() 630 | reference_a = make_ref() 631 | reference_b = make_ref() 632 | reference_c = make_ref() 633 | target_a = ctx.compatible_pid 634 | target_b = ctx.compatible_pid_b 635 | target_c = ctx.compatible_pid_c 636 | connector = Connector.get_for_node(ctx.compatible) 637 | 638 | # Monitor some pids 639 | Connector.monitor(target_a, reference_a, subscriber) 640 | Connector.monitor(target_b, reference_b, subscriber) 641 | Connector.monitor(target_c, reference_c, subscriber) 642 | 643 | # Send the monitored pids in the summary 644 | send( 645 | connector, 646 | {:dead, ctx.compatible, [{target_a, :test_a}, {target_b, :test_b}, {target_c, :test_c}]} 647 | ) 648 | 649 | # Assert that ZenMonitor.Local receives an enqueue 650 | assert_receive {:"$gen_cast", 651 | {:enqueue, 652 | [ 653 | {^subscriber, 654 | {:DOWN, ^reference_a, :process, ^target_a, {:zen_monitor, :test_a}}}, 655 | {^subscriber, 656 | {:DOWN, ^reference_b, :process, ^target_b, {:zen_monitor, :test_b}}}, 657 | {^subscriber, 658 | {:DOWN, ^reference_c, :process, ^target_c, {:zen_monitor, :test_c}}} 659 | ]}} 660 | end 661 | 662 | test "mixed pids, monitored enqueue, unmonitored ignored", ctx do 663 | subscriber = self() 664 | reference_a = make_ref() 665 | reference_c = make_ref() 666 | target_a = ctx.compatible_pid 667 | target_b = ctx.compatible_pid_b 668 | target_c = ctx.compatible_pid_c 669 | connector = Connector.get_for_node(ctx.compatible) 670 | 671 | # Monitor some pids (intentionally skip target_b) 672 | Connector.monitor(target_a, reference_a, subscriber) 673 | Connector.monitor(target_c, reference_c, subscriber) 674 | 675 | # Send the mixed pids in the summary 676 | send( 677 | connector, 678 | {:dead, ctx.compatible, [{target_a, :test_a}, {target_b, :test_b}, {target_c, :test_c}]} 679 | ) 680 | 681 | # Assert that ZenMonitor.Local receives an enqueue 682 | assert_receive {:"$gen_cast", 683 | {:enqueue, 684 | [ 685 | {^subscriber, 686 | {:DOWN, 
^reference_a, :process, ^target_a, {:zen_monitor, :test_a}}}, 687 | {^subscriber, 688 | {:DOWN, ^reference_c, :process, ^target_c, {:zen_monitor, :test_c}}} 689 | ]}} 690 | 691 | # Assert that no other messages arrive 692 | refute_receive _ 693 | end 694 | end 695 | 696 | describe "Periodic Sweep to compatible remote" do 697 | setup [:observe_gen, :disable_sweep, :reduce_chunk_size, :start_remote_process] 698 | 699 | test "sweep does not send a subscription if there are no newly monitored pids", ctx do 700 | connector = Connector.get_for_node(ctx.compatible) 701 | 702 | # Force the connector to sweep nothing 703 | send(connector, :sweep) 704 | 705 | # Assert that no subscription is sent because nothing is pending 706 | refute_receive {:observe, :cast, _, _} 707 | end 708 | 709 | test "sweep sends the monitored pids since the last sweep", ctx do 710 | target = ctx.compatible_pid 711 | remote = ctx.compatible 712 | connector = Connector.get_for_node(remote) 713 | 714 | # Monitor the target pid 715 | Connector.monitor(target, make_ref(), self()) 716 | 717 | # Force a sweep 718 | send(connector, :sweep) 719 | 720 | assert_receive { 721 | :observe, 722 | :cast, 723 | {ZenMonitor.Proxy, ^remote}, 724 | {:process, ^connector, [{:subscribe, ^target}]} 725 | } 726 | end 727 | 728 | test "sweep ignores already monitored pids on subsequent sweeps", ctx do 729 | target = ctx.compatible_pid 730 | remote = ctx.compatible 731 | connector = Connector.get_for_node(remote) 732 | 733 | # Monitor the target pid 734 | Connector.monitor(target, make_ref(), self()) 735 | 736 | # Force a sweep 737 | send(connector, :sweep) 738 | 739 | # Flush out the initial message 740 | assert_receive { 741 | :observe, 742 | :cast, 743 | {ZenMonitor.Proxy, ^remote}, 744 | {:process, ^connector, [{:subscribe, ^target}]} 745 | } 746 | 747 | # Monitor the target pid again 748 | Connector.monitor(target, make_ref(), self()) 749 | 750 | # Force another sweep 751 | send(connector, :sweep) 752 | 753 | # Assert that no additional subscriptions are sent 754 | refute_receive {:observe, :cast, _, _} 755 | end 756 | 757 | test "sweep transmits pids in the order received", ctx do 758 | first = ctx.compatible_pid 759 | second = ctx.compatible_pid_b 760 | remote = ctx.compatible 761 | connector = Connector.get_for_node(remote) 762 | 763 | # Monitor the targets 764 | Connector.monitor(first, make_ref(), self()) 765 | Connector.monitor(second, make_ref(), self()) 766 | 767 | # Force a sweep 768 | send(connector, :sweep) 769 | 770 | # Assert that we got a subscription in the correct order (first, second) 771 | assert_receive { 772 | :observe, 773 | :cast, 774 | {ZenMonitor.Proxy, ^remote}, 775 | {:process, ^connector, [{:subscribe, ^first}, {:subscribe, ^second}]} 776 | } 777 | end 778 | 779 | test "sweep will only transmit the requested chunk size", ctx do 780 | target_a = ctx.compatible_pid() 781 | target_b = ctx.compatible_pid_b() 782 | target_c = ctx.compatible_pid_c() 783 | remote = ctx.compatible() 784 | connector = Connector.get_for_node(remote) 785 | 786 | # Monitor all targets 787 | Connector.monitor(target_a, make_ref(), self()) 788 | Connector.monitor(target_b, make_ref(), self()) 789 | Connector.monitor(target_c, make_ref(), self()) 790 | 791 | # Force a sweep 792 | send(connector, :sweep) 793 | 794 | # Assert that we got a subscription for the first chunk (target_a, target_b) 795 | assert_receive { 796 | :observe, 797 | :cast, 798 | {ZenMonitor.Proxy, ^remote}, 799 | {:process, ^connector, [{:subscribe, ^target_a}, {:subscribe,
^target_b}]} 800 | } 801 | 802 | # Force another sweep 803 | send(connector, :sweep) 804 | 805 | # Assert that we got a subscription for the second chunk (target_c) 806 | assert_receive { 807 | :observe, 808 | :cast, 809 | {ZenMonitor.Proxy, ^remote}, 810 | {:process, ^connector, [{:subscribe, ^target_c}]} 811 | } 812 | end 813 | end 814 | 815 | describe "Periodic Sweep to incompatible remote" do 816 | setup [:observe_zen_monitor, :disable_sweep, :reduce_chunk_size, :start_remote_process] 817 | 818 | test "sweep does not send a message if there are no newly monitored pids", ctx do 819 | connector = Connector.get_for_node(ctx.incompatible) 820 | 821 | # Force the connector to sweep nothing 822 | send(connector, :sweep) 823 | 824 | # Assert that no dead message is sent 825 | refute_receive {:"$gen_cast", {:enqueue, _}} 826 | end 827 | 828 | test "sweep sends nodedown for incompatible remote", ctx do 829 | subscriber = self() 830 | reference = make_ref() 831 | target = ctx.incompatible_pid 832 | remote = ctx.incompatible 833 | connector = Connector.get_for_node(remote) 834 | 835 | # Monitor the target pid 836 | Connector.monitor(target, reference, subscriber) 837 | 838 | # Force a sweep 839 | send(connector, :sweep) 840 | 841 | # Assert that the message is enqueued with ZenMonitor.Local (via GenServer.cast) 842 | assert_receive { 843 | :"$gen_cast", 844 | { 845 | :enqueue, 846 | [{^subscriber, {:DOWN, ^reference, :process, ^target, {:zen_monitor, :nodedown}}}] 847 | } 848 | } 849 | end 850 | end 851 | end 852 | -------------------------------------------------------------------------------- /test/local/dispatcher_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Dispatcher.Test do 2 | use ExUnit.Case 3 | 4 | alias ZenMonitor.Local.{Connector, Dispatcher} 5 | alias ZenMonitor.Proxy.Batcher 6 | 7 | setup do 8 | {:ok, remote, nil} = ChildNode.start_link(:zen_monitor, :Remote) 9 | 10 | start_supervised(ZenMonitor.Supervisor) 11 | {:ok, _} = Application.ensure_all_started(:instruments) 12 | 13 | on_exit(fn -> 14 | Node.monitor(remote, true) 15 | 16 | receive do 17 | {:nodedown, ^remote} -> :ok 18 | end 19 | end) 20 | 21 | {:ok, remote: remote} 22 | end 23 | 24 | @doc """ 25 | Reduces the intervals for all the batching parts of ZenMonitor so that the default 26 | assert_receive / refute_receive timeouts are an order of magnitude larger.
27 | """ 28 | def fast_zen_monitor(ctx) do 29 | # Tune the local dispatcher 30 | original_demand_interval = Dispatcher.demand_interval() 31 | Dispatcher.demand_interval(10) 32 | 33 | # Tune the local connector 34 | original_connector_interval = Connector.sweep_interval() 35 | Connector.sweep_interval(10) 36 | 37 | # Tune the remote batcher 38 | original_batcher_interval = :rpc.call(ctx.remote, Batcher, :sweep_interval, []) 39 | :ok = :rpc.call(ctx.remote, Batcher, :sweep_interval, [10]) 40 | 41 | on_exit(fn -> 42 | # Restore the local settings 43 | Dispatcher.demand_interval(original_demand_interval) 44 | Connector.sweep_interval(original_connector_interval) 45 | 46 | # Restore the remote settings 47 | :rpc.call(ctx.remote, Batcher, :sweep_interval, [original_batcher_interval]) 48 | end) 49 | 50 | :ok 51 | end 52 | 53 | def start_remote_process(ctx) do 54 | remote_pid = Node.spawn(ctx.remote, Process, :sleep, [:infinity]) 55 | alternate_remote_pid = Node.spawn(ctx.remote, Process, :sleep, [:infinity]) 56 | 57 | {:ok, remote_pid: remote_pid, alternate_remote_pid: alternate_remote_pid} 58 | end 59 | 60 | describe "Event Dispatch" do 61 | setup [:fast_zen_monitor, :start_remote_process] 62 | 63 | test "no messages are sent when there is nothing monitored" do 64 | # Assert that we receive no messages 65 | refute_receive _ 66 | end 67 | 68 | test "no messages are sent when the monitored process is still running", ctx do 69 | # Monitor the remote process 70 | ZenMonitor.monitor(ctx.remote_pid) 71 | 72 | # Assert that we receive no messages 73 | refute_receive _ 74 | end 75 | 76 | test "a message is dispatched when the monitored process dies", ctx do 77 | target = ctx.remote_pid() 78 | 79 | # Monitor the remote process 80 | ref = ZenMonitor.monitor(target) 81 | 82 | # Kill the remote process 83 | Process.exit(target, :kill) 84 | 85 | # Assert delivery of a :DOWN for the killed process 86 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}}, 1000 87 | end 88 | 89 | test "only the dead process gets a message dispatched", ctx do 90 | target = ctx.remote_pid() 91 | alternate = ctx.alternate_remote_pid() 92 | 93 | # Monitor both remote processes 94 | ref = ZenMonitor.monitor(target) 95 | ZenMonitor.monitor(alternate) 96 | 97 | # Kill the target remote process 98 | Process.exit(target, :kill) 99 | 100 | # Assert delivery of a :DOWN for the killed process and nothing else 101 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}}, 1000 102 | refute_receive _ 103 | end 104 | 105 | test "monitoring a dead process should dispatch a :DOWN with :noproc", ctx do 106 | target = ctx.remote_pid() 107 | 108 | # Kill the remote process 109 | Process.exit(target, :kill) 110 | 111 | # Monitor the now dead remote process 112 | ref = ZenMonitor.monitor(target) 113 | 114 | # Assert delivery of a :DOWN :noproc 115 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, :noproc}}, 1000 116 | end 117 | 118 | test "monitoring a process after down and dispatched message dispatches another message", 119 | ctx do 120 | target = ctx.remote_pid() 121 | 122 | # Monitor the remote process 123 | ref = ZenMonitor.monitor(target) 124 | 125 | # Kill the target remote process 126 | Process.exit(target, :kill) 127 | 128 | # Assert initial delivery 129 | assert_receive {:DOWN, ^ref, :process, ^target, {:zen_monitor, _}}, 1000 130 | 131 | # Re-monitor the remote process 132 | another_ref = ZenMonitor.monitor(target) 133 | 134 | # Assert delivery of a :DOWN :noproc 135 | assert_receive {:DOWN, 
^another_ref, :process, ^target, {:zen_monitor, :noproc}}, 1000 136 | end 137 | 138 | test "all monitored processes get delivered at nodedown", ctx do 139 | target = ctx.remote_pid() 140 | alternate = ctx.alternate_remote_pid() 141 | 142 | # Monitor both remote processes 143 | target_ref = ZenMonitor.monitor(target) 144 | alternate_ref = ZenMonitor.monitor(alternate) 145 | 146 | # Wait for the monitors to get dispatched to the remote 147 | Process.sleep(50) 148 | 149 | # Kill the remote node 150 | :slave.stop(ctx.remote) 151 | 152 | # Assert delivery of both :DOWN :nodedown messages 153 | assert_receive {:DOWN, ^target_ref, :process, ^target, {:zen_monitor, :nodedown}}, 1000 154 | 155 | assert_receive {:DOWN, ^alternate_ref, :process, ^alternate, {:zen_monitor, :nodedown}}, 156 | 1000 157 | end 158 | end 159 | end 160 | -------------------------------------------------------------------------------- /test/local/local_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Local.Test do 2 | @moduledoc """ 3 | Tests the ZenMonitor.Local module. 4 | 5 | Since the bulk of monitor/1 and compatibility/1 are delegated to ZenMonitor.Local.Connector, see 6 | the ZenMonitor.Local.Connector.Test module for tests concerning that functionality. 7 | 8 | Most of the other functionality of this module is internal and handled by the 9 | ZenMonitor.BlackBox.Test 10 | """ 11 | use ExUnit.Case 12 | 13 | alias ZenMonitor.Local 14 | alias ZenMonitor.Local.Tables 15 | 16 | setup do 17 | start_supervised(ZenMonitor.Supervisor) 18 | :ok 19 | end 20 | 21 | def pids(count) do 22 | Enum.map(1..count, fn _ -> spawn(fn -> Process.sleep(:infinity) end) end) 23 | end 24 | 25 | describe "Demonitoring a reference" do 26 | test "demonitored references are consumed from the references table" do 27 | [pid] = pids(1) 28 | ref = Local.monitor(pid) 29 | 30 | assert :ets.member(Tables.references(), {self(), ref}) 31 | 32 | assert true = Local.demonitor(ref) 33 | 34 | refute :ets.member(Tables.references(), {self(), ref}) 35 | end 36 | 37 | test "demonitor without flush does not clear already delivered :DOWN message" do 38 | ref = make_ref() 39 | 40 | # Simulate receiving a :DOWN message about the reference 41 | send(self(), {:DOWN, ref, :process, :pid, :reason}) 42 | 43 | assert true = Local.demonitor(ref) 44 | 45 | assert_received {:DOWN, ^ref, _, _, _} 46 | end 47 | 48 | test "demonitor with flush will clear already delivered :DOWN message" do 49 | ref = make_ref() 50 | 51 | # Simulate receiving a :DOWN message about the reference 52 | send(self(), {:DOWN, ref, :process, :pid, :reason}) 53 | 54 | assert true = Local.demonitor(ref, [:flush]) 55 | 56 | refute_received {:DOWN, ^ref, _, _, _} 57 | end 58 | end 59 | 60 | describe "Handles subscriber down" do 61 | test "cleans up references" do 62 | me = self() 63 | 64 | # Spawn a new subscriber process, have it send us some information and sleep 65 | subscriber = 66 | spawn(fn -> 67 | [pid] = pids(1) 68 | ref = Local.monitor(pid) 69 | 70 | send(me, {:monitor, ref}) 71 | 72 | Process.sleep(:infinity) 73 | end) 74 | 75 | assert_receive {:monitor, ref} 76 | 77 | # Assert that the reference was recorded 78 | assert :ets.member(Tables.references(), {subscriber, ref}) 79 | 80 | # Kill the subscriber 81 | Process.exit(subscriber, :kill) 82 | 83 | # Assert that the reference was cleaned up 84 | assert Helper.wait_until(fn -> 85 | not :ets.member(Tables.references(), {subscriber, ref}) 86 | end) 87 | end 88 | end 89 | end 90 | 
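# A minimal usage sketch (hypothetical `Watcher` module, not one of the repository's files),
# assuming only the client-facing behavior the tests above assert on: ZenMonitor.monitor/1
# returns a reference, :DOWN messages arrive with their reason wrapped as {:zen_monitor, reason}
# (:killed, :noproc, :nodedown, ...), and ZenMonitor.demonitor/2 accepts the [:flush] option.
defmodule Watcher do
  use GenServer

  def start_link(target), do: GenServer.start_link(__MODULE__, target)

  @impl true
  def init(target) do
    # Establish the monitor; the returned reference appears in the eventual :DOWN message
    ref = ZenMonitor.monitor(target)
    {:ok, %{target: target, ref: ref}}
  end

  @impl true
  def handle_info({:DOWN, ref, :process, pid, {:zen_monitor, reason}}, %{ref: ref} = state) do
    # The wrapped reason distinguishes ZenMonitor deliveries from plain Process.monitor :DOWNs
    IO.puts("#{inspect(pid)} went down: #{inspect(reason)}")
    {:stop, :normal, state}
  end

  @impl true
  def handle_call(:stop_watching, _from, %{ref: ref} = state) do
    # Demonitor with :flush so a :DOWN that already arrived is removed from the mailbox
    ZenMonitor.demonitor(ref, [:flush])
    {:reply, :ok, state}
  end
end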
-------------------------------------------------------------------------------- /test/proxy/batcher_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Batcher.Test do 2 | @moduledoc """ 3 | Tests for the ZenMonitor.Proxy.Batcher module 4 | 5 | ZenMonitor is a distributed system, in this suite the ZenMonitor.Proxy.Batcher that we will be 6 | exercising is running on the local node. Since the ZenMonitor.Proxy system works off of 7 | subscriber pids we will make the test process the subscriber and forgo the need for ChildNodes. 8 | """ 9 | use ExUnit.Case 10 | 11 | alias ZenMonitor.Proxy.{Batcher, Tables} 12 | 13 | # Batchers stop when their subscriber goes DOWN, this tag tells ExUnit to suppress stops reports 14 | @moduletag :capture_log 15 | 16 | setup do 17 | start_supervised(ZenMonitor.Supervisor) 18 | :ok 19 | end 20 | 21 | def disable_sweep(_) do 22 | # Set sweep interval to 1 minute (effectively disable for this describe block) 23 | original_sweep_interval = Batcher.sweep_interval() 24 | Batcher.sweep_interval(60_000) 25 | 26 | on_exit(fn -> 27 | Batcher.sweep_interval(original_sweep_interval) 28 | end) 29 | 30 | :ok 31 | end 32 | 33 | def reduce_chunk_size(_) do 34 | # Set chunk size to 2 for testing convenience 35 | original_chunk_size = Batcher.chunk_size() 36 | Batcher.chunk_size(2) 37 | 38 | on_exit(fn -> 39 | Batcher.chunk_size(original_chunk_size) 40 | end) 41 | 42 | :ok 43 | end 44 | 45 | describe "Getting a Batcher" do 46 | test "batcher for pid" do 47 | batcher = Batcher.get(self()) 48 | assert Process.alive?(batcher) 49 | end 50 | 51 | test "multiple gets for the same pid should return the same batcher" do 52 | batcher_a = Batcher.get(self()) 53 | batcher_b = Batcher.get(self()) 54 | 55 | assert batcher_a == batcher_b 56 | end 57 | 58 | test "batcher is replaced if it dies" do 59 | original = Batcher.get(self()) 60 | assert Process.alive?(original) 61 | 62 | Process.exit(original, :kill) 63 | refute Process.alive?(original) 64 | 65 | replacement = Batcher.get(self()) 66 | 67 | # Give a chance for the GenRegistry to react to the above 68 | # death. 
69 | replacement = if replacement == original do 70 | Process.sleep(50) 71 | Batcher.get(self()) 72 | else 73 | replacement 74 | end 75 | 76 | assert Process.alive?(replacement) 77 | 78 | assert original != replacement 79 | end 80 | 81 | test "each pid gets its own batcher" do 82 | batcher_a = Batcher.get(self()) 83 | batcher_b = Batcher.get(:other) 84 | 85 | assert batcher_a != batcher_b 86 | end 87 | end 88 | 89 | describe "Enqueuing Certificates" do 90 | setup [:disable_sweep] 91 | 92 | test "certificate gets added to the current batch" do 93 | batcher = Batcher.get(self()) 94 | 95 | initial_state = :sys.get_state(batcher) 96 | assert initial_state.length == 0 97 | assert :queue.len(initial_state.batch) == 0 98 | 99 | Batcher.enqueue(batcher, :test_pid, :test_reason) 100 | 101 | updated_state = :sys.get_state(batcher) 102 | assert updated_state.length == 1 103 | assert :queue.len(updated_state.batch) == 1 104 | assert {:value, {:test_pid, :test_reason}} = :queue.peek(updated_state.batch) 105 | end 106 | end 107 | 108 | describe "Periodic Sweeps" do 109 | setup [:disable_sweep, :reduce_chunk_size] 110 | 111 | test "sweep should do nothing if the batch is empty" do 112 | batcher = Batcher.get(self()) 113 | 114 | # Force a sweep 115 | send(batcher, :sweep) 116 | 117 | refute_receive _ 118 | end 119 | 120 | test "sweep should deliver a summary to the subscriber" do 121 | batcher = Batcher.get(self()) 122 | Batcher.enqueue(batcher, :test_pid, :test_reason) 123 | 124 | # Force a sweep 125 | send(batcher, :sweep) 126 | 127 | # Assert that we received the expected summary 128 | assert_receive {:dead, _, [{:test_pid, :test_reason}]} 129 | end 130 | 131 | test "sweep should deliver the summary in the same order it received them" do 132 | batcher = Batcher.get(self()) 133 | 134 | # Enqueue a full chunk of unique certificates 135 | Batcher.enqueue(batcher, :test_pid_1, :test_reason_1) 136 | Batcher.enqueue(batcher, :test_pid_2, :test_reason_2) 137 | 138 | # Force a sweep 139 | send(batcher, :sweep) 140 | 141 | # Assert that we received the expected summary in the right order (1, 2) 142 | assert_receive {:dead, _, [{:test_pid_1, :test_reason_1}, {:test_pid_2, :test_reason_2}]} 143 | end 144 | 145 | test "sweep will only deliver the requested chunk size" do 146 | batcher = Batcher.get(self()) 147 | 148 | # Enqueue two chunks worth of certificates 149 | Batcher.enqueue(batcher, :test_pid_1, :test_reason_1) 150 | Batcher.enqueue(batcher, :test_pid_2, :test_reason_2) 151 | Batcher.enqueue(batcher, :test_pid_3, :test_reason_3) 152 | Batcher.enqueue(batcher, :test_pid_4, :test_reason_4) 153 | 154 | # Force a sweep 155 | send(batcher, :sweep) 156 | 157 | # Assert that we received the first chunk in order (1, 2) 158 | assert_receive {:dead, _, [{:test_pid_1, :test_reason_1}, {:test_pid_2, :test_reason_2}]} 159 | 160 | # Force an additional sweep 161 | send(batcher, :sweep) 162 | 163 | # Assert that we received the second chunk in order (3, 4) 164 | assert_receive {:dead, _, [{:test_pid_3, :test_reason_3}, {:test_pid_4, :test_reason_4}]} 165 | end 166 | end 167 | 168 | describe "Handling Subscriber Down" do 169 | test "batcher cleans up subscriptions" do 170 | # Spawn a subscriber that we can kill later 171 | subscriber = spawn(fn -> Process.sleep(:infinity) end) 172 | 173 | # Start a batcher for the subscribers 174 | Batcher.get(subscriber) 175 | 176 | # Insert some subscriptions into the subscriber table 177 | :ets.insert(Tables.subscribers(), [ 178 | {{:test_pid_a, subscriber}}, 179 | {{:test_pid_b, 
subscriber}}, 180 | {{:test_pid_a, :other_subscriber}}, 181 | {{:test_pid_c, :other_subscriber}} 182 | ]) 183 | 184 | # Kill the subscriber 185 | Process.exit(subscriber, :kill) 186 | 187 | # Assert that the subscribers table gets cleaned up 188 | assert Helper.wait_until(fn -> 189 | Tables.subscribers() 190 | |> :ets.tab2list() 191 | |> length() == 2 192 | end) 193 | 194 | # Assert that the rows that remain are the expected ones 195 | assert :ets.member(Tables.subscribers(), {:test_pid_a, :other_subscriber}) 196 | assert :ets.member(Tables.subscribers(), {:test_pid_c, :other_subscriber}) 197 | end 198 | end 199 | end 200 | -------------------------------------------------------------------------------- /test/proxy/proxy_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Proxy.Test do 2 | @moduledoc """ 3 | Tests for the ZenMonitor.Proxy module 4 | """ 5 | use ExUnit.Case 6 | 7 | alias ZenMonitor.Proxy 8 | alias ZenMonitor.Proxy.{Batcher, Tables} 9 | alias ZenMonitor.Test.Support.Subscriber 10 | 11 | # Batchers stop when their subscriber goes DOWN, this tag tells ExUnit to suppress stops reports 12 | @moduletag :capture_log 13 | 14 | setup do 15 | # Speed up the Batcher so its interval is much faster than the default timeout for 16 | # assert_receive / refute_receive 17 | original_sweep_interval = Batcher.sweep_interval() 18 | Batcher.sweep_interval(10) 19 | start_supervised(ZenMonitor.Supervisor) 20 | 21 | on_exit(fn -> 22 | # Restore original setting 23 | Batcher.sweep_interval(original_sweep_interval) 24 | end) 25 | 26 | {:ok, proxy: Process.whereis(ZenMonitor.Proxy)} 27 | end 28 | 29 | def pids(count) do 30 | Enum.map(1..count, fn _ -> spawn(fn -> Process.sleep(:infinity) end) end) 31 | end 32 | 33 | def row_count(table) do 34 | :ets.tab2list(table) |> length() 35 | end 36 | 37 | def monitor_count(pid) do 38 | Process.info(pid, :monitors) |> elem(1) |> length 39 | end 40 | 41 | describe "Ping" do 42 | test "returns :pong" do 43 | assert :pong = Proxy.ping() 44 | end 45 | end 46 | 47 | describe "Process" do 48 | test "no monitors before subscription", ctx do 49 | assert {:monitors, []} = Process.info(ctx.proxy, :monitors) 50 | end 51 | 52 | test "subscriptions add entries to the subscribers table", ctx do 53 | subscriber = self() 54 | targets = pids(3) 55 | instructions = Enum.map(targets, &{:subscribe, &1}) 56 | 57 | # Send the subscribe instructions 58 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 59 | 60 | # Assert that three entries get written 61 | assert Helper.wait_until(fn -> 62 | row_count(Tables.subscribers()) == 3 63 | end) 64 | 65 | # Assert each entry individually 66 | [t1, t2, t3] = targets 67 | assert :ets.member(Tables.subscribers(), {t1, subscriber}) 68 | assert :ets.member(Tables.subscribers(), {t2, subscriber}) 69 | assert :ets.member(Tables.subscribers(), {t3, subscriber}) 70 | end 71 | 72 | test "multiple subscriptions to the same pid do not get additional subscriber rows", ctx do 73 | subscriber = self() 74 | [target] = pids(1) 75 | instructions = [{:subscribe, target}] 76 | 77 | # Create the initial subscription 78 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 79 | 80 | # Assert that the entry gets written to the subscriber table 81 | assert Helper.wait_until(fn -> 82 | row_count(Tables.subscribers()) == 1 83 | end) 84 | 85 | # Assert that it's the row we expect 86 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 87 | 88 | # Perform a duplicate 
subscription 89 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 90 | 91 | # Assert that no new entry gets written 92 | assert Helper.wait_until(fn -> 93 | row_count(Tables.subscribers()) != 2 94 | end) 95 | end 96 | 97 | test "subscriptions result in processes being monitored", ctx do 98 | subscriber = self() 99 | targets = pids(3) 100 | instructions = Enum.map(targets, &{:subscribe, &1}) 101 | 102 | # Create the Subscriptions 103 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 104 | 105 | # Wait for three monitors to show up 106 | assert Helper.wait_until(fn -> 107 | monitor_count(ctx.proxy) == 3 108 | end) 109 | 110 | {:monitors, monitors} = Process.info(ctx.proxy, :monitors) 111 | pids = Keyword.get_values(monitors, :process) |> Enum.sort() 112 | targets = Enum.sort(targets) 113 | 114 | assert pids == targets 115 | end 116 | 117 | test "duplicate subscriptions do not result in multiple monitors", ctx do 118 | subscriber = self() 119 | [target] = pids(1) 120 | instructions = [{:subscribe, target}] 121 | 122 | # Create an initial subscription 123 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 124 | 125 | # Wait for the monitor to be established 126 | assert Helper.wait_until(fn -> 127 | monitor_count(ctx.proxy) == 1 128 | end) 129 | 130 | # Create a duplicate subscription 131 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 132 | 133 | # Assert that no new monitors are created 134 | assert Helper.wait_until(fn -> 135 | monitor_count(ctx.proxy) != 2 136 | end) 137 | end 138 | 139 | test "unsubscribe removes the subscriber", ctx do 140 | subscriber = self() 141 | [target] = pids(1) 142 | subscribe_instructions = [{:subscribe, target}] 143 | unsubscribe_instructions = [{:unsubscribe, target}] 144 | 145 | # Create an initial subscription 146 | GenServer.cast(ctx.proxy, {:process, subscriber, subscribe_instructions}) 147 | 148 | # Wait for the subscriber row to be written 149 | assert Helper.wait_until(fn -> 150 | row_count(Tables.subscribers()) == 1 151 | end) 152 | 153 | # Unsubscribe 154 | GenServer.cast(ctx.proxy, {:process, subscriber, unsubscribe_instructions}) 155 | 156 | # Make sure the subscriber is removed 157 | assert Helper.wait_until(fn -> 158 | row_count(Tables.subscribers()) == 0 159 | end) 160 | end 161 | 162 | test "instruction order is respected (terminal subscribed)", ctx do 163 | subscriber = self() 164 | [target] = pids(1) 165 | instructions = [{:unsubscribe, target}, {:subscribe, target}] 166 | 167 | # Process the instructions 168 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 169 | 170 | # Confirm that the subscriber exists 171 | assert Helper.wait_until(fn -> 172 | row_count(Tables.subscribers()) == 1 173 | end) 174 | 175 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 176 | end 177 | 178 | test "instruction order is respected (terminal unsubscribed)", ctx do 179 | subscriber = self() 180 | [target] = pids(1) 181 | instructions = [{:subscribe, target}, {:unsubscribe, target}] 182 | 183 | # Process the instructions 184 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 185 | 186 | # Confirm that no subscription exists 187 | assert Helper.wait_until(fn -> 188 | row_count(Tables.subscribers()) == 0 189 | end) 190 | end 191 | 192 | test "unsubscribe is isolated to the unsubscriber", ctx do 193 | subscriber = Subscriber.start(self()) 194 | [target] = pids(1) 195 | subscribe_instructions = [{:subscribe, target}] 196 | unsubscribe_instructions = 
[{:unsubscribe, target}] 197 | 198 | # Subscribe both parties to the target 199 | GenServer.cast(ctx.proxy, {:process, self(), subscribe_instructions}) 200 | GenServer.cast(ctx.proxy, {:process, subscriber, subscribe_instructions}) 201 | 202 | # Assert that the subscribers get written to the table 203 | assert Helper.wait_until(fn -> 204 | row_count(Tables.subscribers()) == 2 205 | end) 206 | 207 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 208 | assert :ets.member(Tables.subscribers(), {target, self()}) 209 | 210 | # Unsubscribe on of the parties 211 | GenServer.cast(ctx.proxy, {:process, self(), unsubscribe_instructions}) 212 | 213 | # Assert that the correct row was removed 214 | assert Helper.wait_until(fn -> 215 | row_count(Tables.subscribers()) == 1 216 | end) 217 | 218 | assert :ets.member(Tables.subscribers(), {target, subscriber}) 219 | refute :ets.member(Tables.subscribers(), {target, self()}) 220 | end 221 | 222 | test "unsubscribe is isolated to the target", ctx do 223 | subscriber = self() 224 | [target, other] = pids(2) 225 | 226 | instructions = [ 227 | {:subscribe, target}, 228 | {:unsubscribe, target}, 229 | {:subscribe, other} 230 | ] 231 | 232 | # Process the instructions 233 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 234 | 235 | # Assert that only the other target exists in the table 236 | assert Helper.wait_until(fn -> 237 | row_count(Tables.subscribers()) == 1 238 | end) 239 | 240 | assert :ets.member(Tables.subscribers(), {other, subscriber}) 241 | refute :ets.member(Tables.subscribers(), {target, subscriber}) 242 | end 243 | 244 | test "ERTS monitors persist after unsubscribe", ctx do 245 | subscriber = self() 246 | [target] = pids(1) 247 | subscribe_instructions = [{:subscribe, target}] 248 | unsubscribe_instructions = [{:unsubscribe, target}] 249 | 250 | # Create the initial subscription 251 | GenServer.cast(ctx.proxy, {:process, subscriber, subscribe_instructions}) 252 | 253 | # Assert that the subscriber row is written 254 | assert Helper.wait_until(fn -> 255 | row_count(Tables.subscribers()) == 1 256 | end) 257 | 258 | # Assert that the monitor is established 259 | assert Helper.wait_until(fn -> 260 | monitor_count(ctx.proxy) == 1 261 | end) 262 | 263 | # Unsubscribe 264 | GenServer.cast(ctx.proxy, {:process, subscriber, unsubscribe_instructions}) 265 | 266 | # Assert that the subscriber row was cleaned up 267 | assert Helper.wait_until(fn -> 268 | row_count(Tables.subscribers()) == 0 269 | end) 270 | 271 | # Assert that the monitor persists 272 | assert Helper.wait_until(fn -> 273 | monitor_count(ctx.proxy) == 1 274 | end) 275 | end 276 | end 277 | 278 | describe "DOWN handling" do 279 | test "removes subscribers for the down pid", ctx do 280 | subscriber = Subscriber.start(self()) 281 | [target, other] = pids(2) 282 | instructions = [{:subscribe, target}, {:subscribe, other}] 283 | 284 | # Subscribe both parties to the same target and another process that will be kept alive 285 | GenServer.cast(ctx.proxy, {:process, self(), instructions}) 286 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 287 | 288 | # Assert that the subscribers get written to the table 289 | assert Helper.wait_until(fn -> 290 | row_count(Tables.subscribers()) == 4 291 | end) 292 | 293 | # Kill the target 294 | Process.exit(target, :kill) 295 | 296 | # Assert delivery of messages 297 | assert_receive {:dead, _, [{^target, _}]} 298 | assert_receive {:forward, {:dead, _, [{^target, _}]}} 299 | 300 | # Make sure the subscriber rows 
were cleared out 301 | assert row_count(Tables.subscribers()) == 2 302 | subscribers = :ets.match(Tables.subscribers(), {{other, :"$1"}}) 303 | assert [self()] in subscribers 304 | assert [subscriber] in subscribers 305 | end 306 | 307 | test "truncates reasons", ctx do 308 | subscriber = Subscriber.start(self()) 309 | [target, other] = pids(2) 310 | instructions = [{:subscribe, target}, {:subscribe, other}] 311 | 312 | # Subscribe both parties to the same target and another process that will be kept alive 313 | GenServer.cast(ctx.proxy, {:process, self(), instructions}) 314 | GenServer.cast(ctx.proxy, {:process, subscriber, instructions}) 315 | 316 | # Assert that the subscribers get written to the table 317 | assert Helper.wait_until(fn -> 318 | row_count(Tables.subscribers()) == 4 319 | end) 320 | 321 | reason = {:this, :is, :an, {:especially, {:deeply, {:nested, {:tuple}}}}} 322 | # Kill the target 323 | Process.exit(target, reason) 324 | 325 | # Assert delivery of messages 326 | assert_receive {:dead, _, [{^target, reason}]} 327 | assert_receive {:forward, {:dead, _, [{^target, _}]}} 328 | 329 | assert {:this, :is, :an, {:especially, {:truncated, :truncated}}} == reason 330 | end 331 | 332 | test "truncates state", ctx do 333 | defmodule Crasher do 334 | use GenServer 335 | 336 | def start do 337 | state = for i <- 1..10000, into: %{}, do: {i, i * 2} 338 | GenServer.start(__MODULE__, state) 339 | end 340 | 341 | def init(args) do 342 | {:ok, args} 343 | end 344 | 345 | def crash(pid), do: GenServer.call(pid, :crash) 346 | 347 | def handle_call(nil, _from, state), do: {:reply, :ok, state} 348 | end 349 | 350 | {:ok, crasher} = Crasher.start() 351 | instructions = [{:subscribe, crasher}] 352 | 353 | GenServer.cast(ctx.proxy, {:process, self(), instructions}) 354 | # Assert that the subscribers get written to the table 355 | assert Helper.wait_until(fn -> 356 | row_count(Tables.subscribers()) == 1 357 | end) 358 | 359 | spawn(Crasher, :crash, [crasher]) 360 | 361 | assert_receive {:dead, _, [{^crasher, reason}]}, 500 362 | assert {:function_clause, frames} = reason 363 | 364 | for frame <- frames do 365 | # this generates a big stack, everything should be truncated. 
366 | assert [:truncated] = frame |> Tuple.to_list() |> Enum.uniq() 367 | end 368 | end 369 | end 370 | end 371 | -------------------------------------------------------------------------------- /test/stress_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Stress.Test do 2 | use ExUnit.Case 3 | 4 | alias ZenMonitor.Local.Connector 5 | 6 | @fast_interval 10 7 | @slow_interval 100 8 | 9 | @small_chunk 10 10 | @big_chunk 100_000 11 | 12 | setup do 13 | # Make the Batcher and Dispatcher dispatch at a controlled rate 14 | tune(node(), :batcher, :slow) 15 | tune(node(), :dispatcher, :slow) 16 | 17 | # Make the Connector flush everything very quickly 18 | tune(node(), :connector, :fast) 19 | 20 | start_supervised(ZenMonitor.Supervisor) 21 | {:ok, compatible, nil} = ChildNode.start_link(:zen_monitor, :Compatible) 22 | 23 | on_exit(fn -> 24 | Node.monitor(compatible, true) 25 | 26 | receive do 27 | {:nodedown, ^compatible} -> :ok 28 | end 29 | end) 30 | 31 | # Make the remote batcher flush at a controlled rate 32 | tune(compatible, :batcher, :slow) 33 | 34 | {:ok, down: :down@down, compatible: compatible, remotes: [compatible]} 35 | end 36 | 37 | def tune(remote, :batcher, :fast) do 38 | :rpc.call(remote, Application, :put_env, [ 39 | :zen_monitor, 40 | :batcher_sweep_interval, 41 | @fast_interval 42 | ]) 43 | 44 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :batcher_chunk_size, @big_chunk]) 45 | end 46 | 47 | def tune(remote, :batcher, :slow) do 48 | :rpc.call(remote, Application, :put_env, [ 49 | :zen_monitor, 50 | :batcher_sweep_interval, 51 | @slow_interval 52 | ]) 53 | 54 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :batcher_chunk_size, @small_chunk]) 55 | end 56 | 57 | def tune(remote, :connector, :fast) do 58 | :rpc.call(remote, Application, :put_env, [ 59 | :zen_monitor, 60 | :connector_sweep_interval, 61 | @fast_interval 62 | ]) 63 | 64 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :connector_chunk_size, @big_chunk]) 65 | end 66 | 67 | def tune(remote, :connector, :slow) do 68 | :rpc.call(remote, Application, :put_env, [ 69 | :zen_monitor, 70 | :connector_sweep_interval, 71 | @slow_interval 72 | ]) 73 | 74 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :connector_chunk_size, @small_chunk]) 75 | end 76 | 77 | def tune(remote, :dispatcher, :fast) do 78 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_interval, @fast_interval]) 79 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_amount, @big_chunk]) 80 | end 81 | 82 | def tune(remote, :dispatcher, :slow) do 83 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_interval, @slow_interval]) 84 | :rpc.call(remote, Application, :put_env, [:zen_monitor, :demand_amount, @small_chunk]) 85 | end 86 | 87 | def start_processes(remote, amount) do 88 | Enum.map(1..amount, fn _ -> 89 | Node.spawn(remote, Process, :sleep, [:infinity]) 90 | end) 91 | end 92 | 93 | def stop_processes(targets) do 94 | spawn(fn -> 95 | Enum.each(targets, &Process.exit(&1, :kill)) 96 | end) 97 | end 98 | 99 | def flush_messages() do 100 | send(self(), :flush) 101 | 102 | receive_until_flush([]) 103 | end 104 | 105 | def receive_until_flush(acc) do 106 | receive do 107 | msg -> 108 | if match?(:flush, msg) do 109 | Enum.reverse(acc) 110 | else 111 | receive_until_flush([msg | acc]) 112 | end 113 | after 114 | 0 -> 115 | raise "Flush not found!"
116 | end 117 | end 118 | 119 | describe "Massive remote failure" do 120 | test "local environment configured correctly" do 121 | assert @slow_interval == Application.get_env(:zen_monitor, :batcher_sweep_interval) 122 | assert @small_chunk == Application.get_env(:zen_monitor, :batcher_chunk_size) 123 | 124 | assert @slow_interval == Application.get_env(:zen_monitor, :demand_interval) 125 | assert @small_chunk == Application.get_env(:zen_monitor, :demand_amount) 126 | 127 | assert @fast_interval == Application.get_env(:zen_monitor, :connector_sweep_interval) 128 | assert @big_chunk == Application.get_env(:zen_monitor, :connector_chunk_size) 129 | end 130 | 131 | test "remote environment configured correctly", ctx do 132 | assert @slow_interval == 133 | :rpc.call(ctx.compatible, Application, :get_env, [ 134 | :zen_monitor, 135 | :batcher_sweep_interval 136 | ]) 137 | 138 | assert @small_chunk == 139 | :rpc.call(ctx.compatible, Application, :get_env, [ 140 | :zen_monitor, 141 | :batcher_chunk_size 142 | ]) 143 | end 144 | 145 | test "down messages are throttled", ctx do 146 | # Start a lot of remote processes 147 | remote_pids = start_processes(ctx.compatible, 100_000) 148 | 149 | # Monitor everything 150 | assert :ok = Enum.each(remote_pids, &ZenMonitor.monitor/1) 151 | 152 | # Make sure the connector flushes the monitors over to the remote 153 | connector = Connector.get(ctx.compatible) 154 | 155 | assert Helper.wait_until(fn -> 156 | :sys.get_state(connector).length == 0 157 | end) 158 | 159 | # Assert that the message queue is empty 160 | assert {:message_queue_len, 0} = Process.info(self(), :message_queue_len) 161 | 162 | # Choose some processes to kill 163 | targets = 164 | remote_pids 165 | |> Enum.shuffle() 166 | |> Enum.slice(0, 10_000) 167 | 168 | # Start stopping all the targets 169 | stop_processes(targets) 170 | 171 | # Wait for 10 intervals 172 | Process.sleep(@slow_interval * 10) 173 | 174 | # Get the message queue 175 | messages = flush_messages() 176 | 177 | # Check that we got an appropriate number of messages 178 | flush_length = length(messages) 179 | assert @small_chunk * 5 <= flush_length 180 | assert flush_length <= @small_chunk * 15 181 | 182 | # Check each message is a :DOWN for a stopped process 183 | for message <- messages do 184 | assert {:DOWN, _, :process, received_pid, {:zen_monitor, _}} = message 185 | assert received_pid in targets 186 | end 187 | end 188 | 189 | test "does not crash ZenMonitor", ctx do 190 | # Save the current ZenMonitor pids 191 | connector = Connector.get(ctx.compatible) 192 | local = Process.whereis(ZenMonitor.Local) 193 | proxy = :rpc.call(ctx.compatible, Process, :whereis, [ZenMonitor.Proxy]) 194 | batcher = :rpc.call(ctx.compatible, ZenMonitor.Proxy.Batcher, :get, [connector]) 195 | 196 | # Start a lot of remote processes 197 | remote_pids = start_processes(ctx.compatible, 100_000) 198 | 199 | # Monitor everything 200 | assert :ok = Enum.each(remote_pids, &ZenMonitor.monitor/1) 201 | 202 | # Make sure the connector flushes the monitors over to the remote 203 | assert Helper.wait_until(fn -> 204 | :sys.get_state(connector).length == 0 205 | end) 206 | 207 | # Kill all remote processes 208 | stopper = stop_processes(remote_pids) 209 | 210 | # Wait for the stopper to finish its job 211 | assert Helper.wait_until(fn -> 212 | not Process.alive?(stopper) 213 | end) 214 | 215 | # Make sure that nothing crashed 216 | assert Process.alive?(local) 217 | assert Process.alive?(connector) 218 | assert :rpc.call(ctx.compatible, Process, :alive?,
[proxy]) 219 | assert :rpc.call(ctx.compatible, Process, :alive?, [batcher]) 220 | end 221 | end 222 | end 223 | -------------------------------------------------------------------------------- /test/support/child_node.ex: -------------------------------------------------------------------------------- 1 | defmodule ChildNode do 2 | @moduledoc """ 3 | ChildNode provides facilities for starting another Erlang node on the current machine. 4 | 5 | This module enhances and abstracts the Erlang `:slave` module. After calling `:slave.start_link` to 6 | make sure the child node is running, it ensures that Elixir is started, after which it will run 7 | any function passed in as the `:on_start` param. This function must be compiled and loaded on 8 | both nodes. 9 | 10 | After that, control is handed back to the caller who can use the `:rpc` module to invoke 11 | functions remotely. 12 | 13 | The child node's process is linked to the caller's process, so if the caller dies, so will the 14 | child node. 15 | 16 | If additional logging is required, set the `enable_sasl` option to `true`. 17 | """ 18 | 19 | @type param :: {:enable_sasl, boolean} | {:on_start, (() -> any)} 20 | @type params :: [param] 21 | 22 | defmodule Runner do 23 | @moduledoc """ 24 | When the new node starts up, we often want to set up a supervision tree by calling 25 | a function with `:rpc.call`. However, when the call ends, all the linked processes 26 | in the rpc call will die. This runner encapsulates them and doesn't link to its caller, 27 | so that any processes started by `Runner` will continue to live after the `:rpc` call. 28 | """ 29 | use GenServer 30 | 31 | def start(mod, fun, args) do 32 | GenServer.start(__MODULE__, [mod, fun, args]) 33 | end 34 | 35 | def start(init_fn) when is_function(init_fn) do 36 | GenServer.start(__MODULE__, [init_fn]) 37 | end 38 | 39 | def init([mod, fun, args]) do 40 | rv = apply(mod, fun, args) 41 | {:ok, rv} 42 | end 43 | 44 | def init([init_fn]) do 45 | {:ok, init_fn} 46 | end 47 | 48 | def get(runner_pid) do 49 | GenServer.call(runner_pid, :get) 50 | end 51 | 52 | def do_init(runner_pid, args) do 53 | GenServer.call(runner_pid, {:do_init, args}) 54 | end 55 | 56 | def handle_call({:do_init, args}, _from, init_fn) do 57 | {:reply, init_fn.(args), init_fn} 58 | end 59 | 60 | def handle_call(:get, _from, v) do 61 | {:reply, v, v} 62 | end 63 | end 64 | 65 | @spec start_link(Application.app() | [Application.app()], atom, params, timeout) :: {:ok, node, any} | {:error, any} 66 | def start_link(app_to_start, node_name, params \\ [], timeout \\ 5_000) do 67 | unless Node.alive?() do 68 | {:ok, _} = Node.start(:"local@0.0.0.0") 69 | end 70 | 71 | code_paths = Enum.join(:code.get_path(), " ") 72 | 73 | default_node_start_args = [ 74 | "-setcookie #{Node.get_cookie()}", 75 | "-pa #{code_paths}", 76 | "-connect_all false" 77 | ] 78 | 79 | node_start_args = 80 | if params[:enable_sasl] do 81 | default_node_start_args ++ ["-logger handle_sasl_reports true"] 82 | else 83 | default_node_start_args 84 | end 85 | |> Enum.join(" ") 86 | |> String.to_charlist() 87 | 88 | node_name = to_node_name(node_name) 89 | {:ok, node_name} = :slave.start_link('0.0.0.0', node_name, node_start_args) 90 | {:ok, _} = :rpc.call(node_name, :application, :ensure_all_started, [:elixir]) 91 | 92 | on_start = params[:on_start] 93 | rpc_args = [node_name, app_to_start, on_start, self()] 94 | 95 | case :rpc.call(node_name, __MODULE__, :on_start, rpc_args, timeout) do 96 | {:ok, start_fn_results} -> 97 | {:ok, node_name, start_fn_results} 98 | 99 | {:badrpc, :timeout} -> 100 |
{:error, :timeout} 101 | end 102 | end 103 | 104 | def on_start(node_name, app_to_start, start_callback, _caller) do 105 | case app_to_start do 106 | apps when is_list(apps) -> 107 | for app <- apps do 108 | {:ok, _} = Application.ensure_all_started(app) 109 | end 110 | 111 | app when is_atom(app) -> 112 | {:ok, _started_apps} = Application.ensure_all_started(app) 113 | end 114 | 115 | start_fn_results = 116 | case start_callback do 117 | callback when is_function(callback) -> 118 | {:ok, runner_pid} = Runner.start(callback) 119 | Runner.do_init(runner_pid, node_name) 120 | 121 | {m, f, a} -> 122 | {:ok, runner_pid} = Runner.start(m, f, a) 123 | Runner.get(runner_pid) 124 | 125 | nil -> 126 | nil 127 | end 128 | 129 | {:ok, start_fn_results} 130 | end 131 | 132 | @doc "Runs the MFA in a process on the remote node" 133 | @spec run(node, module(), atom(), [any]) :: any 134 | def run(node, m, f, a) do 135 | {:ok, runner_pid} = :rpc.call(node, Runner, :start, [m, f, a]) 136 | :rpc.call(node, Runner, :get, [runner_pid]) 137 | end 138 | 139 | defp to_node_name(node_name) when is_atom(node_name) do 140 | node_name 141 | |> Atom.to_string() 142 | |> String.split(".") 143 | |> sanitize_node_name 144 | end 145 | 146 | defp sanitize_node_name([node_name]) do 147 | String.to_atom(node_name) 148 | end 149 | 150 | defp sanitize_node_name(node_name) when is_list(node_name) do 151 | node_name 152 | |> List.last() 153 | |> Macro.underscore() 154 | |> String.downcase() 155 | |> String.to_atom() 156 | end 157 | end 158 | -------------------------------------------------------------------------------- /test/support/observable_gen.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Test.Support.ObservableGen do 2 | @moduledoc """ 3 | ObservableGen is a test spy that can observe all calls to call/3 and cast/2 and forward them to 4 | a spy process with an {:observe, :call | :cast, *args} 5 | 6 | It is used in ZenMonitor tests to verify that the proper communication is happening between 7 | various components. 
8 | """ 9 | use Agent 10 | 11 | def start_link(spy) do 12 | Agent.start_link(fn -> spy end, name: __MODULE__) 13 | end 14 | 15 | def call(destination, message, timeout \\ 5000) do 16 | Agent.get(__MODULE__, fn spy -> 17 | send(spy, {:observe, :call, destination, message, timeout}) 18 | end) 19 | 20 | GenServer.call(destination, message, timeout) 21 | end 22 | 23 | def cast(destination, message) do 24 | Agent.get(__MODULE__, fn spy -> 25 | send(spy, {:observe, :cast, destination, message}) 26 | end) 27 | 28 | GenServer.cast(destination, message) 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/support/subscriber.ex: -------------------------------------------------------------------------------- 1 | defmodule ZenMonitor.Test.Support.Subscriber do 2 | def start(spy) do 3 | spawn(__MODULE__, :forward, [spy]) 4 | end 5 | 6 | def forward(spy) do 7 | receive do 8 | message -> send(spy, {:forward, message}) 9 | end 10 | 11 | forward(spy) 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | defmodule Helper do 2 | alias ZenMonitor.{Local, Proxy} 3 | import ExUnit.Assertions 4 | 5 | def await_monitors_established(subscriber \\ nil, refs, target) do 6 | subscriber = subscriber || self() 7 | Enum.each(refs, &await_monitor_established(subscriber, &1, target)) 8 | end 9 | 10 | def await_monitor_established(subscriber \\ nil, ref, target) do 11 | subscriber = subscriber || self() 12 | 13 | assert wait_until(fn -> 14 | local_monitor_established?(subscriber, ref, target) 15 | end), 16 | "Local Monitor #{inspect(ref)}: #{inspect(subscriber)} -> #{inspect(target)} did not get established" 17 | 18 | assert wait_until(fn -> 19 | proxy_monitor_established?(target) 20 | end), 21 | "Proxy Monitor #{inspect(ref)}: #{inspect(subscriber)} -> #{inspect(target)} did not get established" 22 | end 23 | 24 | def await_monitors_cleared(subscriber \\ nil, refs, target) do 25 | subscriber = subscriber || self() 26 | Enum.each(refs, &await_monitor_cleared(subscriber, &1, target)) 27 | end 28 | 29 | def await_monitor_cleared(subscriber \\ nil, ref, target) do 30 | subscriber = subscriber || self() 31 | 32 | assert wait_until(fn -> 33 | !local_monitor_established?(subscriber, ref, target) 34 | end), 35 | "Local Monitor #{inspect(ref)}: #{inspect(subscriber)} -> #{inspect(target)} did not get cleared" 36 | end 37 | 38 | def local_monitor_established?(subscriber \\ nil, ref, target) do 39 | subscriber = subscriber || self() 40 | 41 | monitors = Local.Connector.monitors(target, subscriber) 42 | 43 | ref in monitors 44 | end 45 | 46 | def proxy_monitor_established?(target) do 47 | subscriber = Local.Connector.get(target) 48 | target_node = node(target) 49 | table = Proxy.Tables.subscribers() 50 | 51 | row = 52 | if target_node == Node.self() do 53 | :ets.lookup(table, {target, subscriber}) 54 | else 55 | args = [table, {target, subscriber}] 56 | :rpc.call(target_node, :ets, :lookup, args) 57 | end 58 | 59 | !Enum.empty?(row) 60 | end 61 | 62 | @doc """ 63 | Helper that executes a function until it returns true 64 | 65 | Useful for operations that will eventually complete, instead of sleeping to allow an async 66 | operation to complete, wait_until will call the function in a loop up to the specified number of 67 | attempts with the specified delay between attempts. 
68 | """ 69 | @spec wait_until(fun :: (() -> boolean), attempts :: non_neg_integer, delay :: pos_integer) :: 70 | boolean 71 | def wait_until(fun, attempts \\ 50, delay \\ 100) 72 | 73 | def wait_until(_, 0, _), do: false 74 | 75 | def wait_until(fun, attempts, delay) do 76 | try do 77 | case fun.() do 78 | true -> 79 | true 80 | 81 | _ -> 82 | Process.sleep(delay) 83 | wait_until(fun, attempts - 1, delay) 84 | end 85 | rescue 86 | MatchError -> 87 | Process.sleep(delay) 88 | wait_until(fun, attempts - 1, delay) 89 | end 90 | end 91 | end 92 | 93 | Application.ensure_all_started(:instruments) 94 | 95 | ExUnit.start() 96 | -------------------------------------------------------------------------------- /test/truncator_test.exs: -------------------------------------------------------------------------------- 1 | defmodule TruncatorTest do 2 | use ExUnit.Case 3 | alias ZenMonitor.Truncator 4 | 5 | describe "scalars should pass through" do 6 | test "atoms" do 7 | assert :test_atom == Truncator.truncate(:test_atom) 8 | end 9 | 10 | test "floats" do 11 | assert 1.2 == Truncator.truncate(1.2) 12 | end 13 | 14 | test "integers" do 15 | assert 1 == Truncator.truncate(1) 16 | end 17 | 18 | test "strings" do 19 | assert "hello" == Truncator.truncate("hello") 20 | end 21 | 22 | test "pids" do 23 | pid = self() 24 | assert pid == Truncator.truncate(pid) 25 | end 26 | end 27 | 28 | describe "top level shutdown messages should pass through" do 29 | test "long list that would normally be truncated" do 30 | long_list = [:a, :b, :c, :d, :e, :f, :g] 31 | assert {:shutdown, ^long_list} = Truncator.truncate({:shutdown, long_list}) 32 | end 33 | 34 | test "only at top level, nested shutdown tuples should be truncated" do 35 | long_list = [:a, :b, :c, :d, :e, :f, :g] 36 | assert {:foo, {:shutdown, :truncated}} = Truncator.truncate({:foo, {:shutdown, long_list}}) 37 | end 38 | end 39 | 40 | describe "bistring truncation" do 41 | test "less than limit should pass through" do 42 | input = "test-string" 43 | assert input == Truncator.truncate(input) 44 | end 45 | 46 | test "equal to limit should pass through" do 47 | input = String.duplicate("a", 1024) 48 | assert input == Truncator.truncate(input) 49 | end 50 | 51 | test "greater than limit should be truncated" do 52 | assert <<_::binary-size(1021), "...">> = Truncator.truncate(String.duplicate("a", 2048)) 53 | end 54 | end 55 | 56 | describe "list truncation" do 57 | test "lists of size less than 5 should pass through" do 58 | assert [] == Truncator.truncate([]) 59 | assert [1] == Truncator.truncate([1]) 60 | assert [1, 2] == Truncator.truncate([1, 2]) 61 | assert [1, 2, 3] == Truncator.truncate([1, 2, 3]) 62 | assert [1, 2, 3, 4] == Truncator.truncate([1, 2, 3, 4]) 63 | end 64 | 65 | test "lists of size 5 should be truncated" do 66 | assert :truncated == Truncator.truncate([1, 2, 3, 4, 5]) 67 | end 68 | 69 | test "lists of size greater than 5 should be truncated" do 70 | assert :truncated == Truncator.truncate([1, 2, 3, 4, 5, 6]) 71 | end 72 | end 73 | 74 | describe "tuple truncation" do 75 | test "tuples of size less than 5 should pass through" do 76 | assert {} == Truncator.truncate({}) 77 | assert {1} == Truncator.truncate({1}) 78 | assert {1, 2} == Truncator.truncate({1, 2}) 79 | assert {1, 2, 3} == Truncator.truncate({1, 2, 3}) 80 | assert {1, 2, 3, 4} == Truncator.truncate({1, 2, 3, 4}) 81 | end 82 | 83 | test "tuples of size 5 should be truncated" do 84 | assert :truncated = Truncator.truncate({1, 2, 3, 4, 5}) 85 | end 86 | 87 | test "tuples of size 
greater than 5 should be truncated" do 88 | assert :truncated = Truncator.truncate({1, 2, 3, 4, 5, 6}) 89 | end 90 | end 91 | 92 | describe "map truncation" do 93 | test "maps of size less than 5 should pass through" do 94 | assert %{a: 1} == Truncator.truncate(%{a: 1}) 95 | assert %{a: 1, b: 2} == Truncator.truncate(%{a: 1, b: 2}) 96 | assert %{a: 1, b: 2, c: 3} == Truncator.truncate(%{a: 1, b: 2, c: 3}) 97 | assert %{a: 1, b: 2, c: 3, d: 4} == Truncator.truncate(%{a: 1, b: 2, c: 3, d: 4}) 98 | end 99 | 100 | test "maps of size 5 should be truncated" do 101 | assert :truncated == Truncator.truncate(%{a: 1, b: 2, c: 3, d: 4, e: 5}) 102 | end 103 | 104 | test "maps of size greater than 5 should be truncated" do 105 | assert :truncated == Truncator.truncate(%{a: 1, b: 2, c: 3, d: 4, e: 5, f: 6}) 106 | end 107 | end 108 | 109 | describe "struct truncation" do 110 | defmodule OneFieldStruct do 111 | defstruct a: 1 112 | end 113 | 114 | defmodule TwoFieldStruct do 115 | defstruct a: 1, b: 2 116 | end 117 | 118 | defmodule ThreeFieldStruct do 119 | defstruct a: 1, b: 2, c: 3 120 | end 121 | 122 | defmodule FourFieldStruct do 123 | defstruct a: 1, b: 2, c: 3, d: 4 124 | end 125 | 126 | defmodule FiveFieldStruct do 127 | defstruct a: 1, b: 2, c: 3, d: 4, e: 5 128 | end 129 | 130 | defmodule SixFieldStruct do 131 | defstruct a: 1, b: 2, c: 3, d: 4, e: 5, f: 6 132 | end 133 | 134 | test "structs of size less than 5 should pass through" do 135 | one = %OneFieldStruct{} 136 | two = %TwoFieldStruct{} 137 | three = %ThreeFieldStruct{} 138 | four = %FourFieldStruct{} 139 | 140 | assert one == Truncator.truncate(one) 141 | assert two == Truncator.truncate(two) 142 | assert three == Truncator.truncate(three) 143 | assert four == Truncator.truncate(four) 144 | end 145 | 146 | test "structs of size 5 should be truncated" do 147 | assert :truncated == Truncator.truncate(%FiveFieldStruct{}) 148 | end 149 | 150 | test "structs of size greater than 5 should be truncated" do 151 | assert :truncated == Truncator.truncate(%SixFieldStruct{}) 152 | end 153 | end 154 | 155 | describe "struct robustness" do 156 | test "small unknown struct stays as-is" do 157 | unknown_struct = %{ 158 | :__struct__ => NotARealModule, 159 | a: :b, 160 | c: :d 161 | } 162 | 163 | assert unknown_struct == Truncator.truncate(unknown_struct) 164 | end 165 | 166 | test "large unknown struct should be truncated" do 167 | unknown_struct = %{ 168 | :__struct__ => NotARealModule, 169 | a: :b, 170 | c: :d, 171 | e: :f, 172 | g: :h, 173 | i: :j 174 | } 175 | 176 | assert :truncated == Truncator.truncate(unknown_struct) 177 | end 178 | end 179 | 180 | describe "limited nesting" do 181 | defmodule Nested do 182 | defstruct map: %{}, 183 | list: [], 184 | tuple: {}, 185 | struct: nil 186 | end 187 | 188 | test "it should prevent deeply nested lists" do 189 | nested = [:a, [:b, [:c, [:d, [:e, [:f]]]]]] 190 | 191 | assert :truncated == Truncator.truncate(nested, 0) 192 | assert [:truncated, :truncated] == Truncator.truncate(nested, 1) 193 | assert [:a, [:truncated, :truncated]] == Truncator.truncate(nested, 2) 194 | assert [:a, [:b, [:truncated, :truncated]]] == Truncator.truncate(nested, 3) 195 | assert [:a, [:b, [:c, [:truncated, :truncated]]]] == Truncator.truncate(nested, 4) 196 | end 197 | 198 | test "it should prevent deeply nested maps" do 199 | nested = %{a: %{b: %{c: %{d: %{}}}}} 200 | 201 | assert :truncated == Truncator.truncate(nested, 0) 202 | assert %{a: :truncated} == Truncator.truncate(nested, 1) 203 | assert %{a: %{b: :truncated}} ==
Truncator.truncate(nested, 2) 204 | assert %{a: %{b: %{c: :truncated}}} == Truncator.truncate(nested, 3) 205 | assert %{a: %{b: %{c: %{d: :truncated}}}} == Truncator.truncate(nested, 4) 206 | assert nested == Truncator.truncate(nested, 5) 207 | end 208 | 209 | test "it should prevent deeply nested tuples" do 210 | nested = {:a, {:b, {:c, {:d, {}}}}} 211 | 212 | assert :truncated == Truncator.truncate(nested, 0) 213 | assert {:truncated, :truncated} == Truncator.truncate(nested, 1) 214 | assert {:a, {:truncated, :truncated}} == Truncator.truncate(nested, 2) 215 | assert {:a, {:b, {:truncated, :truncated}}} == Truncator.truncate(nested, 3) 216 | assert {:a, {:b, {:c, {:truncated, :truncated}}}} == Truncator.truncate(nested, 4) 217 | assert {:a, {:b, {:c, {:d, {}}}}} == Truncator.truncate(nested, 5) 218 | end 219 | 220 | test "it should prevent deeply nested structs" do 221 | assert %Nested{map: :truncated} = Truncator.truncate(%Nested{map: %{a: 1}}, 1) 222 | 223 | assert %Nested{map: %{a: :truncated}} = Truncator.truncate(%Nested{map: %{a: %{b: 2}}}, 2) 224 | 225 | assert %Nested{list: :truncated} = Truncator.truncate(%Nested{list: [1, [2, [3]]]}, 1) 226 | 227 | assert %Nested{list: [:truncated, :truncated]} = 228 | Truncator.truncate(%Nested{list: [1, [2, [3]]]}, 2) 229 | 230 | assert %Nested{struct: :truncated} = 231 | Truncator.truncate(%Nested{struct: MapSet.new([1, 2, 3])}, 1) 232 | 233 | assert %Nested{struct: %MapSet{map: :truncated}} = 234 | Truncator.truncate(%Nested{struct: MapSet.new([1, 2, 3])}, 2) 235 | 236 | assert %Nested{tuple: :truncated} = 237 | Truncator.truncate(%Nested{tuple: {:a, {:b, {:c, {}}}}}, 1) 238 | 239 | assert %Nested{tuple: {:truncated, :truncated}} = 240 | Truncator.truncate(%Nested{tuple: {:a, {:b, {:c, {}}}}}, 2) 241 | 242 | assert %Nested{tuple: {:a, {:truncated, :truncated}}} = 243 | Truncator.truncate(%Nested{tuple: {:a, {:b, {:c, {}}}}}, 3) 244 | end 245 | end 246 | end 247 | --------------------------------------------------------------------------------
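Taken together, the truncator tests above pin down the rules `ZenMonitor.Truncator` applies to exit reasons before they are handed to subscribers (the behavior exercised by the proxy's "truncates reasons" and "truncates state" tests earlier): scalars and a top-level `{:shutdown, _}` tuple pass through, oversized binaries are clipped, any list, tuple, map, or struct with five or more entries collapses to `:truncated`, and nesting is cut off at a depth limit. A brief illustrative sketch follows; the expected results are inferred only from the assertions above, not from any additional documentation:

```elixir
alias ZenMonitor.Truncator

long_list = [:a, :b, :c, :d, :e, :f, :g]

# A top-level {:shutdown, _} reason passes through untouched.
Truncator.truncate({:shutdown, long_list})
#=> {:shutdown, [:a, :b, :c, :d, :e, :f, :g]}

# Anywhere else, a collection with five or more entries collapses.
Truncator.truncate({:error, long_list})
#=> {:error, :truncated}

# The optional second argument bounds how deep the truncator descends.
Truncator.truncate(%{a: %{b: %{c: :boom}}}, 2)
#=> %{a: %{b: :truncated}}
```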