├── .formatter.exs ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── bench └── tracker.exs ├── config └── config.exs ├── lib ├── swarm.ex └── swarm │ ├── app.ex │ ├── distribution │ ├── ring.ex │ ├── static_quorum_ring.ex │ └── strategy.ex │ ├── logger.ex │ ├── registry.ex │ └── tracker │ ├── crdt.ex │ ├── entry.ex │ └── tracker.ex ├── mix.exs ├── mix.lock ├── src └── swarm.erl └── test ├── crdt_test.exs ├── distributed_test.exs ├── distribution └── static_quorum_ring_test.exs ├── integration_test.exs ├── quorum_test.exs ├── registry_test.exs ├── support ├── cluster.ex ├── example_sup.ex ├── example_worker.ex ├── node_case.ex ├── nodes.ex ├── restart_worker.ex ├── run.sh ├── sys.config └── sys_debug.config ├── test_helper.exs ├── tracker_replica_event_test.exs └── tracker_sync_test.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | [ 2 | inputs: ["mix.exs", "{config,lib,test}/**/*.{ex,exs}"] 3 | ] 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode elixir_ls plugin artifacts 2 | /.elixir_ls 3 | 4 | # The directory Mix will write compiled artifacts to. 5 | /_build 6 | 7 | # If you run "mix test --cover", coverage assets end up here. 8 | /cover 9 | 10 | # The directory Mix downloads your dependencies sources to. 11 | /deps 12 | 13 | # Where 3rd-party dependencies like ExDoc output generated docs. 14 | /doc 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Intellij Files 23 | /.idea/ 24 | /swarm.iml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: elixir 2 | elixir: 3 | - 1.6 4 | - 1.7 5 | otp_release: 6 | - 20.0 7 | - 21.0 8 | env: 9 | - MIX_ENV=test VERBOSE_TESTS=true 10 | cache: 11 | directories: 12 | - _build 13 | - deps 14 | before_script: 15 | - epmd -daemon 16 | - mix deps.get 17 | script: mix test 18 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Next Release 2 | 3 | ### Changed 4 | 5 | N/A 6 | 7 | ### Added 8 | 9 | - New `Swarm.Tracker.handoff/2` function which moves all processes of a worker to the remaining ones, so the worker node can be shut down gracefully [#83](https://github.com/bitwalker/swarm/pull/83). 10 | 11 | ### Removed 12 | 13 | N/A 14 | 15 | ### Fixed 16 | 17 | Don't attempt to hand-off or restart processes started with `Swarm.register_name/2` ([#63](https://github.com/bitwalker/swarm/pull/63)). Fixes #62. 18 | 19 | ## 3.1 20 | 21 | ### Changed 22 | 23 | - Default `node_blacklist` was expanded to ignore hot upgrade scripting nodes as setup by exrm/relx/distillery. 24 | 25 | ### Added 26 | 27 | - New distribution strategy module `Swarm.Distribution.StaticQuorumRing` used to provide consistency during a network partition ([#38](https://github.com/bitwalker/swarm/pull/38)). 28 | - Name registration error returned if no available node ([#42](https://github.com/bitwalker/swarm/pull/42)). 
29 | 30 | ### Fixed 31 | 32 | - When registering a name via `register_name/4` which is already registered, 33 | ensure the process we created via `apply/3` is killed. 34 | - Remember process joined groups when nodes topology change ([#37](https://github.com/bitwalker/swarm/pull/37)). 35 | - Retry node up when `:swarm` fails to start ([#40](https://github.com/bitwalker/swarm/pull/40)). 36 | - Do not break local start order of application ([#43](https://github.com/bitwalker/swarm/pull/43)). 37 | - Add local registration when restarted named process is already started but unknown locally ([#46](https://github.com/bitwalker/swarm/pull/46)). 38 | - Retry starting remote process when module not yet available on target node ([#56](https://github.com/bitwalker/swarm/pull/56)). 39 | 40 | ## 2.0 41 | 42 | ### Removed 43 | 44 | - Clustering functionality, this is now provided by the `libcluster` package 45 | - `:autocluster` config setting 46 | 47 | ### Changed 48 | 49 | - `debug: true` now enables `:sys` tracing of the tracker, use the Logger level to disable `:debug` level logs when `debug: false` 50 | 51 | ### Added 52 | 53 | - This file 54 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | ## Copyright (c) 2016 Paul Schoenfelder 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Swarm 2 | 3 | [![Hex.pm Version](http://img.shields.io/hexpm/v/swarm.svg?style=flat)](https://hex.pm/packages/swarm) [![Build Status](https://travis-ci.com/bitwalker/swarm.svg?branch=master)](https://travis-ci.com/bitwalker/swarm) 4 | 5 | **NOTE**: If you are upgrading from 1.0, be aware that the autoclustering functionality has been extracted 6 | to its own package, which you will need to depend on if you use that feature. 7 | The package is [libcluster](http://github.com/bitwalker/libcluster) and is available on 8 | [Hex](https://hex.pm/packages/libcluster). Please be sure to read over the README to make sure your 9 | config is properly updated. 
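For example, if you relied on Swarm's old autoclustering, your deps might look something like this (the version requirements shown are illustrative; check Hex for the current releases):

```elixir
defp deps do
  [
    # libcluster now provides the autoclustering formerly bundled with Swarm
    {:libcluster, "~> 3.0"},
    {:swarm, "~> 3.0"}
  ]
end
```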
10 | 11 | Swarm is a global distributed registry, offering a feature set similar to that of `gproc`, 12 | but architected to handle dynamic node membership and large volumes of process registrations 13 | being created/removed in short time windows. 14 | 15 | To be more clear, Swarm was born out of the need for a global process registry which could 16 | handle large numbers of persistent processes representing devices/device connections, which 17 | needed to be distributed around a cluster of Erlang nodes, and easily found. Messages need 18 | to be routed to those processes from anywhere in the cluster, both individually, and as groups. 19 | Additionally, those processes need to be shifted around the cluster based on cluster topology 20 | changes, or restarted if their owning node goes down. 21 | 22 | Before writing Swarm, I tried both `global` and `gproc`, but the former is not very flexible, and 23 | both of them require leader election, which, in the face of dynamic node membership and the sheer 24 | volume of registrations, ended up causing deadlocks/timeouts during leadership contention. 25 | 26 | I also attempted to use `syn`, but because it uses `mnesia` at the time, dynamic node membership as a requirement 27 | meant it was dead on arrival for my use case. 28 | 29 | In short, are you running a cluster of Erlang nodes under something like Kubernetes? If so, Swarm is 30 | for you! 31 | 32 | View the docs [here](https://hexdocs.pm/swarm). 33 | 34 | **PLEASE READ**: If you are giving Swarm a spin, it is important to understand that you can concoct scenarios whereby 35 | the registry appears to be out of sync temporarily, this is a side effect of an eventually consistent model and does not mean that 36 | Swarm is not working correctly, rather you need to ensure that applications you build on top of Swarm are written to embrace eventual 37 | consistency, such that periods of inconsistency are tolerated. For the most part though, the registry replicates extremely 38 | quickly, so noticeable inconsistency is more of an exception than a rule, but a proper distributed system should always be designed to 39 | tolerate the exceptions, as they become more and more common as you scale up. If however you notice extreme inconsistency or delayed 40 | replication, then it is possible it may be a bug, or performance issue, so feel free to open an issue if you are unsure, and we will gladly look into it. 41 | 42 | ## Installation 43 | 44 | ```elixir 45 | defp deps do 46 | [{:swarm, "~> 3.0"}] 47 | end 48 | ``` 49 | 50 | ## Features 51 | 52 | - automatic distribution of registered processes across 53 | the cluster based on a consistent hashing algorithm, 54 | where names are partitioned across nodes based on their hash. 55 | - easy [handoff of processes](#process-handoff) between one node and another, including 56 | handoff of current process state. 57 | - can do simple registration with `{:via, :swarm, name}` 58 | - both an Erlang and Elixir API 59 | 60 | ## Restrictions 61 | 62 | - auto-balancing of processes in the cluster requires registrations to be done via 63 | `register_name/5`, which takes module/function/args params, and handles starting 64 | the process for you. The MFA must return `{:ok, pid}`. 65 | This is how Swarm handles process handoff between nodes, and automatic restarts when nodedown 66 | events occur and the cluster topology changes. 67 | 68 | ### Process handoff 69 | 70 | Processes may be redistributed between nodes when a node joins, or leaves, a cluster. 
You can indicate whether the handoff should simply restart the process on the new node, start the process and then send it the handoff message containing state, or ignore the handoff and remain on its current node. 71 | 72 | Process state can be transferred between running nodes during process redistribution by using the `{:swarm, :begin_handoff}` and `{:swarm, :end_handoff, state}` callbacks. However process state will be lost when a node hosting a distributed process terminates. In this scenario you must restore the state yourself. 73 | 74 | ## Consistency Guarantees 75 | 76 | Like any distributed system, a choice must be made in terms of guarantees provided. You can choose between 77 | availability or consistency during a network partition by selecting the appropriate process distribution strategy. 78 | 79 | Swarm provides two strategies for you to use: 80 | 81 | - #### `Swarm.Distribution.Ring` 82 | 83 | This strategy favors availability over consistency, even though it is eventually consistent, as 84 | network partitions, when healed, will be resolved by asking any copies of a given name that live on 85 | nodes where they don't belong to shutdown. 86 | 87 | Network partitions result in all partitions running an instance of processes created with Swarm. 88 | Swarm was designed for use in an IoT platform, where process names are generally based on physical 89 | device ids, and as such, the consistency issue is less of a problem. If events get routed to two 90 | separate partitions, it's generally not an issue if those events are for the same device. However 91 | this is clearly not ideal in all situations. Swarm also aims to be fast, so registrations and 92 | lookups must be as low latency as possible, even when the number of processes in the registry grows 93 | very large. This is achieved without consensus by using a consistent hash of the name which 94 | deterministically defines which node a process belongs on, and all requests to start a process on 95 | that node will be serialized through that node to prevent conflicts. 96 | 97 | This is the default strategy and requires no configuration. 98 | 99 | - #### `Swarm.Distribution.StaticQuorumRing` 100 | 101 | A quorum is the minimum number of nodes that a distributed cluster has to obtain in order to be 102 | allowed to perform an operation. This can be used to enforce consistent operation in a distributed 103 | system. 104 | 105 | You configure the quorum size by defining the minimum number of nodes that must be connected in the 106 | cluster to allow process registration and distribution. Calls to `Swarm.register_name/5` will return `{:error, :no_node_available}` if there are fewer nodes available than the configured minimum quorum size. 107 | 108 | In a network partition, the partition containing at least the quorum size number of clusters will 109 | continue operation. Processes running on the other side of the split will be stopped and restarted 110 | on the active side. This ensures that only one instance of a registered process will be running in 111 | the cluster. 112 | 113 | You must configure this strategy and its minimum quorum size using the `:static_quorum_size` setting: 114 | 115 | ```elixir 116 | config :swarm, 117 | distribution_strategy: Swarm.Distribution.StaticQuorumRing, 118 | static_quorum_size: 5 119 | ``` 120 | 121 | The quorum size should be set to half the cluster size, plus one node. So a three node cluster 122 | would be two, a five node cluster is three, and a nine node cluster is five. 
You *must* not add more 123 | than 2 x quorum size - 1 nodes to the cluster as this would cause a network split to result in 124 | both partitions continuing operation. 125 | 126 | Processes are distributed amongst the cluster using the same consistent hash of their name as in 127 | the ring strategy above. 128 | 129 | This strategy is a good choice when you have a fixed number of nodes in the cluster. 130 | 131 | ## Clustering 132 | 133 | Swarm pre-2.0 included auto-clustering functionality, but that has been split out into its own package, 134 | [libcluster](https://github.com/bitwalker/libcluster). Swarm works out of the box with Erlang's distribution 135 | tools (i.e. `Node.connect/1`, `:net_kernel.connect_node/1`, etc.), but if you need the auto-clustering that Swarm 136 | previously provided, you will need to add `:libcluster` to your deps, and make sure it's in your applications 137 | list *before* `:swarm`. Some of the configuration has changed slightly in `:libcluster`, so be sure to review 138 | the docs. 139 | 140 | ### Node Blacklisting/Whitelisting 141 | 142 | You can explicitly whitelist or blacklist nodes to prevent certain nodes from being included in Swarm's consistent 143 | hash ring. This is done with either the `node_whitelist` and `node_blacklist` settings respectively. These settings 144 | must be lists containing either literal strings or valid Elixir regex patterns as either string or regex literals. 145 | If no whitelist is set, then the blacklist is used, and if no blacklist is provided, the default blacklist includes 146 | two patterns, in both cases to ignore nodes which are created by Relx/ExRM/Distillery when using releases, in order 147 | to setup remote shells (the first) and hot upgrade scripting (the second), the patterns can be found in this repo's 148 | `config/config.exs` file, and you can find a quick example below: 149 | 150 | ```elixir 151 | config :swarm, 152 | node_whitelist: [~r/^myapp-[\d]@.*$/] 153 | ``` 154 | 155 | The above will only allow nodes named something like `myapp-1@somehost` to be included in Swarm's clustering. **NOTE**: 156 | It is important to understand that this does not prevent those nodes from connecting to the cluster, only that Swarm will 157 | not include those nodes in its distribution algorithm, or communicate with those nodes. 158 | 159 | ## Registration/Process Grouping 160 | 161 | Swarm is intended to be used by registering processes *before* they are created, and letting Swarm start 162 | them for you on the proper node in the cluster. This is done via `Swarm.register_name/5`. You may also register 163 | processes the normal way, i.e. `GenServer.start_link({:via, :swarm, name}, ...)`. Swarm will manage these 164 | registrations, and replicate them across the cluster, however these processes will not be moved in response 165 | to cluster topology changes. 166 | 167 | Swarm also offers process grouping, similar to the way `gproc` does properties. You "join" a process to a group 168 | after it is started, (beware of doing so in `init/1` outside of a Task, or it will deadlock), with `Swarm.join/2`. 169 | You can then publish messages (i.e. `cast`) with 170 | `Swarm.publish/2`, and/or call all processes in a group and collect results (i.e. `call`) with `Swarm.multi_call/2` or 171 | `Swarm.multi_call/3`. Leaving a group can be done with `Swarm.leave/2`, but will automatically be done when a process 172 | dies. Join/leave can be used to do pubsub like things, or perform operations over a group of related processes. 
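As a quick sketch of the grouping API (the group name `:foo`, the messages, and `pid` are illustrative; `pid` is assumed to be an already-running GenServer that understands these messages):

```elixir
# join an already-started process to the `:foo` group
:ok = Swarm.join(:foo, pid)

# broadcast a message to every member (delivered via handle_info/2)
:ok = Swarm.publish(:foo, {:notice, :something_happened})

# call every member and collect the replies (delivered via handle_call/3)
statuses = Swarm.multi_call(:foo, :get_status, 5_000)

# membership is dropped automatically when the process dies, or explicitly:
:ok = Swarm.leave(:foo, pid)
```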
173 | 174 | ## Debugging/Troubleshooting 175 | 176 | By configuring Swarm with `debug: true` and setting Logger's log level to `:debug`, you can get much more 177 | information about what it is doing during operation to troubleshoot issues. 178 | 179 | To dump the tracker's state, you can use `:sys.get_state(Swarm.Tracker)` or `:sys.get_status(Swarm.Tracker)`. 180 | The former will dump the tracker state including what nodes it is tracking, what nodes are in the hash ring, 181 | and the state of the interval tree clock. The latter will dump more detailed process info, including the current 182 | function and its arguments. This is particularly useful if it appears that the tracker is stuck and not doing 183 | anything. If you do find such things, please gist all of these results and open an issue so that I can fix these 184 | issues if they arise. 185 | 186 | ## Example 187 | 188 | The following example shows a simple case where workers are dynamically created in response 189 | to some events under a supervisor, and we want them to be distributed across the cluster and 190 | be discoverable by name from anywhere in the cluster. Swarm is a perfect fit for this 191 | situation. 192 | 193 | ```elixir 194 | defmodule MyApp.Supervisor do 195 | @moduledoc """ 196 | This is the supervisor for the worker processes you wish to distribute 197 | across the cluster, Swarm is primarily designed around the use case 198 | where you are dynamically creating many workers in response to events. It 199 | works with other use cases as well, but that's the ideal use case. 200 | """ 201 | use Supervisor 202 | 203 | def start_link() do 204 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 205 | end 206 | 207 | def init(_) do 208 | children = [ 209 | worker(MyApp.Worker, [], restart: :temporary) 210 | ] 211 | supervise(children, strategy: :simple_one_for_one) 212 | end 213 | 214 | @doc """ 215 | Registers a new worker, and creates the worker process 216 | """ 217 | def register(worker_name) do 218 | {:ok, _pid} = Supervisor.start_child(__MODULE__, [worker_name]) 219 | end 220 | end 221 | 222 | defmodule MyApp.Worker do 223 | @moduledoc """ 224 | This is the worker process, in this case, it simply posts on a 225 | random recurring interval to stdout. 226 | """ 227 | def start_link(name) do 228 | GenServer.start_link(__MODULE__, [name]) 229 | end 230 | 231 | def init([name]) do 232 | {:ok, {name, :rand.uniform(5_000)}, 0} 233 | end 234 | 235 | # called when a handoff has been initiated due to changes 236 | # in cluster topology, valid response values are: 237 | # 238 | # - `:restart`, to simply restart the process on the new node 239 | # - `{:resume, state}`, to hand off some state to the new process 240 | # - `:ignore`, to leave the process running on its current node 241 | # 242 | def handle_call({:swarm, :begin_handoff}, _from, {name, delay}) do 243 | {:reply, {:resume, {name, delay}}, {name, delay}} 244 | end 245 | # called after the process has been restarted on its new node, 246 | # and the old process' state is being handed off. This is only 247 | # sent if the return to `begin_handoff` was `{:resume, state}`. 248 | # **NOTE**: This is called *after* the process is successfully started, 249 | # so make sure to design your processes around this caveat if you 250 | # wish to hand off state like this. 
251 | def handle_cast({:swarm, :end_handoff, delay}, {name, _}) do 252 | {:noreply, {name, delay}} 253 | end 254 | # called when a network split is healed and the local process 255 | # should continue running, but a duplicate process on the other 256 | # side of the split is handing off its state to us. You can choose 257 | # to ignore the handoff state, or apply your own conflict resolution 258 | # strategy 259 | def handle_cast({:swarm, :resolve_conflict, _delay}, state) do 260 | {:noreply, state} 261 | end 262 | 263 | def handle_info(:timeout, {name, delay}) do 264 | IO.puts "#{inspect name} says hi!" 265 | Process.send_after(self(), :timeout, delay) 266 | {:noreply, {name, delay}} 267 | end 268 | # this message is sent when this process should die 269 | # because it is being moved, use this as an opportunity 270 | # to clean up 271 | def handle_info({:swarm, :die}, state) do 272 | {:stop, :shutdown, state} 273 | end 274 | end 275 | 276 | defmodule MyApp.ExampleUsage do 277 | ...snip... 278 | 279 | @doc """ 280 | Starts worker and registers name in the cluster, then joins the process 281 | to the `:foo` group 282 | """ 283 | def start_worker(name) do 284 | {:ok, pid} = Swarm.register_name(name, MyApp.Supervisor, :register, [name]) 285 | Swarm.join(:foo, pid) 286 | end 287 | 288 | @doc """ 289 | Gets the pid of the worker with the given name 290 | """ 291 | def get_worker(name), do: Swarm.whereis_name(name) 292 | 293 | @doc """ 294 | Gets all of the pids that are members of the `:foo` group 295 | """ 296 | def get_foos(), do: Swarm.members(:foo) 297 | 298 | @doc """ 299 | Call some worker by name 300 | """ 301 | def call_worker(name, msg), do: GenServer.call({:via, :swarm, name}, msg) 302 | 303 | @doc """ 304 | Cast to some worker by name 305 | """ 306 | def cast_worker(name, msg), do: GenServer.cast({:via, :swarm, name}, msg) 307 | 308 | @doc """ 309 | Publish a message to all members of group `:foo` 310 | """ 311 | def publish_foos(msg), do: Swarm.publish(:foo, msg) 312 | 313 | @doc """ 314 | Call all members of group `:foo` and collect the results, 315 | any failures or nil values are filtered out of the result list 316 | """ 317 | def call_foos(msg), do: Swarm.multi_call(:foo, msg) 318 | 319 | ...snip... 320 | end 321 | ``` 322 | 323 | ## License 324 | 325 | MIT 326 | 327 | ## Testing 328 | 329 | `mix test` runs a variety of tests, most of them use a cluster of 330 | Elixir nodes to test the tracker and the registry. If you want more 331 | verbose output during the tests, run them like this: 332 | 333 | # SWARM_DEBUG=true mix test 334 | 335 | This sets the log level to `:debug`, runs ExUnit with `--trace`, and 336 | enables GenServer tracing on the Tracker processes. 337 | 338 | ### Executing the tests locally 339 | In order to execute the tests locally you'll need to have 340 | [Erlang Port Mapper Daemon](http://erlang.org/doc/man/epmd.html) running. 
341 | 342 | If you don't have `epmd` running you can start it using the following command: 343 | 344 | epmd -daemon 345 | 346 | 347 | ## TODO 348 | 349 | - automated testing (some are present) 350 | - QuickCheck model 351 | -------------------------------------------------------------------------------- /bench/tracker.exs: -------------------------------------------------------------------------------- 1 | Application.put_env(:swarm, :debug, false) 2 | Application.ensure_started(:swarm) 3 | 4 | defmodule SwarmTest.Worker do 5 | use GenServer 6 | 7 | def start_link(), do: GenServer.start_link(__MODULE__, []) 8 | def init(_), do: {:ok, nil} 9 | 10 | def handle_call(msg, _from, state) do 11 | {:reply, msg, state} 12 | end 13 | end 14 | 15 | :rand.seed(:exs64) 16 | 17 | # Best single-node run so far 470ms for 10k 18 | Benchee.run(%{time: 10}, %{ 19 | "Swarm.register_name/4" => fn -> 20 | for i <- 1..10_000 do 21 | r = :rand.uniform(100_000_000) 22 | {:ok, _pid} = Swarm.register_name({:myapp, i, r}, SwarmTest.Worker, :start_link, []) 23 | end 24 | end 25 | }) 26 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :swarm, 4 | nodes: [:"node1@127.0.0.1", :"node2@127.0.0.1"], 5 | sync_nodes_timeout: 0, 6 | anti_entropy_interval: 5_000, 7 | node_blacklist: [ 8 | # the following blacklists nodes set up by exrm/relx/distillery 9 | # for remote shells (the first) and hot upgrade scripting (the second) 10 | ~r/^primary@.+$/, 11 | ~r/^remsh.*$/, 12 | ~r/^.+_upgrader_.+$/ 13 | # or using strings.. 14 | # "some_node" - literals 15 | # "^remsh.*$" - regex patterns 16 | ], 17 | node_whitelist: [ 18 | # the same type of list as node_blacklist 19 | ] 20 | 21 | config :logger, level: :warn 22 | 23 | if System.get_env("SWARM_DEBUG") == "true" do 24 | config :swarm, debug: true 25 | config :logger, level: :debug 26 | end 27 | 28 | config :porcelain, 29 | goon_warn_if_missing: false 30 | -------------------------------------------------------------------------------- /lib/swarm.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm do 2 | @moduledoc """ 3 | This is the public Elixir API for `:swarm`. 4 | """ 5 | use Application 6 | 7 | @doc """ 8 | Starts the Swarm application. You should not need to do this unless 9 | you are manually handling Swarm's application lifecycle. 10 | """ 11 | def start(_type, _args) do 12 | Swarm.App.start_link() 13 | end 14 | 15 | @doc """ 16 | Registers the given name to the given pid, however names 17 | registered this way will not be shifted when the cluster 18 | topology changes, but this allows you to use `:swarm` as 19 | a distributed process registry, including registering names 20 | with `{:via, :swarm, name}`. 21 | """ 22 | @spec register_name(term, pid) :: :yes | :no 23 | def register_name(name, pid) when is_pid(pid) do 24 | case Swarm.Registry.register(name, pid) do 25 | {:ok, _} -> :yes 26 | _ -> :no 27 | end 28 | end 29 | 30 | @doc """ 31 | Similar to register_name/2, except this version takes module/function/args 32 | parameters, and starts the process, registers the pid with the given name, 33 | and handles cluster topology changes by restarting the process on its new 34 | node using the given MFA. 35 | 36 | This version also returns an ok tuple with the pid if it registers successfully, 37 | or an error tuple if registration fails. 
You cannot use this with processes which 38 | are already started, it must be started by `:swarm`. 39 | 40 | A call to `Swarm.register_name/5` will return `{:error, :no_node_available}` 41 | when the configured distribution strategy returns `:undefined` as the node to 42 | host the named process. This indicates that there are too few nodes available to 43 | start a process. You can retry the name registration while waiting for nodes 44 | to join the cluster. 45 | 46 | Provide an optional `:timeout` value to limit the duration of register name calls. 47 | The default value is `:infinity` to block indefinitely. 48 | """ 49 | @spec register_name(term, atom(), atom(), [term]) :: {:ok, pid} | {:error, term} 50 | @spec register_name(term, atom(), atom(), [term], non_neg_integer() | :infinity) :: 51 | {:ok, pid} | {:error, term} 52 | def register_name(name, m, f, a, timeout \\ :infinity) 53 | def register_name(name, m, f, a, timeout), do: Swarm.Registry.register(name, m, f, a, timeout) 54 | 55 | @doc """ 56 | Either finds the named process in the swarm or registers it using the register function. 57 | """ 58 | @spec whereis_or_register_name(term, atom(), atom(), [term]) :: {:ok, pid} | {:error, term} 59 | @spec whereis_or_register_name(term, atom(), atom(), [term], non_neg_integer() | :infinity) :: 60 | {:ok, pid} | {:error, term} 61 | def whereis_or_register_name(name, m, f, a, timeout \\ :infinity) 62 | 63 | def whereis_or_register_name(name, m, f, a, timeout), 64 | do: Swarm.Registry.whereis_or_register(name, m, f, a, timeout) 65 | 66 | @doc """ 67 | Unregisters the given name from the registry. 68 | """ 69 | @spec unregister_name(term) :: :ok 70 | defdelegate unregister_name(name), to: Swarm.Registry, as: :unregister 71 | 72 | @doc """ 73 | Get the pid of a registered name. 74 | """ 75 | @spec whereis_name(term) :: pid | :undefined 76 | defdelegate whereis_name(name), to: Swarm.Registry, as: :whereis 77 | 78 | @doc """ 79 | Joins a process to a group 80 | """ 81 | @spec join(term, pid) :: :ok 82 | defdelegate join(group, pid), to: Swarm.Registry 83 | 84 | @doc """ 85 | Removes a process from a group 86 | """ 87 | @spec leave(term, pid) :: :ok 88 | defdelegate leave(group, pid), to: Swarm.Registry 89 | 90 | @doc """ 91 | Gets all the members of a group. Returns a list of pids. 92 | """ 93 | @spec members(term()) :: [pid] 94 | defdelegate members(group), to: Swarm.Registry 95 | 96 | @doc """ 97 | Gets a list of all registered names and their pids 98 | """ 99 | @spec registered() :: [{name :: term, pid}] 100 | defdelegate registered, to: Swarm.Registry 101 | 102 | @doc """ 103 | Publishes a message to a group. This is done via `Kernel.send/2`, 104 | so GenServers and the like will receive it via `handle_info/2`. 105 | """ 106 | @spec publish(term, term) :: :ok 107 | defdelegate publish(group, msg), to: Swarm.Registry 108 | 109 | @doc """ 110 | Calls each process in a group, and collects the results into a list. 111 | The order of the results is not guaranteed. Calls are made via `GenServer.call/2`, 112 | so process will need to handle a message in that format. 113 | """ 114 | @spec multi_call(term, term) :: [any()] 115 | defdelegate multi_call(group, msg), to: Swarm.Registry 116 | 117 | @doc """ 118 | Same as `multi_call/2`, except allows for a configurable timeout. The timeout 119 | is per-call, but since all calls are done in parallel, this is effectively the absolute 120 | timeout as well. 
121 | """ 122 | @spec multi_call(term, term, pos_integer) :: [any()] 123 | defdelegate multi_call(group, msg, timeout), to: Swarm.Registry 124 | 125 | @doc """ 126 | This is primarily here for use by the standard library facilities for sending messages 127 | to a process, e.g. by `GenServer.cast/2`. It sends a message to a process by name, using 128 | `Kernel.send/2`. 129 | """ 130 | @spec send(term, term) :: :ok 131 | defdelegate send(name, msg), to: Swarm.Registry 132 | end 133 | -------------------------------------------------------------------------------- /lib/swarm/app.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.App do 2 | @moduledoc false 3 | use Supervisor 4 | 5 | def start_link() do 6 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 7 | end 8 | 9 | def init(_) do 10 | children = [ 11 | supervisor(Task.Supervisor, [[name: Swarm.TaskSupervisor]]), 12 | worker(Swarm.Registry, []), 13 | worker(Swarm.Tracker, []) 14 | ] 15 | 16 | supervise(children, strategy: :one_for_one) 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/swarm/distribution/ring.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Distribution.Ring do 2 | @moduledoc false 3 | use Swarm.Distribution.Strategy 4 | 5 | def create(), do: HashRing.new() 6 | def add_node(ring, node), do: HashRing.add_node(ring, node) 7 | def add_node(ring, node, weight), do: HashRing.add_node(ring, node, weight) 8 | def add_nodes(ring, nodes), do: HashRing.add_nodes(ring, nodes) 9 | def remove_node(ring, node), do: HashRing.remove_node(ring, node) 10 | def key_to_node(ring, key), do: HashRing.key_to_node(ring, key) 11 | end 12 | -------------------------------------------------------------------------------- /lib/swarm/distribution/static_quorum_ring.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Distribution.StaticQuorumRing do 2 | @moduledoc """ 3 | A quorum is the minimum number of nodes that a distributed cluster has to 4 | obtain in order to be allowed to perform an operation. This can be used to 5 | enforce consistent operation in a distributed system. 6 | 7 | ## Quorum size 8 | 9 | You must configure this distribution strategy and specify its minimum quorum 10 | size: 11 | 12 | config :swarm, 13 | distribution_strategy: Swarm.Distribution.StaticQuorumRing, 14 | static_quorum_size: 5 15 | 16 | It defines the minimum number of nodes that must be connected in the cluster 17 | to allow process registration and distribution. 18 | 19 | If there are fewer nodes currently available than the quorum size, any calls 20 | to `Swarm.register_name/5` will return `{:error, :no_node_available}` until 21 | enough nodes have started. 22 | 23 | You can configure the `:kernel` application to wait for cluster formation 24 | before starting your application during node start up. The 25 | `sync_nodes_optional` configuration specifies which nodes to attempt to 26 | connect to within the `sync_nodes_timeout` window, defined in milliseconds, 27 | before continuing with startup. There is also a `sync_nodes_mandatory` setting 28 | which can be used to enforce all nodes are connected within the timeout window 29 | or else the node terminates. 
30 | 31 | config :kernel, 32 | sync_nodes_optional: [:"node1@192.168.1.1", :"node2@192.168.1.2"], 33 | sync_nodes_timeout: 60_000 34 | 35 | The `sync_nodes_timeout` can be configured as `:infinity` to wait indefinitely 36 | for all nodes to connect. All involved nodes must have the same value for 37 | `sync_nodes_timeout`. 38 | 39 | ### Example 40 | 41 | In a 9 node cluster you would configure the `:static_quorum_size` as 5. During 42 | a network split of 4 and 5 nodes, processes on the side with 5 nodes 43 | will continue running, whereas processes on the other 4 nodes will be stopped. 44 | 45 | Be aware that in the running 5 node cluster, no more failures can be handled 46 | because the remaining cluster size would be less than the required 5 node 47 | minimum. All running processes would be stopped in the case of another single 48 | node failure. 49 | """ 50 | 51 | use Swarm.Distribution.Strategy 52 | 53 | alias Swarm.Distribution.StaticQuorumRing 54 | 55 | defstruct [:static_quorum_size, :ring] 56 | 57 | def create do 58 | %StaticQuorumRing{ 59 | static_quorum_size: static_quorum_size(), 60 | ring: HashRing.new() 61 | } 62 | end 63 | 64 | def add_node(quorum, node) do 65 | %StaticQuorumRing{quorum | ring: HashRing.add_node(quorum.ring, node)} 66 | end 67 | 68 | def add_node(quorum, node, weight) do 69 | %StaticQuorumRing{quorum | ring: HashRing.add_node(quorum.ring, node, weight)} 70 | end 71 | 72 | def add_nodes(quorum, nodes) do 73 | %StaticQuorumRing{quorum | ring: HashRing.add_nodes(quorum.ring, nodes)} 74 | end 75 | 76 | def remove_node(quorum, node) do 77 | %StaticQuorumRing{quorum | ring: HashRing.remove_node(quorum.ring, node)} 78 | end 79 | 80 | @doc """ 81 | Maps a key to a specific node via the current distribution strategy. 82 | 83 | If the available nodes in the cluster are fewer than the minimum node count it returns `:undefined`. 84 | """ 85 | def key_to_node(%StaticQuorumRing{static_quorum_size: static_quorum_size, ring: ring}, key) do 86 | case length(ring.nodes) do 87 | node_count when node_count < static_quorum_size -> :undefined 88 | _ -> HashRing.key_to_node(ring, key) 89 | end 90 | end 91 | 92 | defp static_quorum_size() do 93 | Application.get_env(:swarm, :static_quorum_size, 2) 94 | |> static_quorum_size() 95 | end 96 | 97 | defp static_quorum_size(nil), do: static_quorum_size(2) 98 | 99 | defp static_quorum_size(binary) when is_binary(binary) do 100 | binary 101 | |> Integer.parse() 102 | |> convert_to_integer() 103 | |> static_quorum_size() 104 | end 105 | 106 | defp static_quorum_size(size) when is_integer(size) and size > 0, do: size 107 | 108 | defp static_quorum_size(_size), 109 | do: raise("config :static_quorum_size should be a positive integer") 110 | 111 | defp convert_to_integer({integer, _}) when is_integer(integer), do: integer 112 | defp convert_to_integer(other), do: other 113 | end 114 | -------------------------------------------------------------------------------- /lib/swarm/distribution/strategy.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Distribution.Strategy do 2 | @moduledoc """ 3 | This module implements the interface for custom distribution strategies. 4 | The default strategy used by Swarm is a consistent hash ring implemented 5 | via the `libring` library. 6 | 7 | Custom strategies are expected to return a datastructure or pid which will be 8 | passed along to any functions which need to manipulate the current distribution state. 
9 | This can be either a plain datastructure (as is the case with the libring-based strategy), 10 | or a pid which your strategy module then uses to call a process in your own supervision tree. 11 | 12 | For efficiency reasons, it is highly recommended to use plain datastructures rather than a 13 | process for storing the distribution state, because it has the potential to become a bottleneck otherwise, 14 | however this is really up to the needs of your situation, just know that you can go either way. 15 | """ 16 | alias Swarm.Distribution.Ring, as: RingStrategy 17 | 18 | defmacro __using__(_) do 19 | quote do 20 | @behaviour Swarm.Distribution.Strategy 21 | end 22 | end 23 | 24 | @type reason :: String.t() 25 | @type strategy :: term 26 | @type weight :: pos_integer 27 | @type nodelist :: [node() | {node(), weight}] 28 | @type key :: term 29 | 30 | @type t :: strategy 31 | 32 | @callback create() :: strategy | {:error, reason} 33 | @callback add_node(strategy, node) :: strategy | {:error, reason} 34 | @callback add_node(strategy, node, weight) :: strategy | {:error, reason} 35 | @callback add_nodes(strategy, nodelist) :: strategy | {:error, reason} 36 | @callback remove_node(strategy, node) :: strategy | {:error, reason} 37 | @callback key_to_node(strategy, key) :: node() | :undefined 38 | 39 | def create(), do: strategy_module().create() 40 | def create(node), do: strategy_module().add_node(create(), node) 41 | 42 | @doc """ 43 | Adds a node to the state of the current distribution strategy. 44 | """ 45 | def add_node(strategy, node) do 46 | strategy_module().add_node(strategy, node) 47 | end 48 | 49 | @doc """ 50 | Adds a node to the state of the current distribution strategy, 51 | and give it a specific weighting relative to other nodes. 52 | """ 53 | def add_node(strategy, node, weight) do 54 | strategy_module().add_node(strategy, node, weight) 55 | end 56 | 57 | @doc """ 58 | Adds a list of nodes to the state of the current distribution strategy. 59 | The node list can be composed of both node names (atoms) or tuples containing 60 | a node name and a weight for that node. 61 | """ 62 | def add_nodes(strategy, nodes) do 63 | strategy_module().add_nodes(strategy, nodes) 64 | end 65 | 66 | @doc """ 67 | Removes a node from the state of the current distribution strategy. 68 | """ 69 | def remove_node(strategy, node) do 70 | strategy_module().remove_node(strategy, node) 71 | end 72 | 73 | @doc """ 74 | Maps a key to a specific node via the current distribution strategy. 
75 | """ 76 | def key_to_node(strategy, node) do 77 | strategy_module().key_to_node(strategy, node) 78 | end 79 | 80 | defp strategy_module(), do: Application.get_env(:swarm, :distribution_strategy, RingStrategy) 81 | end 82 | -------------------------------------------------------------------------------- /lib/swarm/logger.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Logger do 2 | @moduledoc false 3 | 4 | @doc """ 5 | Formats a log message to include info on which node swarm is running on 6 | """ 7 | @spec format(String.t()) :: String.t() 8 | def format(message), do: "[swarm on #{Node.self()}] #{message}" 9 | end 10 | -------------------------------------------------------------------------------- /lib/swarm/registry.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Registry do 2 | @moduledoc false 3 | import Swarm.Entry 4 | alias Swarm.{Tracker, Entry} 5 | use GenServer 6 | 7 | @table_name :swarm_registry 8 | 9 | ## Public API 10 | 11 | defdelegate register(name, pid), to: Tracker, as: :track 12 | defdelegate register(name, module, fun, args, timeout), to: Tracker, as: :track 13 | 14 | @spec unregister(term) :: :ok 15 | def unregister(name) do 16 | case get_by_name(name) do 17 | :undefined -> :ok 18 | entry(pid: pid) when is_pid(pid) -> Tracker.untrack(pid) 19 | end 20 | end 21 | 22 | @spec whereis(term) :: :undefined | pid 23 | def whereis(name) do 24 | case get_by_name(name) do 25 | :undefined -> 26 | Tracker.whereis(name) 27 | 28 | entry(pid: pid) when is_pid(pid) -> 29 | pid 30 | end 31 | end 32 | 33 | @spec whereis_or_register(term, atom(), atom(), [term]) :: {:ok, pid} | {:error, term} 34 | def whereis_or_register(name, m, f, a, timeout \\ :infinity) 35 | 36 | @spec whereis_or_register(term, atom(), atom(), [term], non_neg_integer() | :infinity) :: 37 | {:ok, pid} | {:error, term} 38 | def whereis_or_register(name, module, fun, args, timeout) do 39 | with :undefined <- whereis(name), 40 | {:ok, pid} <- register(name, module, fun, args, timeout) do 41 | {:ok, pid} 42 | else 43 | pid when is_pid(pid) -> 44 | {:ok, pid} 45 | 46 | {:error, {:already_registered, pid}} -> 47 | {:ok, pid} 48 | 49 | {:error, _} = err -> 50 | err 51 | end 52 | end 53 | 54 | @spec join(term, pid) :: :ok 55 | def join(group, pid), do: Tracker.add_meta(group, true, pid) 56 | 57 | @spec leave(term, pid) :: :ok 58 | defdelegate leave(group, pid), to: Tracker, as: :remove_meta 59 | 60 | @spec members(group :: term) :: [pid] 61 | def members(group) do 62 | :ets.select(@table_name, [ 63 | {entry(name: :"$1", pid: :"$2", ref: :"$3", meta: %{group => :"$4"}, clock: :"$5"), [], 64 | [:"$_"]} 65 | ]) 66 | |> Enum.map(fn entry(pid: pid) -> pid end) 67 | |> Enum.uniq() 68 | end 69 | 70 | @spec registered() :: [{name :: term, pid}] 71 | defdelegate registered(), to: __MODULE__, as: :all 72 | 73 | @spec publish(term, term) :: :ok 74 | def publish(group, msg) do 75 | for pid <- members(group), do: Kernel.send(pid, msg) 76 | :ok 77 | end 78 | 79 | @spec multi_call(term, term, pos_integer) :: [term] 80 | def multi_call(group, msg, timeout \\ 5_000) do 81 | Enum.map(members(group), fn member -> 82 | Task.Supervisor.async_nolink(Swarm.TaskSupervisor, fn -> 83 | GenServer.call(member, msg, timeout) 84 | end) 85 | end) 86 | |> Enum.map(&Task.await(&1, :infinity)) 87 | end 88 | 89 | @spec send(name :: term, msg :: term) :: :ok 90 | def send(name, msg) do 91 | case whereis(name) do 92 | :undefined -> 93 | :ok 94 | 95 | 
pid when is_pid(pid) -> 96 | Kernel.send(pid, msg) 97 | end 98 | end 99 | 100 | ### Low-level ETS manipulation functions 101 | 102 | @spec all() :: [{name :: term(), pid()}] 103 | def all() do 104 | :ets.tab2list(@table_name) 105 | |> Enum.map(fn entry(name: name, pid: pid) -> {name, pid} end) 106 | end 107 | 108 | @spec snapshot() :: [Entry.entry()] 109 | def snapshot() do 110 | :ets.tab2list(@table_name) 111 | end 112 | 113 | @doc """ 114 | Inserts a new registration, and returns true if successful, or false if not 115 | """ 116 | @spec new(Entry.entry()) :: boolean 117 | def new(entry() = reg) do 118 | :ets.insert_new(@table_name, reg) 119 | end 120 | 121 | @doc """ 122 | Like `new/1`, but raises if the insertion fails. 123 | """ 124 | @spec new!(Entry.entry()) :: true | no_return 125 | def new!(entry() = reg) do 126 | true = :ets.insert_new(@table_name, reg) 127 | end 128 | 129 | @spec remove(Entry.entry()) :: true 130 | def remove(entry() = reg) do 131 | :ets.delete_object(@table_name, reg) 132 | end 133 | 134 | @spec remove_by_pid(pid) :: true 135 | def remove_by_pid(pid) when is_pid(pid) do 136 | case get_by_pid(pid) do 137 | :undefined -> 138 | true 139 | 140 | entries when is_list(entries) -> 141 | Enum.each(entries, &:ets.delete_object(@table_name, &1)) 142 | true 143 | end 144 | end 145 | 146 | @spec get_by_name(term()) :: :undefined | Entry.entry() 147 | def get_by_name(name) do 148 | case :ets.lookup(@table_name, name) do 149 | [] -> :undefined 150 | [obj] -> obj 151 | end 152 | end 153 | 154 | @spec get_by_pid(pid) :: :undefined | [Entry.entry()] 155 | def get_by_pid(pid) do 156 | case :ets.match_object( 157 | @table_name, 158 | entry(name: :"$1", pid: pid, ref: :"$2", meta: :"$3", clock: :"$4") 159 | ) do 160 | [] -> :undefined 161 | list when is_list(list) -> list 162 | end 163 | end 164 | 165 | @spec get_by_pid_and_name(pid(), term()) :: :undefined | Entry.entry() 166 | def get_by_pid_and_name(pid, name) do 167 | case :ets.match_object( 168 | @table_name, 169 | entry(name: name, pid: pid, ref: :"$1", meta: :"$2", clock: :"$3") 170 | ) do 171 | [] -> :undefined 172 | [obj] -> obj 173 | end 174 | end 175 | 176 | @spec get_by_ref(reference()) :: :undefined | Entry.entry() 177 | def get_by_ref(ref) do 178 | case :ets.match_object( 179 | @table_name, 180 | entry(name: :"$1", pid: :"$2", ref: ref, meta: :"$3", clock: :"$4") 181 | ) do 182 | [] -> :undefined 183 | [obj] -> obj 184 | end 185 | end 186 | 187 | @spec get_by_meta(term()) :: :undefined | [Entry.entry()] 188 | def get_by_meta(key) do 189 | case :ets.match_object( 190 | @table_name, 191 | entry(name: :"$1", pid: :"$2", ref: :"$3", meta: %{key => :"$4"}, clock: :"$5") 192 | ) do 193 | [] -> :undefined 194 | list when is_list(list) -> list 195 | end 196 | end 197 | 198 | @spec get_by_meta(term(), term()) :: :undefined | [Entry.entry()] 199 | def get_by_meta(key, value) do 200 | case :ets.match_object( 201 | @table_name, 202 | entry(name: :"$1", pid: :"$2", ref: :"$3", meta: %{key => value}, clock: :"$4") 203 | ) do 204 | [] -> :undefined 205 | list when is_list(list) -> list 206 | end 207 | end 208 | 209 | @spec reduce(term(), (Entry.entry(), term() -> term())) :: term() 210 | def reduce(acc, fun) when is_function(fun, 2) do 211 | :ets.foldl(fun, acc, @table_name) 212 | end 213 | 214 | @spec update(term(), Keyword.t()) :: boolean 215 | defmacro update(key, updates) do 216 | fields = Enum.map(updates, fn {k, v} -> {Entry.index(k) + 1, v} end) 217 | 218 | quote bind_quoted: [table_name: @table_name, key: key, fields: 
fields] do 219 | :ets.update_element(table_name, key, fields) 220 | end 221 | end 222 | 223 | ## GenServer Implementation 224 | 225 | def start_link(), do: GenServer.start_link(__MODULE__, [], name: __MODULE__) 226 | 227 | def init(_) do 228 | # start ETS table for registry 229 | t = 230 | :ets.new(@table_name, [ 231 | :set, 232 | :named_table, 233 | :public, 234 | keypos: 2, 235 | read_concurrency: true, 236 | write_concurrency: true 237 | ]) 238 | 239 | {:ok, t} 240 | end 241 | end 242 | -------------------------------------------------------------------------------- /lib/swarm/tracker/crdt.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.IntervalTreeClock do 2 | @moduledoc """ 3 | This is an implementation of an Interval Clock Tree, ported from 4 | the implementation in Erlang written by Paulo Sergio Almeida 5 | found [here](https://github.com/ricardobcl/Interval-Tree-Clocks/blob/master/erlang/itc.erl). 6 | """ 7 | use Bitwise 8 | import Kernel, except: [max: 2, min: 2] 9 | @compile {:inline, [min: 2, max: 2, drop: 2, lift: 2, base: 1, height: 1]} 10 | 11 | @type int_tuple :: {non_neg_integer, non_neg_integer} 12 | @type t :: 13 | int_tuple 14 | | {int_tuple, non_neg_integer} 15 | | {non_neg_integer, int_tuple} 16 | | {int_tuple, int_tuple} 17 | 18 | @doc """ 19 | Creates a new interval tree clock 20 | """ 21 | @spec seed() :: __MODULE__.t() 22 | def seed(), do: {1, 0} 23 | 24 | @doc """ 25 | Joins two forked clocks into a single clock with both causal histories, 26 | used for retiring a replica. 27 | """ 28 | @spec join(__MODULE__.t(), __MODULE__.t()) :: __MODULE__.t() 29 | def join({i1, e1}, {i2, e2}), do: {sum(i1, i2), join_ev(e1, e2)} 30 | 31 | @doc """ 32 | Forks a clock containing a shared causal history, used for creating new replicas. 33 | """ 34 | @spec fork(__MODULE__.t()) :: __MODULE__.t() 35 | def fork({i, e}) do 36 | {i1, i2} = split(i) 37 | {{i1, e}, {i2, e}} 38 | end 39 | 40 | @doc """ 41 | Gets a snapshot of a clock without its identity. Useful for sending the clock with messages, 42 | but cannot be used to track events. 43 | """ 44 | @spec peek(__MODULE__.t()) :: __MODULE__.t() 45 | def peek({_i, e}), do: {0, e} 46 | 47 | @doc """ 48 | Records an event on the given clock 49 | """ 50 | @spec event(__MODULE__.t()) :: __MODULE__.t() 51 | def event({i, e}) do 52 | case fill(i, e) do 53 | ^e -> 54 | {_, e1} = grow(i, e) 55 | {i, e1} 56 | 57 | e1 -> 58 | {i, e1} 59 | end 60 | end 61 | 62 | @doc """ 63 | Determines if the left-hand clock is causally dominated by the right-hand clock. 64 | If the left-hand clock is LEQ than the right-hand clock, and vice-versa, then they are 65 | causally equivalent. 66 | """ 67 | @spec leq(__MODULE__.t(), __MODULE__.t()) :: boolean 68 | def leq({_, e1}, {_, e2}), do: leq_ev(e1, e2) 69 | 70 | @doc """ 71 | Compares two clocks. 
72 | If :eq is returned, the two clocks are causally equivalent 73 | If :lt is returned, the first clock is causally dominated by the second 74 | If :gt is returned, the second clock is causally dominated by the first 75 | If :concurrent is returned, the two clocks are concurrent (conflicting) 76 | """ 77 | @spec compare(__MODULE__.t(), __MODULE__.t()) :: :lt | :gt | :eq | :concurrent 78 | def compare(a, b) do 79 | a_leq = leq(a, b) 80 | b_leq = leq(b, a) 81 | 82 | cond do 83 | a_leq and b_leq -> :eq 84 | a_leq -> :lt 85 | b_leq -> :gt 86 | :else -> :concurrent 87 | end 88 | end 89 | 90 | @doc """ 91 | Encodes the clock as a binary 92 | """ 93 | @spec encode(__MODULE__.t()) :: binary 94 | def encode({i, e}), do: :erlang.term_to_binary({i, e}) 95 | 96 | @doc """ 97 | Decodes the clock from a binary 98 | """ 99 | @spec decode(binary) :: {:ok, __MODULE__.t()} | {:error, {:invalid_clock, term}} 100 | def decode(b) when is_binary(b) do 101 | case :erlang.binary_to_term(b) do 102 | {_i, _e} = clock -> 103 | clock 104 | 105 | other -> 106 | {:error, {:invalid_clock, other}} 107 | end 108 | end 109 | 110 | @doc """ 111 | Returns the length of the encoded binary representation of the clock 112 | """ 113 | @spec len(__MODULE__.t()) :: non_neg_integer 114 | def len(d), do: :erlang.size(encode(d)) 115 | 116 | ## Private API 117 | 118 | defp leq_ev({n1, l1, r1}, {n2, l2, r2}) do 119 | n1 <= n2 and leq_ev(lift(n1, l1), lift(n2, l2)) and leq_ev(lift(n1, r1), lift(n2, r2)) 120 | end 121 | 122 | defp leq_ev({n1, l1, r1}, n2) do 123 | n1 <= n2 and leq_ev(lift(n1, l1), n2) and leq_ev(lift(n1, r1), n2) 124 | end 125 | 126 | defp leq_ev(n1, {n2, _, _}), do: n1 <= n2 127 | defp leq_ev(n1, n2), do: n1 <= n2 128 | 129 | defp norm_id({0, 0}), do: 0 130 | defp norm_id({1, 1}), do: 1 131 | defp norm_id(x), do: x 132 | 133 | defp norm_ev({n, m, m}) when is_integer(m), do: n + m 134 | 135 | defp norm_ev({n, l, r}) do 136 | m = min(base(l), base(r)) 137 | {n + m, drop(m, l), drop(m, r)} 138 | end 139 | 140 | defp sum(0, x), do: x 141 | defp sum(x, 0), do: x 142 | defp sum({l1, r1}, {l2, r2}), do: norm_id({sum(l1, l2), sum(r1, r2)}) 143 | 144 | defp split(0), do: {0, 0} 145 | defp split(1), do: {{1, 0}, {0, 1}} 146 | 147 | defp split({0, i}) do 148 | {i1, i2} = split(i) 149 | {{0, i1}, {0, i2}} 150 | end 151 | 152 | defp split({i, 0}) do 153 | {i1, i2} = split(i) 154 | {{i1, 0}, {i2, 0}} 155 | end 156 | 157 | defp split({i1, i2}), do: {{i1, 0}, {0, i2}} 158 | 159 | defp join_ev({n1, _, _} = e1, {n2, _, _} = e2) when n1 > n2, do: join_ev(e2, e1) 160 | 161 | defp join_ev({n1, l1, r1}, {n2, l2, r2}) when n1 <= n2 do 162 | d = n2 - n1 163 | norm_ev({n1, join_ev(l1, lift(d, l2)), join_ev(r1, lift(d, r2))}) 164 | end 165 | 166 | defp join_ev(n1, {n2, l2, r2}), do: join_ev({n1, 0, 0}, {n2, l2, r2}) 167 | defp join_ev({n1, l1, r1}, n2), do: join_ev({n1, l1, r1}, {n2, 0, 0}) 168 | defp join_ev(n1, n2), do: max(n1, n2) 169 | 170 | defp fill(0, e), do: e 171 | defp fill(1, {_, _, _} = e), do: height(e) 172 | defp fill(_, n) when is_integer(n), do: n 173 | 174 | defp fill({1, r}, {n, el, er}) do 175 | er1 = fill(r, er) 176 | d = max(height(el), base(er1)) 177 | norm_ev({n, d, er1}) 178 | end 179 | 180 | defp fill({l, 1}, {n, el, er}) do 181 | el1 = fill(l, el) 182 | d = max(height(er), base(el1)) 183 | norm_ev({n, el1, d}) 184 | end 185 | 186 | defp fill({l, r}, {n, el, er}) do 187 | norm_ev({n, fill(l, el), fill(r, er)}) 188 | end 189 | 190 | defp grow(1, n) when is_integer(n), do: {0, n + 1} 191 | 192 | defp grow({0, i}, {n, l, 
r}) do 193 | {h, e1} = grow(i, r) 194 | {h + 1, {n, l, e1}} 195 | end 196 | 197 | defp grow({i, 0}, {n, l, r}) do 198 | {h, e1} = grow(i, l) 199 | {h + 1, {n, e1, r}} 200 | end 201 | 202 | defp grow({il, ir}, {n, l, r}) do 203 | {hl, el} = grow(il, l) 204 | {hr, er} = grow(ir, r) 205 | 206 | cond do 207 | hl < hr -> {hl + 1, {n, el, r}} 208 | :else -> {hr + 1, {n, l, er}} 209 | end 210 | end 211 | 212 | defp grow(i, n) when is_integer(n) do 213 | {h, e} = grow(i, {n, 0, 0}) 214 | {h + 100_000, e} 215 | end 216 | 217 | defp height({n, l, r}), do: n + max(height(l), height(r)) 218 | defp height(n), do: n 219 | 220 | defp base({n, _, _}), do: n 221 | defp base(n), do: n 222 | 223 | defp lift(m, {n, l, r}), do: {n + m, l, r} 224 | defp lift(m, n), do: n + m 225 | 226 | defp drop(m, {n, l, r}) when m <= n, do: {n - m, l, r} 227 | defp drop(m, n) when m <= n, do: n - m 228 | 229 | defp max(x, y) when x <= y, do: y 230 | defp max(x, _), do: x 231 | 232 | defp min(x, y) when x <= y, do: x 233 | defp min(_, y), do: y 234 | 235 | def str({i, e}), 236 | do: List.to_string(List.flatten([List.flatten(stri(i)), List.flatten(stre(e))])) 237 | 238 | defp stri(0), do: '0' 239 | defp stri(1), do: '' 240 | defp stri({0, i}), do: 'R' ++ stri(i) 241 | defp stri({i, 0}), do: 'L' ++ stri(i) 242 | defp stri({l, r}), do: ['(L' ++ stri(l), '+', 'R' ++ stri(r), ')'] 243 | 244 | defp stre({n, l, 0}), do: [stre(n), 'L', stre(l)] 245 | defp stre({n, 0, r}), do: [stre(n), 'R', stre(r)] 246 | defp stre({n, l, r}), do: [stre(n), '(L', stre(l), '+R', stre(r), ')'] 247 | defp stre(n) when n > 0, do: :erlang.integer_to_list(n) 248 | defp stre(_), do: '' 249 | end 250 | -------------------------------------------------------------------------------- /lib/swarm/tracker/entry.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Entry do 2 | @moduledoc false 3 | alias Swarm.IntervalTreeClock, as: ITC 4 | 5 | @fields [name: nil, pid: nil, ref: nil, meta: %{}, clock: nil] 6 | 7 | require Record 8 | Record.defrecord(:entry, @fields) 9 | 10 | @type entry :: 11 | record( 12 | :entry, 13 | name: term, 14 | pid: pid, 15 | ref: reference, 16 | meta: nil | map, 17 | clock: nil | ITC.t() 18 | ) 19 | 20 | def index(field) when is_atom(field) do 21 | Record.__access__(:entry, @fields, field, Swarm.Entry) 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/swarm/tracker/tracker.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Tracker do 2 | @moduledoc """ 3 | This module implements the distributed tracker for process registrations and groups. 4 | It is implemented as a finite state machine, via `:gen_statem`. 5 | 6 | Each node Swarm runs on will have a single instance of this process, and the trackers will 7 | replicate data between each other, and/or forward requests to remote trackers as necessary. 
8 | """ 9 | use GenStateMachine, callback_mode: :state_functions 10 | 11 | @sync_nodes_timeout 5_000 12 | @retry_interval 1_000 13 | @retry_max_attempts 10 14 | @default_anti_entropy_interval 5 * 60_000 15 | 16 | import Swarm.Entry 17 | require Logger 18 | require Swarm.Registry 19 | alias Swarm.IntervalTreeClock, as: Clock 20 | alias Swarm.Registry 21 | alias Swarm.Distribution.Strategy 22 | 23 | defmodule Tracking do 24 | @moduledoc false 25 | @type t :: %__MODULE__{ 26 | name: term(), 27 | meta: %{mfa: {m :: atom(), f :: function(), a :: list()}}, 28 | from: {pid, tag :: term} 29 | } 30 | defstruct [:name, :meta, :from] 31 | end 32 | 33 | defmodule TrackerState do 34 | @moduledoc false 35 | @type t :: %__MODULE__{ 36 | clock: nil | Swarm.IntervalTreeClock.t(), 37 | strategy: Strategy.t(), 38 | self: atom(), 39 | sync_node: nil | atom(), 40 | sync_ref: nil | reference(), 41 | pending_sync_reqs: [pid()] 42 | } 43 | defstruct clock: nil, 44 | nodes: [], 45 | strategy: nil, 46 | self: :nonode@nohost, 47 | sync_node: nil, 48 | sync_ref: nil, 49 | pending_sync_reqs: [] 50 | end 51 | 52 | # Public API 53 | 54 | @doc """ 55 | Authoritatively looks up the pid associated with a given name. 56 | """ 57 | def whereis(name), 58 | do: GenStateMachine.call(__MODULE__, {:whereis, name}, :infinity) 59 | 60 | @doc """ 61 | Hand off all the processes running on the given worker to the remaining nodes in the cluster. 62 | This can be used to gracefully shut down a node. 63 | Note that if you don't shut down the node after the handoff a rebalance can lead to processes being scheduled on it again. 64 | In other words the handoff doesn't blacklist the node for further rebalances. 65 | """ 66 | def handoff(worker_name, state), 67 | do: GenStateMachine.call(__MODULE__, {:handoff, worker_name, state}, :infinity) 68 | 69 | @doc """ 70 | Tracks a process (pid) with the given name. 71 | Tracking processes with this function will *not* restart the process when 72 | its parent node goes down, or shift the process to other nodes if the cluster 73 | topology changes. It is strictly for global name registration. 74 | """ 75 | def track(name, pid) when is_pid(pid), 76 | do: GenStateMachine.call(__MODULE__, {:track, name, pid, %{}}, :infinity) 77 | 78 | @doc """ 79 | Tracks a process created via the provided module/function/args with the given name. 80 | The process will be distributed on the cluster based on the implementation of the configured distribution strategy. 81 | If the process' parent node goes down, it will be restarted on the new node which owns its keyspace. 82 | If the cluster topology changes, and the owner of its keyspace changes, it will be shifted to 83 | the new owner, after initiating the handoff process as described in the documentation. 84 | A track call will return an error tagged tuple, `{:error, :no_node_available}`, if there is no node available to start the process. 85 | Provide a timeout value to limit the track call duration. A value of `:infinity` can be used to block indefinitely. 86 | """ 87 | def track(name, m, f, a, timeout) when is_atom(m) and is_atom(f) and is_list(a), 88 | do: GenStateMachine.call(__MODULE__, {:track, name, %{mfa: {m, f, a}}}, timeout) 89 | 90 | @doc """ 91 | Stops tracking the given process (pid). 92 | """ 93 | def untrack(pid) when is_pid(pid), 94 | do: GenStateMachine.call(__MODULE__, {:untrack, pid}, :infinity) 95 | 96 | @doc """ 97 | Adds some metadata to the given process (pid). This is primarily used for tracking group membership. 
98 | """ 99 | def add_meta(key, value, pid) when is_pid(pid), 100 | do: GenStateMachine.call(__MODULE__, {:add_meta, key, value, pid}, :infinity) 101 | 102 | @doc """ 103 | Removes metadata from the given process (pid). 104 | """ 105 | def remove_meta(key, pid) when is_pid(pid), 106 | do: GenStateMachine.call(__MODULE__, {:remove_meta, key, pid}, :infinity) 107 | 108 | ## Process Internals / Internal API 109 | 110 | defmacrop debug(msg) do 111 | {current_state, _arity} = __CALLER__.function 112 | 113 | quote do 114 | Logger.debug(Swarm.Logger.format("[tracker:#{unquote(current_state)}] #{unquote(msg)}")) 115 | end 116 | end 117 | 118 | defmacrop info(msg) do 119 | {current_state, _arity} = __CALLER__.function 120 | 121 | quote do 122 | Logger.info(Swarm.Logger.format("[tracker:#{unquote(current_state)}] #{unquote(msg)}")) 123 | end 124 | end 125 | 126 | defmacrop warn(msg) do 127 | {current_state, _arity} = __CALLER__.function 128 | 129 | quote do 130 | Logger.warn(Swarm.Logger.format("[tracker:#{unquote(current_state)}] #{unquote(msg)}")) 131 | end 132 | end 133 | 134 | defmacrop error(msg) do 135 | {current_state, _arity} = __CALLER__.function 136 | 137 | quote do 138 | Logger.error(Swarm.Logger.format("[tracker:#{unquote(current_state)}] #{unquote(msg)}")) 139 | end 140 | end 141 | 142 | def start_link() do 143 | GenStateMachine.start_link(__MODULE__, [], name: __MODULE__) 144 | end 145 | 146 | def init(_) do 147 | # Trap exits 148 | Process.flag(:trap_exit, true) 149 | # If this node is ignored, then make sure we ignore everyone else 150 | # to prevent accidentally interfering with the cluster 151 | if ignore_node?(Node.self()) do 152 | Application.put_env(:swarm, :node_blacklist, [~r/^.+$/]) 153 | end 154 | 155 | # Start monitoring nodes 156 | :ok = :net_kernel.monitor_nodes(true, node_type: :all) 157 | info("started") 158 | nodelist = Enum.reject(Node.list(:connected), &ignore_node?/1) 159 | 160 | strategy = 161 | Node.self() 162 | |> Strategy.create() 163 | |> Strategy.add_nodes(nodelist) 164 | 165 | if Application.get_env(:swarm, :debug, false) do 166 | _ = Task.start(fn -> :sys.trace(Swarm.Tracker, true) end) 167 | end 168 | 169 | timeout = Application.get_env(:swarm, :sync_nodes_timeout, @sync_nodes_timeout) 170 | Process.send_after(self(), :cluster_join, timeout) 171 | 172 | state = %TrackerState{clock: Clock.seed(), nodes: nodelist, strategy: strategy, self: node()} 173 | 174 | {:ok, :cluster_wait, state} 175 | end 176 | 177 | def cluster_wait(:info, {:nodeup, node, _}, %TrackerState{} = state) do 178 | new_state = 179 | case nodeup(state, node) do 180 | {:ok, new_state} -> new_state 181 | {:ok, new_state, _next_state} -> new_state 182 | end 183 | 184 | {:keep_state, new_state} 185 | end 186 | 187 | def cluster_wait(:info, {:nodedown, node, _}, %TrackerState{} = state) do 188 | new_state = 189 | case nodedown(state, node) do 190 | {:ok, new_state} -> new_state 191 | {:ok, new_state, _next_state} -> new_state 192 | end 193 | 194 | {:keep_state, new_state} 195 | end 196 | 197 | def cluster_wait(:info, :cluster_join, %TrackerState{nodes: []} = state) do 198 | info("joining cluster..") 199 | info("no connected nodes, proceeding without sync") 200 | interval = Application.get_env(:swarm, :anti_entropy_interval, @default_anti_entropy_interval) 201 | Process.send_after(self(), :anti_entropy, interval) 202 | {:next_state, :tracking, %{state | clock: Clock.seed()}} 203 | end 204 | 205 | def cluster_wait(:info, :cluster_join, %TrackerState{nodes: nodes} = state) do 206 | info("joining 
cluster..") 207 | info("found connected nodes: #{inspect(nodes)}") 208 | # Connect to a random node and sync registries, 209 | # start anti-entropy, and start loop with forked clock of 210 | # remote node 211 | sync_node = Enum.random(nodes) 212 | info("selected sync node: #{sync_node}") 213 | # Send sync request 214 | ref = Process.monitor({__MODULE__, sync_node}) 215 | GenStateMachine.cast({__MODULE__, sync_node}, {:sync, self(), state.clock}) 216 | {:next_state, :syncing, %{state | sync_node: sync_node, sync_ref: ref}} 217 | end 218 | 219 | def cluster_wait(:cast, {:sync, from, rclock}, %TrackerState{nodes: [from_node]} = state) 220 | when node(from) == from_node do 221 | info("joining cluster..") 222 | sync_node = node(from) 223 | info("syncing with #{sync_node}") 224 | ref = Process.monitor({__MODULE__, sync_node}) 225 | {lclock, rclock} = Clock.fork(rclock) 226 | debug("forking clock: #{inspect state.clock}, lclock: #{inspect lclock}, rclock: #{inspect rclock}") 227 | GenStateMachine.cast(from, {:sync_recv, self(), rclock, get_registry_snapshot()}) 228 | 229 | {:next_state, :awaiting_sync_ack, 230 | %{state | clock: lclock, sync_node: sync_node, sync_ref: ref}} 231 | end 232 | 233 | def cluster_wait(:cast, {:sync, from, _rclock}, %TrackerState{} = state) do 234 | if ignore_node?(node(from)) do 235 | GenStateMachine.cast(from, {:sync_err, :node_ignored}) 236 | :keep_state_and_data 237 | else 238 | info("pending sync request from #{node(from)}") 239 | {:keep_state, %{state | pending_sync_reqs: [from | state.pending_sync_reqs]}} 240 | end 241 | end 242 | 243 | def cluster_wait(_event_type, _event_data, _state) do 244 | {:keep_state_and_data, :postpone} 245 | end 246 | 247 | def syncing(:info, {:nodeup, node, _}, %TrackerState{} = state) do 248 | new_state = 249 | case nodeup(state, node) do 250 | {:ok, new_state} -> new_state 251 | {:ok, new_state, _next_state} -> new_state 252 | end 253 | 254 | {:keep_state, new_state} 255 | end 256 | 257 | def syncing( 258 | :info, 259 | {:DOWN, ref, _type, _pid, _info}, 260 | %TrackerState{clock: clock, sync_ref: ref} = state 261 | ) do 262 | info("the remote tracker we're syncing with has crashed, selecting a new one") 263 | 264 | case state.nodes -- [state.sync_node] do 265 | [] -> 266 | info("no other available nodes, cancelling sync") 267 | new_state = %{state | sync_node: nil, sync_ref: nil} 268 | {:next_state, :tracking, new_state} 269 | 270 | new_nodes -> 271 | new_sync_node = Enum.random(new_nodes) 272 | info("selected sync node: #{new_sync_node}") 273 | # Send sync request 274 | ref = Process.monitor({__MODULE__, new_sync_node}) 275 | GenStateMachine.cast({__MODULE__, new_sync_node}, {:sync, self(), clock}) 276 | new_state = %{state | sync_node: new_sync_node, sync_ref: ref} 277 | {:keep_state, new_state} 278 | end 279 | end 280 | 281 | def syncing( 282 | :info, 283 | {:nodedown, node, _}, 284 | %TrackerState{strategy: strategy, clock: clock, nodes: nodes, sync_node: node} = state 285 | ) do 286 | info("the selected sync node #{node} went down, selecting new node") 287 | Process.demonitor(state.sync_ref, [:flush]) 288 | 289 | case nodes -- [node] do 290 | [] -> 291 | # there are no other nodes to select, nothing to do 292 | info("no other available nodes, cancelling sync") 293 | 294 | new_state = %{ 295 | state 296 | | nodes: [], 297 | strategy: Strategy.remove_node(strategy, node), 298 | sync_node: nil, 299 | sync_ref: nil 300 | } 301 | 302 | {:next_state, :tracking, new_state} 303 | 304 | new_nodes -> 305 | new_sync_node = 
Enum.random(new_nodes) 306 | info("selected sync node: #{new_sync_node}") 307 | # Send sync request 308 | ref = Process.monitor({__MODULE__, new_sync_node}) 309 | GenStateMachine.cast({__MODULE__, new_sync_node}, {:sync, self(), clock}) 310 | 311 | new_state = %{ 312 | state 313 | | nodes: new_nodes, 314 | strategy: Strategy.remove_node(strategy, node), 315 | sync_node: new_sync_node, 316 | sync_ref: ref 317 | } 318 | 319 | {:keep_state, new_state} 320 | end 321 | end 322 | 323 | def syncing(:info, {:nodedown, node, _}, %TrackerState{} = state) do 324 | new_state = 325 | case nodedown(state, node) do 326 | {:ok, new_state} -> new_state 327 | {:ok, new_state, _next_state} -> new_state 328 | end 329 | 330 | {:keep_state, new_state} 331 | end 332 | 333 | # Successful anti-entropy sync 334 | def syncing( 335 | :cast, 336 | {:sync_recv, from, sync_clock, registry}, 337 | %TrackerState{sync_node: sync_node} = state 338 | ) 339 | when node(from) == sync_node do 340 | info("received registry from #{sync_node}, merging..") 341 | new_state = sync_registry(from, sync_clock, registry, state) 342 | # let remote node know we've got the registry 343 | GenStateMachine.cast(from, {:sync_ack, self(), sync_clock, get_registry_snapshot()}) 344 | info("local synchronization with #{sync_node} complete!") 345 | resolve_pending_sync_requests(%{new_state | clock: sync_clock}) 346 | end 347 | 348 | def syncing(:cast, {:sync_err, from}, %TrackerState{nodes: nodes, sync_node: sync_node} = state) 349 | when node(from) == sync_node do 350 | Process.demonitor(state.sync_ref, [:flush]) 351 | 352 | cond do 353 | # Something weird happened during sync, so try a different node, 354 | # with this implementation, we *could* end up selecting the same node 355 | # again, but that's fine as this is effectively a retry 356 | length(nodes) > 0 -> 357 | warn("a problem occurred during sync, choosing a new node to sync with") 358 | # we need to choose a different node to sync with and try again 359 | new_sync_node = Enum.random(nodes) 360 | ref = Process.monitor({__MODULE__, new_sync_node}) 361 | GenStateMachine.cast({__MODULE__, new_sync_node}, {:sync, self(), state.clock}) 362 | {:keep_state, %{state | sync_node: new_sync_node, sync_ref: ref}} 363 | 364 | # Something went wrong during sync, but there are no other nodes to sync with, 365 | # not even the original sync node (which probably implies it shutdown or crashed), 366 | # so we're the sync node now 367 | :else -> 368 | warn( 369 | "a problem occurred during sync, but no other available sync targets, becoming seed node" 370 | ) 371 | 372 | {:next_state, :tracking, %{state | pending_sync_reqs: [], sync_node: nil, sync_ref: nil}} 373 | end 374 | end 375 | 376 | def syncing(:cast, {:sync, from, rclock}, %TrackerState{sync_node: sync_node} = state) 377 | when node(from) == sync_node do 378 | # We're trying to sync with another node while it is trying to sync with us, deterministically 379 | # choose the node which will coordinate the synchronization. 
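    # Tie-break rules implemented below:
    #   * remote clock dominates ours (:lt) -> the remote tracker drives the sync
    #   * local clock dominates (:gt)       -> we fork our clock and drive the sync
    #   * :eq or :concurrent                -> the node with the greater name drives
    #     the sync, so both sides agree on a single coordinator without extra messages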
380 | local_node = Node.self() 381 | 382 | case Clock.compare(state.clock, rclock) do 383 | :lt -> 384 | # The local clock is dominated by the remote clock, so the remote node will begin the sync 385 | info("syncing from #{sync_node} based on tracker clock") 386 | :keep_state_and_data 387 | 388 | :gt -> 389 | # The local clock dominates the remote clock, so the local node will begin the sync 390 | info("syncing to #{sync_node} based on tracker clock") 391 | {lclock, rclock} = Clock.fork(state.clock) 392 | debug("forking clock when local: #{inspect state.clock}, lclock: #{inspect lclock}, rclock: #{inspect rclock}") 393 | GenStateMachine.cast(from, {:sync_recv, self(), rclock, get_registry_snapshot()}) 394 | {:next_state, :awaiting_sync_ack, %{state | clock: lclock}} 395 | 396 | result when result in [:eq, :concurrent] and sync_node > local_node -> 397 | # The remote node will begin the sync 398 | info("syncing from #{sync_node} based on node precedence") 399 | :keep_state_and_data 400 | 401 | result when result in [:eq, :concurrent] -> 402 | # The local node begins the sync 403 | info("syncing to #{sync_node} based on node precedence") 404 | {lclock, rclock} = Clock.fork(state.clock) 405 | debug("forking clock when concurrent: #{inspect state.clock}, lclock: #{inspect lclock}, rclock: #{inspect rclock}") 406 | GenStateMachine.cast(from, {:sync_recv, self(), rclock, get_registry_snapshot()}) 407 | {:next_state, :awaiting_sync_ack, %{state | clock: lclock}} 408 | end 409 | end 410 | 411 | def syncing(:cast, {:sync, from, _rclock}, %TrackerState{} = state) do 412 | if ignore_node?(node(from)) do 413 | GenStateMachine.cast(from, {:sync_err, :node_ignored}) 414 | :keep_state_and_data 415 | else 416 | info("pending sync request from #{node(from)}") 417 | new_pending_reqs = Enum.uniq([from | state.pending_sync_reqs]) 418 | {:keep_state, %{state | pending_sync_reqs: new_pending_reqs}} 419 | end 420 | end 421 | 422 | def syncing(_event_type, _event_data, _state) do 423 | {:keep_state_and_data, :postpone} 424 | end 425 | 426 | defp sync_registry(from, _sync_clock, registry, %TrackerState{} = state) when is_pid(from) do 427 | sync_node = node(from) 428 | # map over the registry and check that all local entries are correct 429 | Enum.each(registry, fn entry(name: rname, pid: rpid, meta: rmeta, clock: rclock) = rreg -> 430 | case Registry.get_by_name(rname) do 431 | :undefined -> 432 | # missing local registration 433 | debug("local tracker is missing #{inspect(rname)}, adding to registry") 434 | ref = Process.monitor(rpid) 435 | lclock = Clock.join(state.clock, rclock) 436 | Registry.new!(entry(name: rname, pid: rpid, ref: ref, meta: rmeta, clock: lclock)) 437 | 438 | entry(pid: ^rpid, meta: lmeta, clock: lclock) -> 439 | case Clock.compare(lclock, rclock) do 440 | :lt -> 441 | # the remote clock dominates, take remote data 442 | lclock = Clock.join(lclock, rclock) 443 | Registry.update(rname, meta: rmeta, clock: lclock) 444 | 445 | debug( 446 | "sync metadata for #{inspect(rpid)} (#{inspect(rmeta)}) is causally dominated by remote, updated registry..." 447 | ) 448 | 449 | :gt -> 450 | # the local clock dominates, keep local data 451 | debug( 452 | "sync metadata for #{inspect(rpid)} (#{inspect(rmeta)}) is causally dominated by local, ignoring..." 453 | ) 454 | 455 | :ok 456 | 457 | :eq -> 458 | # the clocks are the same, no-op 459 | debug( 460 | "sync metadata for #{inspect(rpid)} (#{inspect(rmeta)}) has equal clocks, ignoring..." 
461 | ) 462 | 463 | :ok 464 | 465 | :concurrent -> 466 | warn("local and remote metadata for #{inspect(rname)} was concurrently modified") 467 | new_meta = Map.merge(lmeta, rmeta) 468 | 469 | # we're going to join and bump our local clock though and re-broadcast the update to ensure we converge 470 | lclock = Clock.join(lclock, rclock) 471 | lclock = Clock.event(lclock) 472 | Registry.update(rname, meta: new_meta, clock: lclock) 473 | broadcast_event(state.nodes, lclock, {:update_meta, new_meta, rpid}) 474 | end 475 | 476 | entry(pid: lpid, clock: lclock) = lreg -> 477 | # there are two different processes for the same name, we need to resolve 478 | case Clock.compare(lclock, rclock) do 479 | :lt -> 480 | # the remote registration dominates 481 | resolve_incorrect_local_reg(sync_node, lreg, rreg, state) 482 | 483 | :gt -> 484 | # local registration dominates 485 | debug("remote view of #{inspect(rname)} is outdated, resolving..") 486 | resolve_incorrect_remote_reg(sync_node, lreg, rreg, state) 487 | 488 | _ -> 489 | # the entry clocks conflict, determine which one is correct based on 490 | # current topology and resolve the conflict 491 | rpid_node = node(rpid) 492 | lpid_node = node(lpid) 493 | 494 | case Strategy.key_to_node(state.strategy, rname) do 495 | ^rpid_node when lpid_node != rpid_node -> 496 | debug( 497 | "remote and local view of #{inspect(rname)} conflict, but remote is correct, resolving.." 498 | ) 499 | 500 | resolve_incorrect_local_reg(sync_node, lreg, rreg, state) 501 | 502 | ^lpid_node when lpid_node != rpid_node -> 503 | debug( 504 | "remote and local view of #{inspect(rname)} conflict, but local is correct, resolving.." 505 | ) 506 | 507 | resolve_incorrect_remote_reg(sync_node, lreg, rreg, state) 508 | 509 | _ -> 510 | cond do 511 | lpid_node == rpid_node and lpid > rpid -> 512 | debug( 513 | "remote and local view of #{inspect(rname)} conflict, but local is more recent, resolving.." 514 | ) 515 | 516 | resolve_incorrect_remote_reg(sync_node, lreg, rreg, state) 517 | 518 | lpid_node == rpid_node and lpid < rpid -> 519 | debug( 520 | "remote and local view of #{inspect(rname)} conflict, but remote is more recent, resolving.." 521 | ) 522 | 523 | resolve_incorrect_local_reg(sync_node, lreg, rreg, state) 524 | 525 | :else -> 526 | # name should be on another node, so neither registration is correct 527 | debug( 528 | "remote and local view of #{inspect(rname)} are both outdated, resolving.." 
529 | ) 530 | 531 | resolve_incorrect_local_reg(sync_node, lreg, rreg, state) 532 | end 533 | end 534 | end 535 | end 536 | end) 537 | 538 | state 539 | end 540 | 541 | defp resolve_pending_sync_requests(%TrackerState{pending_sync_reqs: []} = state) do 542 | info("pending sync requests cleared") 543 | 544 | case state.sync_ref do 545 | nil -> :ok 546 | ref -> Process.demonitor(ref, [:flush]) 547 | end 548 | 549 | {:next_state, :tracking, %{state | sync_node: nil, sync_ref: nil}} 550 | end 551 | 552 | defp resolve_pending_sync_requests( 553 | %TrackerState{sync_node: sync_node, pending_sync_reqs: [pid | pending]} = state 554 | ) 555 | when sync_node == node(pid) do 556 | info("discarding sync_node from pending_sync_reqs") 557 | 558 | resolve_pending_sync_requests(%{state | pending_sync_reqs: pending}) 559 | end 560 | 561 | defp resolve_pending_sync_requests(%TrackerState{pending_sync_reqs: [pid | pending]} = state) do 562 | pending_node = node(pid) 563 | # Remove monitoring of the previous sync node 564 | case state.sync_ref do 565 | nil -> :ok 566 | ref -> Process.demonitor(ref, [:flush]) 567 | end 568 | 569 | cond do 570 | Enum.member?(state.nodes, pending_node) -> 571 | info("clearing pending sync request for #{pending_node}") 572 | {lclock, rclock} = Clock.fork(state.clock) 573 | debug("forking clock when resolving: #{inspect state.clock}, lclock: #{inspect lclock}, rclock: #{inspect rclock}") 574 | ref = Process.monitor(pid) 575 | GenStateMachine.cast(pid, {:sync_recv, self(), rclock, get_registry_snapshot()}) 576 | 577 | new_state = %{ 578 | state 579 | | sync_node: node(pid), 580 | sync_ref: ref, 581 | pending_sync_reqs: pending, 582 | clock: lclock 583 | } 584 | 585 | {:next_state, :awaiting_sync_ack, new_state} 586 | 587 | :else -> 588 | resolve_pending_sync_requests(%{ 589 | state 590 | | sync_node: nil, 591 | sync_ref: nil, 592 | pending_sync_reqs: pending 593 | }) 594 | end 595 | end 596 | 597 | def awaiting_sync_ack( 598 | :cast, 599 | {:sync_ack, from, sync_clock, registry}, 600 | %TrackerState{sync_node: sync_node} = state 601 | ) 602 | when sync_node == node(from) do 603 | info("received sync acknowledgement from #{node(from)}, syncing with remote registry") 604 | new_state = sync_registry(from, sync_clock, registry, state) 605 | info("local synchronization with #{node(from)} complete!") 606 | resolve_pending_sync_requests(new_state) 607 | end 608 | 609 | def awaiting_sync_ack( 610 | :info, 611 | {:DOWN, ref, _type, _pid, _info}, 612 | %TrackerState{sync_ref: ref} = state 613 | ) do 614 | warn("wait for acknowledgement from #{state.sync_node} cancelled, tracker down") 615 | resolve_pending_sync_requests(%{state | sync_node: nil, sync_ref: nil}) 616 | end 617 | 618 | def awaiting_sync_ack(:info, {:nodeup, node, _}, %TrackerState{} = state) do 619 | new_state = 620 | case nodeup(state, node) do 621 | {:ok, new_state} -> new_state 622 | {:ok, new_state, _next_state} -> new_state 623 | end 624 | 625 | {:keep_state, new_state} 626 | end 627 | 628 | def awaiting_sync_ack(:info, {:nodedown, node, _}, %TrackerState{sync_node: node} = state) do 629 | new_state = 630 | case nodedown(state, node) do 631 | {:ok, new_state} -> new_state 632 | {:ok, new_state, _next_state} -> new_state 633 | end 634 | 635 | Process.demonitor(state.sync_ref, [:flush]) 636 | resolve_pending_sync_requests(%{new_state | sync_node: nil, sync_ref: nil}) 637 | end 638 | 639 | def awaiting_sync_ack(:info, {:nodedown, node, _}, %TrackerState{} = state) do 640 | new_state = 641 | case nodedown(state, node) do 642 | 
{:ok, new_state} -> new_state 643 | {:ok, new_state, _next_state} -> new_state 644 | end 645 | 646 | {:keep_state, new_state} 647 | end 648 | 649 | def awaiting_sync_ack(_event_type, _event_data, _state) do 650 | {:keep_state_and_data, :postpone} 651 | end 652 | 653 | def tracking(:info, {:EXIT, _child, _reason}, _state) do 654 | # A child process started by this tracker has crashed 655 | :keep_state_and_data 656 | end 657 | 658 | def tracking(:info, {:nodeup, node, _}, %TrackerState{nodes: []} = state) do 659 | if ignore_node?(node) do 660 | :keep_state_and_data 661 | else 662 | # This case occurs when the tracker comes up without being connected to a cluster 663 | # and a cluster forms after some period of time. In this case, we need to treat this 664 | # like a cluster_wait -> cluster_join scenario, so that we sync with cluster and ensure 665 | # any registrations on the remote node are in the local registry and vice versa 666 | new_state = 667 | case nodeup(state, node) do 668 | {:ok, new_state} -> new_state 669 | {:ok, new_state, _next_state} -> new_state 670 | end 671 | 672 | cluster_wait(:info, :cluster_join, new_state) 673 | end 674 | end 675 | 676 | def tracking(:info, {:nodeup, node, _}, state) do 677 | state 678 | |> nodeup(node) 679 | |> handle_node_status() 680 | end 681 | 682 | def tracking(:info, {:nodedown, node, _}, state) do 683 | state 684 | |> nodedown(node) 685 | |> handle_node_status() 686 | end 687 | 688 | def tracking(:info, {:ensure_swarm_started_on_remote_node, node, attempts}, state) do 689 | state 690 | |> ensure_swarm_started_on_remote_node(node, attempts) 691 | |> handle_node_status() 692 | end 693 | 694 | def tracking(:info, :anti_entropy, state) do 695 | anti_entropy(state) 696 | end 697 | 698 | # A change event received from another replica/node 699 | def tracking(:cast, {:event, from, rclock, event}, state) do 700 | handle_replica_event(from, event, rclock, state) 701 | end 702 | 703 | # Received a handoff request from a node 704 | def tracking(:cast, {:handoff, from, {name, meta, handoff_state, rclock}}, state) do 705 | handle_handoff(from, {name, meta, handoff_state, rclock}, state) 706 | end 707 | 708 | # A remote registration failed due to nodedown during the call 709 | def tracking(:cast, {:retry, from, {:track, name, m, f, a}}, state) do 710 | handle_retry(from, {:track, name, %{mfa: {m, f, a}}}, state) 711 | end 712 | 713 | # A change event received locally 714 | def tracking({:call, from}, msg, state) do 715 | handle_call(msg, from, state) 716 | end 717 | 718 | def tracking(:cast, msg, state) do 719 | handle_cast(msg, state) 720 | end 721 | 722 | # A tracked process has gone down 723 | def tracking(:info, {:DOWN, ref, _type, pid, info}, state) do 724 | handle_monitor(ref, pid, info, state) 725 | end 726 | 727 | def tracking(event_type, event_data, state) do 728 | handle_event(event_type, event_data, state) 729 | end 730 | 731 | # This state helps us ensure that nodes proactively keep themselves synced 732 | # after joining the cluster and initial syncrhonization. 
This way if replication 733 | # events fail for some reason, we can control the drift in registry state 734 | def anti_entropy(%TrackerState{nodes: []}) do 735 | interval = Application.get_env(:swarm, :anti_entropy_interval, @default_anti_entropy_interval) 736 | Process.send_after(self(), :anti_entropy, interval) 737 | :keep_state_and_data 738 | end 739 | 740 | def anti_entropy(%TrackerState{nodes: nodes} = state) do 741 | sync_node = Enum.random(nodes) 742 | info("syncing with #{sync_node}") 743 | ref = Process.monitor({__MODULE__, sync_node}) 744 | GenStateMachine.cast({__MODULE__, sync_node}, {:sync, self(), state.clock}) 745 | new_state = %{state | sync_node: sync_node, sync_ref: ref} 746 | interval = Application.get_env(:swarm, :anti_entropy_interval, @default_anti_entropy_interval) 747 | Process.send_after(self(), :anti_entropy, interval) 748 | {:next_state, :syncing, new_state} 749 | end 750 | 751 | # This message is sent as a broadcast message for replication 752 | def handle_event(:info, {:event, from, rclock, event}, state) do 753 | handle_replica_event(from, event, rclock, state) 754 | end 755 | 756 | # If we receive cluster_join outside of cluster_wait it's because 757 | # we implicitly joined the cluster due to a sync event, we know if 758 | # we receive such an event the cluster is already formed due to how 759 | # Erlang distribution works (it's a mesh) 760 | def handle_event(:info, :cluster_join, _state) do 761 | :keep_state_and_data 762 | end 763 | 764 | def handle_event({:call, from}, msg, state) do 765 | handle_call(msg, from, state) 766 | end 767 | 768 | def handle_event(:cast, msg, state) do 769 | handle_cast(msg, state) 770 | end 771 | 772 | # Default event handler 773 | def handle_event(event_type, event_data, _state) do 774 | debug("unexpected event: #{inspect({event_type, event_data})}") 775 | :keep_state_and_data 776 | end 777 | 778 | def code_change(_oldvsn, state, data, _extra) do 779 | {:ok, state, data} 780 | end 781 | 782 | defp handle_node_status({:ok, new_state}), do: {:keep_state, new_state} 783 | 784 | defp handle_node_status({:ok, new_state, {:topology_change, change_info}}) do 785 | handle_topology_change(change_info, new_state) 786 | end 787 | 788 | # This is the callback for when a process is being handed off from a remote node to this node. 
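  # Three cases are handled below:
  #   * no local registration yet  -> restart the process from its :mfa metadata and
  #     deliver {:swarm, :end_handoff, handoff_state} to the new pid
  #   * a local pid already exists -> deliver {:swarm, :resolve_conflict, handoff_state}
  #     and let the process reconcile (typically after a netsplit heals)
  #   * the local entry still points at the sending node -> drop that stale entry and
  #     re-run the handoff as if there were no local registration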
789 | defp handle_handoff( 790 | from, 791 | {name, meta, handoff_state, rclock}, 792 | %TrackerState{clock: clock} = state 793 | ) do 794 | try do 795 | # If a network split is being healed, we almost certainly will have a 796 | # local registration already for this name (since it was present on this side of the split) 797 | # If not, we'll restart it, but if so, we'll send the handoff state to the old process and 798 | # let it determine how to resolve the conflict 799 | current_node = Node.self() 800 | 801 | case Registry.get_by_name(name) do 802 | :undefined -> 803 | {{m, f, a}, _other_meta} = Map.pop(meta, :mfa) 804 | {:ok, pid} = apply(m, f, a) 805 | GenServer.cast(pid, {:swarm, :end_handoff, handoff_state}) 806 | ref = Process.monitor(pid) 807 | lclock = Clock.join(clock, rclock) 808 | Registry.new!(entry(name: name, pid: pid, ref: ref, meta: meta, clock: lclock)) 809 | broadcast_event(state.nodes, lclock, {:track, name, pid, meta}) 810 | {:keep_state, state} 811 | 812 | entry(pid: pid) when node(pid) == current_node -> 813 | GenServer.cast(pid, {:swarm, :resolve_conflict, handoff_state}) 814 | lclock = Clock.join(clock, rclock) 815 | broadcast_event(state.nodes, lclock, {:track, name, pid, meta}) 816 | {:keep_state, state} 817 | 818 | entry(pid: pid, ref: ref) = obj when node(pid) == node(from) -> 819 | # We have received the handoff before we've received the untrack event, but because 820 | # the handoff is coming from the node where the registration existed, we can safely 821 | # remove the registration now, and proceed with the handoff 822 | Process.demonitor(ref, [:flush]) 823 | Registry.remove(obj) 824 | # Re-enter this callback to take advantage of the first clause 825 | handle_handoff(from, {name, meta, handoff_state, rclock}, state) 826 | end 827 | catch 828 | kind, err -> 829 | error(Exception.format(kind, err, System.stacktrace())) 830 | :keep_state_and_data 831 | end 832 | end 833 | 834 | # This is the callback for when a nodeup/down event occurs after the tracker has entered 835 | # the main receive loop. Topology changes are handled a bit differently during startup. 
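  # For every locally-owned registration carrying an :mfa, the distribution strategy
  # is re-evaluated: when the name now belongs to another node the process is asked
  # {:swarm, :begin_handoff} and, depending on its reply, is ignored, resumed on the
  # new owner, or restarted there. Registrations whose host node disappeared are
  # restarted locally when this node is the new owner, left to the owning node
  # otherwise, or dropped when no node is available.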
836 | defp handle_topology_change({type, remote_node}, %TrackerState{} = state) do 837 | debug("topology change (#{type} for #{remote_node})") 838 | current_node = state.self 839 | 840 | new_state = 841 | Registry.reduce(state, fn 842 | entry(name: name, pid: pid, meta: %{mfa: _mfa} = meta) = obj, state 843 | when node(pid) == current_node -> 844 | case Strategy.key_to_node(state.strategy, name) do 845 | :undefined -> 846 | # No node available to host process, it must be stopped 847 | debug("#{inspect(pid)} must be stopped as no node is available to host it") 848 | {:ok, new_state} = remove_registration(obj, state) 849 | send(pid, {:swarm, :die}) 850 | new_state 851 | 852 | ^current_node -> 853 | # This process is correct 854 | state 855 | 856 | other_node -> 857 | debug("#{inspect(pid)} belongs on #{other_node}") 858 | # This process needs to be moved to the new node 859 | try do 860 | case GenServer.call(pid, {:swarm, :begin_handoff}) do 861 | :ignore -> 862 | debug("#{inspect(name)} has requested to be ignored") 863 | state 864 | 865 | {:resume, handoff_state} -> 866 | debug("#{inspect(name)} has requested to be resumed") 867 | {:ok, new_state} = remove_registration(obj, state) 868 | send(pid, {:swarm, :die}) 869 | debug("sending handoff for #{inspect(name)} to #{other_node}") 870 | 871 | GenStateMachine.cast( 872 | {__MODULE__, other_node}, 873 | {:handoff, self(), {name, meta, handoff_state, Clock.peek(new_state.clock)}} 874 | ) 875 | 876 | new_state 877 | 878 | :restart -> 879 | debug("#{inspect(name)} has requested to be restarted") 880 | {:ok, new_state} = remove_registration(obj, state) 881 | send(pid, {:swarm, :die}) 882 | 883 | case do_track(%Tracking{name: name, meta: meta}, new_state) do 884 | :keep_state_and_data -> new_state 885 | {:keep_state, new_state} -> new_state 886 | end 887 | end 888 | catch 889 | _, err -> 890 | warn("handoff failed for #{inspect(name)}: #{inspect(err)}") 891 | state 892 | end 893 | end 894 | 895 | entry(name: name, pid: pid, meta: %{mfa: _mfa} = meta) = obj, state when is_map(meta) -> 896 | cond do 897 | Enum.member?(state.nodes, node(pid)) -> 898 | # the parent node is still up 899 | state 900 | 901 | :else -> 902 | # pid is dead, we're going to restart it 903 | case Strategy.key_to_node(state.strategy, name) do 904 | :undefined -> 905 | # No node available to restart process on, so remove registration 906 | warn("no node available to restart #{inspect(name)}") 907 | {:ok, new_state} = remove_registration(obj, state) 908 | new_state 909 | 910 | ^current_node -> 911 | debug("restarting #{inspect(name)} on #{current_node}") 912 | {:ok, new_state} = remove_registration(obj, state) 913 | 914 | case do_track(%Tracking{name: name, meta: meta}, new_state) do 915 | :keep_state_and_data -> new_state 916 | {:keep_state, new_state} -> new_state 917 | end 918 | 919 | _other_node -> 920 | # other_node will tell us to unregister/register the restarted pid 921 | state 922 | end 923 | end 924 | 925 | entry(name: name, pid: pid) = obj, state -> 926 | pid_node = node(pid) 927 | 928 | cond do 929 | pid_node == current_node or Enum.member?(state.nodes, pid_node) -> 930 | # the parent node is still up 931 | state 932 | 933 | :else -> 934 | # the parent node is down, but we cannot restart this pid, so unregister it 935 | debug("removing registration for #{inspect(name)}, #{pid_node} is down") 936 | {:ok, new_state} = remove_registration(obj, state) 937 | new_state 938 | end 939 | end) 940 | 941 | info("topology change complete") 942 | {:keep_state, new_state} 943 | end 
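  # Purely illustrative sketch (not part of Swarm): a minimal worker showing the
  # messages a tracked process is expected to handle during the handoff flow driven
  # by handle_topology_change/2 above. The module name and its state shape are
  # assumptions made only for this example; in a real project it would live in your
  # own application, not in Swarm itself.
  defmodule ExampleHandoffWorker do
    use GenServer

    # Registered via e.g. Swarm.register_name(name, __MODULE__, :start_link, [name])
    def start_link(name), do: GenServer.start_link(__MODULE__, name)

    def init(name), do: {:ok, %{name: name, count: 0}}

    # Asked by the tracker before the process is moved to another node.
    # Reply :restart to be restarted from scratch on the new owner, :ignore to be
    # left alone, or {:resume, handoff_state} to carry state over.
    def handle_call({:swarm, :begin_handoff}, _from, state),
      do: {:reply, {:resume, state.count}, state}

    # Delivered on the new node right after the process is restarted via its MFA.
    def handle_cast({:swarm, :end_handoff, count}, state),
      do: {:noreply, %{state | count: count}}

    # Delivered when a netsplit heals and two copies of the process existed.
    def handle_cast({:swarm, :resolve_conflict, _remote_count}, state),
      do: {:noreply, state}

    # The old copy is told to shut down once the handoff has been sent.
    def handle_info({:swarm, :die}, state),
      do: {:stop, :shutdown, state}
  end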
944 | 945 | # This is the callback for tracker events which are being replicated from other nodes in the cluster 946 | defp handle_replica_event(_from, {:track, name, pid, meta}, rclock, %TrackerState{clock: clock}) do 947 | debug("replicating registration for #{inspect(name)} (#{inspect(pid)}) locally") 948 | 949 | case Registry.get_by_name(name) do 950 | entry(name: ^name, pid: ^pid, meta: ^meta) -> 951 | # We're already up to date 952 | :keep_state_and_data 953 | 954 | entry(name: ^name, pid: ^pid, clock: lclock) -> 955 | # We don't have the same view of the metadata 956 | cond do 957 | Clock.leq(lclock, rclock) -> 958 | # The remote version is dominant 959 | lclock = Clock.join(lclock, rclock) 960 | Registry.update(name, meta: meta, clock: lclock) 961 | :keep_state_and_data 962 | 963 | Clock.leq(rclock, lclock) -> 964 | # The local version is dominant 965 | :keep_state_and_data 966 | 967 | :else -> 968 | warn( 969 | "received track event for #{inspect(name)}, but local clock conflicts with remote clock, event unhandled" 970 | ) 971 | 972 | :keep_state_and_data 973 | end 974 | 975 | entry(name: ^name, pid: other_pid, ref: ref, clock: lclock) = obj -> 976 | # we have conflicting views of this name, compare clocks and fix it 977 | current_node = Node.self() 978 | 979 | cond do 980 | Clock.leq(lclock, rclock) and node(other_pid) == current_node -> 981 | # The remote version is dominant, kill the local pid and remove the registration 982 | Process.demonitor(ref, [:flush]) 983 | Process.exit(other_pid, :kill) 984 | Registry.remove(obj) 985 | new_ref = Process.monitor(pid) 986 | lclock = Clock.join(lclock, rclock) 987 | Registry.new!(entry(name: name, pid: pid, ref: new_ref, meta: meta, clock: lclock)) 988 | :keep_state_and_data 989 | 990 | Clock.leq(rclock, lclock) -> 991 | # The local version is dominant, so ignore this event 992 | :keep_state_and_data 993 | 994 | :else -> 995 | # The clocks are conflicted, warn, and ignore this event 996 | warn( 997 | "received track event for #{inspect(name)}, mismatched pids, local clock conflicts with remote clock, event unhandled" 998 | ) 999 | 1000 | :keep_state_and_data 1001 | end 1002 | 1003 | :undefined -> 1004 | ref = Process.monitor(pid) 1005 | lclock = Clock.join(clock, rclock) 1006 | Registry.new!(entry(name: name, pid: pid, ref: ref, meta: meta, clock: lclock)) 1007 | :keep_state_and_data 1008 | end 1009 | end 1010 | 1011 | defp handle_replica_event(_from, {:untrack, pid}, rclock, _state) do 1012 | debug("replica event: untrack #{inspect(pid)}") 1013 | 1014 | case Registry.get_by_pid(pid) do 1015 | :undefined -> 1016 | :keep_state_and_data 1017 | 1018 | entries when is_list(entries) -> 1019 | Enum.each(entries, fn entry(ref: ref, clock: lclock) = obj -> 1020 | cond do 1021 | Clock.leq(lclock, rclock) -> 1022 | # registration came before unregister, so remove the registration 1023 | Process.demonitor(ref, [:flush]) 1024 | Registry.remove(obj) 1025 | 1026 | Clock.leq(rclock, lclock) -> 1027 | # registration is newer than de-registration, ignore msg 1028 | debug("untrack is causally dominated by track for #{inspect(pid)}, ignoring..") 1029 | 1030 | :else -> 1031 | debug("untrack is causally conflicted with track for #{inspect(pid)}, ignoring..") 1032 | end 1033 | end) 1034 | 1035 | :keep_state_and_data 1036 | end 1037 | end 1038 | 1039 | defp handle_replica_event(_from, {:update_meta, new_meta, pid}, rclock, state) do 1040 | debug("replica event: update_meta #{inspect(new_meta)} for #{inspect(pid)}") 1041 | 1042 | case Registry.get_by_pid(pid) do 
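      # A single pid may be registered under several names, so the merge below is
      # applied to every entry that references it.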
1043 | :undefined -> 1044 | :keep_state_and_data 1045 | 1046 | entries when is_list(entries) -> 1047 | Enum.each(entries, fn entry(name: name, meta: old_meta, clock: lclock) -> 1048 | cond do 1049 | Clock.leq(lclock, rclock) -> 1050 | lclock = Clock.join(lclock, rclock) 1051 | Registry.update(name, meta: new_meta, clock: lclock) 1052 | 1053 | debug( 1054 | "request to update meta from #{inspect(pid)} (#{inspect(new_meta)}) is causally dominated by remote, updated registry..." 1055 | ) 1056 | 1057 | Clock.leq(rclock, lclock) -> 1058 | # ignore the request, as the local clock dominates the remote 1059 | debug( 1060 | "request to update meta from #{inspect(pid)} (#{inspect(new_meta)}) is causally dominated by local, ignoring.." 1061 | ) 1062 | 1063 | :else -> 1064 | new_meta = Map.merge(old_meta, new_meta) 1065 | 1066 | # we're going to join and bump our local clock though and re-broadcast the update to ensure we converge 1067 | debug( 1068 | "conflicting meta for #{inspect(name)}, updating and notifying other nodes, old meta: #{ 1069 | inspect(old_meta) 1070 | }, new meta: #{inspect(new_meta)}" 1071 | ) 1072 | 1073 | lclock = Clock.join(lclock, rclock) 1074 | lclock = Clock.event(lclock) 1075 | Registry.update(name, meta: new_meta, clock: lclock) 1076 | broadcast_event(state.nodes, lclock, {:update_meta, new_meta, pid}) 1077 | end 1078 | end) 1079 | 1080 | :keep_state_and_data 1081 | end 1082 | end 1083 | 1084 | defp handle_replica_event(_from, event, _clock, _state) do 1085 | warn("received unrecognized replica event: #{inspect(event)}") 1086 | :keep_state_and_data 1087 | end 1088 | 1089 | # This is the handler for local operations on the tracker which require a response. 1090 | defp handle_call({:whereis, name}, from, %TrackerState{strategy: strategy}) do 1091 | current_node = Node.self() 1092 | 1093 | case Strategy.key_to_node(strategy, name) do 1094 | :undefined -> 1095 | GenStateMachine.reply(from, :undefined) 1096 | 1097 | ^current_node -> 1098 | case Registry.get_by_name(name) do 1099 | :undefined -> 1100 | GenStateMachine.reply(from, :undefined) 1101 | 1102 | entry(pid: pid) -> 1103 | GenStateMachine.reply(from, pid) 1104 | end 1105 | 1106 | other_node -> 1107 | _ = 1108 | Task.Supervisor.start_child(Swarm.TaskSupervisor, fn -> 1109 | case :rpc.call(other_node, Swarm.Registry, :get_by_name, [name], :infinity) do 1110 | :undefined -> 1111 | GenStateMachine.reply(from, :undefined) 1112 | 1113 | entry(pid: pid) -> 1114 | GenStateMachine.reply(from, pid) 1115 | 1116 | {:badrpc, reason} -> 1117 | warn( 1118 | "failed to execute remote get_by_name on #{inspect(other_node)}: #{ 1119 | inspect(reason) 1120 | }" 1121 | ) 1122 | 1123 | GenStateMachine.reply(from, :undefined) 1124 | end 1125 | end) 1126 | end 1127 | 1128 | :keep_state_and_data 1129 | end 1130 | 1131 | defp handle_call({:track, name, pid, meta}, from, %TrackerState{} = state) do 1132 | debug("registering #{inspect(pid)} as #{inspect(name)}, with metadata #{inspect(meta)}") 1133 | add_registration({name, pid, meta}, from, state) 1134 | end 1135 | 1136 | defp handle_call({:track, name, meta}, from, state) do 1137 | current_node = Node.self() 1138 | {{m, f, a}, _other_meta} = Map.pop(meta, :mfa) 1139 | 1140 | case from do 1141 | {from_pid, _} when node(from_pid) != current_node -> 1142 | debug( 1143 | "#{inspect(node(from_pid))} is registering #{inspect(name)} as process started by #{m}.#{ 1144 | f 1145 | }/#{length(a)} with args #{inspect(a)}" 1146 | ) 1147 | 1148 | _ -> 1149 | debug( 1150 | "registering #{inspect(name)} as 
process started by #{m}.#{f}/#{length(a)} with args #{ 1151 | inspect(a) 1152 | }" 1153 | ) 1154 | end 1155 | 1156 | do_track(%Tracking{name: name, meta: meta, from: from}, state) 1157 | end 1158 | 1159 | defp handle_call({:untrack, pid}, from, %TrackerState{} = state) do 1160 | debug("untrack #{inspect(pid)}") 1161 | {:ok, new_state} = remove_registration_by_pid(pid, state) 1162 | GenStateMachine.reply(from, :ok) 1163 | {:keep_state, new_state} 1164 | end 1165 | 1166 | defp handle_call({:add_meta, key, value, pid}, from, %TrackerState{} = state) do 1167 | debug("add_meta #{inspect({key, value})} to #{inspect(pid)}") 1168 | {:ok, new_state} = add_meta_by_pid({key, value}, pid, state) 1169 | GenStateMachine.reply(from, :ok) 1170 | {:keep_state, new_state} 1171 | end 1172 | 1173 | defp handle_call({:remove_meta, key, pid}, from, %TrackerState{} = state) do 1174 | debug("remote_meta #{inspect(key)} for #{inspect(pid)}") 1175 | {:ok, new_state} = remove_meta_by_pid(key, pid, state) 1176 | GenStateMachine.reply(from, :ok) 1177 | {:keep_state, new_state} 1178 | end 1179 | defp handle_call({:handoff, worker_name, handoff_state}, from, state) do 1180 | Registry.get_by_name(worker_name) 1181 | |> case do 1182 | :undefined -> 1183 | # Worker was already removed from registry -> do nothing 1184 | debug "The node #{worker_name} was not found in the registry" 1185 | entry(name: name, pid: pid, meta: %{mfa: _mfa} = meta) = obj -> 1186 | case Strategy.remove_node(state.strategy, state.self) |> Strategy.key_to_node(name) do 1187 | {:error, {:invalid_ring, :no_nodes}} -> 1188 | debug "Cannot handoff #{inspect name} because there is no other node left" 1189 | other_node -> 1190 | debug "#{inspect name} has requested to be terminated and resumed on another node" 1191 | {:ok, state} = remove_registration(obj, state) 1192 | send(pid, {:swarm, :die}) 1193 | debug "sending handoff for #{inspect name} to #{other_node}" 1194 | GenStateMachine.cast({__MODULE__, other_node}, 1195 | {:handoff, self(), {name, meta, handoff_state, Clock.peek(state.clock)}}) 1196 | end 1197 | end 1198 | 1199 | GenStateMachine.reply(from, :finished) 1200 | :keep_state_and_data 1201 | end 1202 | defp handle_call(msg, _from, _state) do 1203 | warn("unrecognized call: #{inspect(msg)}") 1204 | :keep_state_and_data 1205 | end 1206 | 1207 | # This is the handler for local operations on the tracker which are asynchronous 1208 | defp handle_cast({:sync, from, rclock}, %TrackerState{} = state) do 1209 | if ignore_node?(node(from)) do 1210 | GenStateMachine.cast(from, {:sync_err, :node_ignored}) 1211 | :keep_state_and_data 1212 | else 1213 | debug("received sync request from #{node(from)}") 1214 | sync_node = node(from) 1215 | ref = Process.monitor(from) 1216 | GenStateMachine.cast(from, {:sync_recv, self(), rclock, get_registry_snapshot()}) 1217 | 1218 | {:next_state, :awaiting_sync_ack, 1219 | %{state | sync_node: sync_node, sync_ref: ref}} 1220 | end 1221 | end 1222 | 1223 | defp handle_cast(msg, _state) do 1224 | warn("unrecognized cast: #{inspect(msg)}") 1225 | :keep_state_and_data 1226 | end 1227 | 1228 | # This is only ever called if a registration needs to be sent to a remote node 1229 | # and that node went down in the middle of the call to its Swarm process. 1230 | # We need to process the nodeup/down events by re-entering the receive loop first, 1231 | # so we send ourselves a message to retry. This is the handler for that message. 
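  # The retried request is simply re-dispatched through handle_call/3, so it follows
  # the normal tracking path against the (now updated) topology.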
1232 | defp handle_retry(from, {:track, name, meta}, state) do 1233 | handle_call({:track, name, meta}, from, state) 1234 | end 1235 | 1236 | defp handle_retry(_from, _event, _state) do 1237 | :keep_state_and_data 1238 | end 1239 | 1240 | # Called when a pid dies, and the monitor is triggered 1241 | defp handle_monitor(ref, pid, :noconnection, %TrackerState{} = state) do 1242 | # lost connection to the node this pid is running on, check if we should restart it 1243 | case Registry.get_by_ref(ref) do 1244 | :undefined -> 1245 | debug( 1246 | "lost connection to #{inspect(pid)}, but no registration could be found, ignoring.." 1247 | ) 1248 | 1249 | :keep_state_and_data 1250 | 1251 | entry(name: name, pid: ^pid, meta: %{mfa: _mfa}) -> 1252 | debug( 1253 | "lost connection to #{inspect(name)} (#{inspect(pid)}) on #{node(pid)}, node is down" 1254 | ) 1255 | 1256 | state 1257 | |> nodedown(node(pid)) 1258 | |> handle_node_status() 1259 | 1260 | entry(pid: ^pid) = obj -> 1261 | debug("lost connection to #{inspect(pid)}, but not restartable, removing registration..") 1262 | {:ok, new_state} = remove_registration(obj, state) 1263 | {:keep_state, new_state} 1264 | end 1265 | end 1266 | 1267 | defp handle_monitor(ref, pid, reason, %TrackerState{} = state) do 1268 | case Registry.get_by_ref(ref) do 1269 | :undefined -> 1270 | debug( 1271 | "#{inspect(pid)} is down: #{inspect(reason)}, but no registration found, ignoring.." 1272 | ) 1273 | 1274 | :keep_state_and_data 1275 | 1276 | entry(name: name, pid: ^pid) = obj -> 1277 | debug("#{inspect(name)} is down: #{inspect(reason)}") 1278 | {:ok, new_state} = remove_registration(obj, state) 1279 | {:keep_state, new_state} 1280 | end 1281 | end 1282 | 1283 | # Attempt to start a named process on its destination node 1284 | defp do_track( 1285 | %Tracking{name: name, meta: meta, from: from}, 1286 | %TrackerState{strategy: strategy} = state 1287 | ) do 1288 | current_node = Node.self() 1289 | {{m, f, a}, _other_meta} = Map.pop(meta, :mfa) 1290 | 1291 | case Strategy.key_to_node(strategy, name) do 1292 | :undefined -> 1293 | warn("no node available to start #{inspect(name)} process") 1294 | reply(from, {:error, :no_node_available}) 1295 | :keep_state_and_data 1296 | 1297 | ^current_node -> 1298 | case Registry.get_by_name(name) do 1299 | :undefined -> 1300 | debug("starting #{inspect(name)} on #{current_node}") 1301 | 1302 | try do 1303 | case apply(m, f, a) do 1304 | {:ok, pid} -> 1305 | debug("started #{inspect(name)} on #{current_node}") 1306 | add_registration({name, pid, meta}, from, state) 1307 | 1308 | err -> 1309 | warn("failed to start #{inspect(name)} on #{current_node}: #{inspect(err)}") 1310 | reply(from, {:error, {:invalid_return, err}}) 1311 | :keep_state_and_data 1312 | end 1313 | catch 1314 | kind, reason -> 1315 | warn(Exception.format(kind, reason, System.stacktrace())) 1316 | reply(from, {:error, reason}) 1317 | :keep_state_and_data 1318 | end 1319 | 1320 | entry(pid: pid) -> 1321 | debug("found #{inspect(name)} already registered on #{node(pid)}") 1322 | reply(from, {:error, {:already_registered, pid}}) 1323 | :keep_state_and_data 1324 | end 1325 | 1326 | remote_node -> 1327 | debug("starting #{inspect(name)} on remote node #{remote_node}") 1328 | 1329 | {:ok, _pid} = 1330 | Task.start(fn -> 1331 | start_pid_remotely(remote_node, from, name, meta, state) 1332 | end) 1333 | 1334 | :keep_state_and_data 1335 | end 1336 | end 1337 | 1338 | # Starts a process on a remote node. 
Handles failures with a retry mechanism 1339 | defp start_pid_remotely(remote_node, from, name, meta, state, attempts \\ 0) 1340 | 1341 | defp start_pid_remotely(remote_node, from, name, meta, %TrackerState{} = state, attempts) 1342 | when attempts <= @retry_max_attempts do 1343 | try do 1344 | case GenStateMachine.call({__MODULE__, remote_node}, {:track, name, meta}, :infinity) do 1345 | {:ok, pid} -> 1346 | debug("remotely started #{inspect(name)} (#{inspect(pid)}) on #{remote_node}") 1347 | reply(from, {:ok, pid}) 1348 | 1349 | {:error, {:already_registered, pid}} -> 1350 | debug( 1351 | "#{inspect(name)} already registered to #{inspect(pid)} on #{node(pid)}, registering locally" 1352 | ) 1353 | 1354 | # register named process that is unknown locally 1355 | add_registration({name, pid, meta}, from, state) 1356 | :ok 1357 | 1358 | {:error, {:noproc, _}} = err -> 1359 | warn( 1360 | "#{inspect(name)} could not be started on #{remote_node}: #{inspect(err)}, retrying operation after #{ 1361 | @retry_interval 1362 | }ms.." 1363 | ) 1364 | 1365 | :timer.sleep(@retry_interval) 1366 | start_pid_remotely(remote_node, from, name, meta, state, attempts + 1) 1367 | 1368 | {:error, :undef} -> 1369 | warn( 1370 | "#{inspect(name)} could not be started on #{remote_node}: target module not available on remote node, retrying operation after #{ 1371 | @retry_interval 1372 | }ms.." 1373 | ) 1374 | 1375 | :timer.sleep(@retry_interval) 1376 | start_pid_remotely(remote_node, from, name, meta, state, attempts + 1) 1377 | 1378 | {:error, _reason} = err -> 1379 | warn("#{inspect(name)} could not be started on #{remote_node}: #{inspect(err)}") 1380 | reply(from, err) 1381 | end 1382 | catch 1383 | _, {:noproc, _} -> 1384 | warn( 1385 | "remote tracker on #{remote_node} went down during registration, retrying operation.." 1386 | ) 1387 | 1388 | start_pid_remotely(remote_node, from, name, meta, state) 1389 | 1390 | _, {{:nodedown, _}, _} -> 1391 | warn("failed to start #{inspect(name)} on #{remote_node}: nodedown, retrying operation..") 1392 | 1393 | new_state = %{ 1394 | state 1395 | | nodes: state.nodes -- [remote_node], 1396 | strategy: Strategy.remove_node(state.strategy, remote_node) 1397 | } 1398 | 1399 | case Strategy.key_to_node(new_state.strategy, name) do 1400 | :undefined -> 1401 | warn("failed to start #{inspect(name)} as no node available") 1402 | reply(from, {:error, :no_node_available}) 1403 | 1404 | new_node -> 1405 | start_pid_remotely(new_node, from, name, meta, new_state) 1406 | end 1407 | 1408 | kind, err -> 1409 | error(Exception.format(kind, err, System.stacktrace())) 1410 | warn("failed to start #{inspect(name)} on #{remote_node}: #{inspect(err)}") 1411 | reply(from, {:error, err}) 1412 | end 1413 | end 1414 | 1415 | defp start_pid_remotely(remote_node, from, name, _meta, _state, attempts) do 1416 | warn( 1417 | "#{inspect(name)} could not be started on #{remote_node}, failed to start after #{attempts} attempt(s)" 1418 | ) 1419 | 1420 | reply(from, {:error, :too_many_attempts}) 1421 | end 1422 | 1423 | ## Internal helpers 1424 | 1425 | # Send a reply message unless the recipient client is `nil`. 
Function always returns `:ok` 1426 | defp reply(nil, _message), do: :ok 1427 | defp reply(from, message), do: GenStateMachine.reply(from, message) 1428 | 1429 | defp broadcast_event([], _clock, _event), do: :ok 1430 | 1431 | defp broadcast_event(nodes, clock, event) do 1432 | clock = Clock.peek(clock) 1433 | 1434 | :abcast = :rpc.abcast(nodes, __MODULE__, {:event, self(), clock, event}) 1435 | :ok 1436 | end 1437 | 1438 | # Add a registration and reply to the caller with the result, then return the state transition 1439 | defp add_registration({_name, _pid, _meta} = reg, from, state) do 1440 | case register(reg, state) do 1441 | {:ok, reply, new_state} -> 1442 | reply(from, {:ok, reply}) 1443 | {:keep_state, new_state} 1444 | 1445 | {:error, reply, new_state} -> 1446 | reply(from, {:error, reply}) 1447 | {:keep_state, new_state} 1448 | end 1449 | end 1450 | 1451 | # Add a registration and return the result of the add 1452 | defp register({name, pid, meta}, %TrackerState{clock: clock, nodes: nodes} = state) do 1453 | case Registry.get_by_name(name) do 1454 | :undefined -> 1455 | ref = Process.monitor(pid) 1456 | lclock = Clock.event(clock) 1457 | Registry.new!(entry(name: name, pid: pid, ref: ref, meta: meta, clock: lclock)) 1458 | broadcast_event(nodes, lclock, {:track, name, pid, meta}) 1459 | {:ok, pid, state} 1460 | 1461 | entry(pid: ^pid) -> 1462 | # Not sure how this could happen, but hey, no need to return an error 1463 | {:ok, pid, state} 1464 | 1465 | entry(pid: other_pid) -> 1466 | debug( 1467 | "conflicting registration for #{inspect(name)}: remote (#{inspect(pid)}) vs. local #{ 1468 | inspect(other_pid) 1469 | }" 1470 | ) 1471 | 1472 | # Since there is already a registration, we need to check whether to kill the newly 1473 | # created process 1474 | pid_node = node(pid) 1475 | current_node = Node.self() 1476 | 1477 | case meta do 1478 | %{mfa: _} when pid_node == current_node -> 1479 | # This was created via register_name/5, which means we need to kill the pid we started 1480 | Process.exit(pid, :kill) 1481 | 1482 | _ -> 1483 | # This was a pid started by something else, so we can ignore it 1484 | :ok 1485 | end 1486 | 1487 | {:error, {:already_registered, other_pid}, state} 1488 | end 1489 | end 1490 | 1491 | # Remove a registration, and return the result of the remove 1492 | defp remove_registration(entry(pid: pid, ref: ref, clock: lclock) = obj, state) do 1493 | Process.demonitor(ref, [:flush]) 1494 | Registry.remove(obj) 1495 | lclock = Clock.event(lclock) 1496 | broadcast_event(state.nodes, lclock, {:untrack, pid}) 1497 | {:ok, state} 1498 | end 1499 | 1500 | defp remove_registration_by_pid(pid, state) do 1501 | case Registry.get_by_pid(pid) do 1502 | :undefined -> 1503 | {:ok, state} 1504 | 1505 | entries when is_list(entries) -> 1506 | Enum.each(entries, fn entry -> 1507 | remove_registration(entry, state) 1508 | end) 1509 | 1510 | {:ok, state} 1511 | end 1512 | end 1513 | 1514 | defp add_meta_by_pid({key, value}, pid, state) do 1515 | case Registry.get_by_pid(pid) do 1516 | :undefined -> 1517 | {:ok, state} 1518 | 1519 | entries when is_list(entries) -> 1520 | Enum.each(entries, fn entry(name: name, meta: old_meta, clock: lclock) -> 1521 | new_meta = Map.put(old_meta, key, value) 1522 | lclock = Clock.event(lclock) 1523 | Registry.update(name, meta: new_meta, clock: lclock) 1524 | broadcast_event(state.nodes, lclock, {:update_meta, new_meta, pid}) 1525 | end) 1526 | 1527 | {:ok, state} 1528 | end 1529 | end 1530 | 1531 | defp remove_meta_by_pid(key, pid, state) do 1532 | 
case Registry.get_by_pid(pid) do 1533 | :undefined -> 1534 | {:ok, state} 1535 | 1536 | entries when is_list(entries) -> 1537 | Enum.each(entries, fn entry(name: name, meta: old_meta, clock: lclock) -> 1538 | new_meta = Map.drop(old_meta, [key]) 1539 | lclock = Clock.event(lclock) 1540 | Registry.update(name, meta: new_meta, clock: lclock) 1541 | broadcast_event(state.nodes, lclock, {:update_meta, new_meta, pid}) 1542 | end) 1543 | 1544 | {:ok, state} 1545 | end 1546 | end 1547 | 1548 | @global_blacklist MapSet.new([~r/^remsh.*$/, ~r/^.+_upgrader_.+$/, ~r/^.+_maint_.+$/]) 1549 | # The list of configured ignore patterns for nodes 1550 | # This is only applied if no blacklist is provided. 1551 | defp node_blacklist() do 1552 | Application.get_env(:swarm, :node_blacklist, []) 1553 | |> MapSet.new() 1554 | |> MapSet.union(@global_blacklist) 1555 | |> MapSet.to_list() 1556 | end 1557 | 1558 | # The list of configured whitelist patterns for nodes 1559 | # If a whitelist is provided, any nodes which do not match the whitelist are ignored 1560 | defp node_whitelist(), do: Application.get_env(:swarm, :node_whitelist, []) 1561 | 1562 | # Determine if a node should be ignored, even if connected 1563 | # The whitelist and blacklist can contain literal strings, regexes, or regex strings 1564 | # By default, all nodes are allowed, except those which are remote shell sessions 1565 | # where the node name of the remote shell starts with `remsh` (relx, exrm, and distillery) 1566 | # all use that prefix for remote shells. 1567 | defp ignore_node?(node) do 1568 | blacklist = node_blacklist() 1569 | whitelist = node_whitelist() 1570 | HashRing.Utils.ignore_node?(node, blacklist, whitelist) 1571 | end 1572 | 1573 | # Used during anti-entropy checks to remove local registrations and replace them with the remote version 1574 | defp resolve_incorrect_local_reg( 1575 | _remote_node, 1576 | entry(pid: lpid, clock: lclock) = lreg, 1577 | entry(name: rname, pid: rpid, meta: rmeta, clock: rclock), 1578 | state 1579 | ) do 1580 | # the remote registration is correct 1581 | {:ok, new_state} = remove_registration(lreg, state) 1582 | send(lpid, {:swarm, :die}) 1583 | # add the remote registration 1584 | ref = Process.monitor(rpid) 1585 | lclock = Clock.join(lclock, rclock) 1586 | Registry.new!(entry(name: rname, pid: rpid, ref: ref, meta: rmeta, clock: lclock)) 1587 | new_state 1588 | end 1589 | 1590 | # Used during anti-entropy checks to remove remote registrations and replace them with the local version 1591 | defp resolve_incorrect_remote_reg( 1592 | remote_node, 1593 | entry(pid: lpid, meta: lmeta), 1594 | entry(name: rname, pid: rpid), 1595 | state 1596 | ) do 1597 | GenStateMachine.cast({__MODULE__, remote_node}, {:untrack, rpid}) 1598 | send(rpid, {:swarm, :die}) 1599 | GenStateMachine.cast({__MODULE__, remote_node}, {:track, rname, lpid, lmeta}) 1600 | state 1601 | end 1602 | 1603 | # A new node has been added to the cluster, we need to update the distribution strategy and handle shifting 1604 | # processes to new nodes based on the new topology. 
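  # A :nodeup carrying our own name means this node was renamed, so the strategy is
  # rebuilt around the new name; ignored and already-known nodes are no-ops, and any
  # other node is only added once :swarm is confirmed to be running on it.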
1605 | defp nodeup(%TrackerState{nodes: nodes, strategy: strategy} = state, node) do 1606 | cond do 1607 | node == Node.self() -> 1608 | new_strategy = 1609 | strategy 1610 | |> Strategy.remove_node(state.self) 1611 | |> Strategy.add_node(node) 1612 | 1613 | info("node name changed from #{state.self} to #{node}") 1614 | {:ok, %{state | self: node, strategy: new_strategy}} 1615 | 1616 | Enum.member?(nodes, node) -> 1617 | {:ok, state} 1618 | 1619 | ignore_node?(node) -> 1620 | {:ok, state} 1621 | 1622 | :else -> 1623 | ensure_swarm_started_on_remote_node(state, node) 1624 | end 1625 | end 1626 | 1627 | defp ensure_swarm_started_on_remote_node(state, node, attempts \\ 0) 1628 | 1629 | defp ensure_swarm_started_on_remote_node( 1630 | %TrackerState{nodes: nodes, strategy: strategy} = state, 1631 | node, 1632 | attempts 1633 | ) 1634 | when attempts <= @retry_max_attempts do 1635 | case :rpc.call(node, :application, :which_applications, []) do 1636 | app_list when is_list(app_list) -> 1637 | case List.keyfind(app_list, :swarm, 0) do 1638 | {:swarm, _, _} -> 1639 | info("nodeup #{node}") 1640 | 1641 | new_state = %{ 1642 | state 1643 | | nodes: [node | nodes], 1644 | strategy: Strategy.add_node(strategy, node) 1645 | } 1646 | 1647 | {:ok, new_state, {:topology_change, {:nodeup, node}}} 1648 | 1649 | nil -> 1650 | debug( 1651 | "nodeup for #{node} was ignored because swarm not started yet, will retry in #{ 1652 | @retry_interval 1653 | }ms.." 1654 | ) 1655 | 1656 | Process.send_after( 1657 | self(), 1658 | {:ensure_swarm_started_on_remote_node, node, attempts + 1}, 1659 | @retry_interval 1660 | ) 1661 | 1662 | {:ok, state} 1663 | end 1664 | 1665 | other -> 1666 | warn("nodeup for #{node} was ignored because: #{inspect(other)}") 1667 | {:ok, state} 1668 | end 1669 | end 1670 | 1671 | defp ensure_swarm_started_on_remote_node(%TrackerState{} = state, node, attempts) do 1672 | warn( 1673 | "nodeup for #{node} was ignored because swarm failed to start after #{attempts} attempt(s)" 1674 | ) 1675 | 1676 | {:ok, state} 1677 | end 1678 | 1679 | # A remote node went down, we need to update the distribution strategy and handle restarting/shifting processes 1680 | # as needed based on the new topology 1681 | defp nodedown(%TrackerState{nodes: nodes, strategy: strategy} = state, node) do 1682 | cond do 1683 | Enum.member?(nodes, node) -> 1684 | info("nodedown #{node}") 1685 | strategy = Strategy.remove_node(strategy, node) 1686 | 1687 | pending_reqs = 1688 | Enum.filter(state.pending_sync_reqs, fn 1689 | ^node -> false 1690 | _ -> true 1691 | end) 1692 | 1693 | new_state = %{ 1694 | state 1695 | | nodes: nodes -- [node], 1696 | strategy: strategy, 1697 | pending_sync_reqs: pending_reqs 1698 | } 1699 | 1700 | {:ok, new_state, {:topology_change, {:nodedown, node}}} 1701 | 1702 | :else -> 1703 | {:ok, state} 1704 | end 1705 | end 1706 | 1707 | defp get_registry_snapshot() do 1708 | snapshot = Registry.snapshot() 1709 | 1710 | Enum.map(snapshot, fn entry(name: name, pid: pid, ref: ref, meta: meta, clock: clock) -> 1711 | entry(name: name, pid: pid, ref: ref, meta: meta, clock: Clock.peek(clock)) 1712 | end) 1713 | end 1714 | end 1715 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | # Since we depend on gen_statem, an OTP 19 construct 2 | # Warn if someone depends on this in <19 3 | otp_release = 4 | String.split("#{:erlang.system_info(:otp_release)}", ".") 5 | |> List.first() 6 | |> 
String.to_integer() 7 | 8 | if otp_release < 19 do 9 | IO.warn("Swarm requires Erlang/OTP 19 or greater", []) 10 | end 11 | 12 | defmodule Swarm.Mixfile do 13 | use Mix.Project 14 | 15 | def project do 16 | [ 17 | app: :swarm, 18 | version: "3.4.0", 19 | elixir: "~> 1.3", 20 | elixirc_paths: elixirc_paths(Mix.env()), 21 | build_embedded: Mix.env() == :prod, 22 | start_permanent: Mix.env() == :prod, 23 | description: 24 | "A fast, multi-master, distributed global process registry, with automatic distribution of worker processes.", 25 | package: package(), 26 | docs: docs(), 27 | deps: deps(), 28 | aliases: aliases(), 29 | dialyzer: [ 30 | plt_add_apps: [:inets], 31 | plt_add_deps: :transitive, 32 | flags: ~w(-Wunmatched_returns -Werror_handling -Wrace_conditions -Wunderspecs) 33 | ] 34 | ] 35 | end 36 | 37 | def application do 38 | [extra_applications: [:logger, :crypto], mod: {Swarm, []}] 39 | end 40 | 41 | defp deps do 42 | [ 43 | {:ex_doc, "~> 0.13", only: :dev}, 44 | {:dialyxir, "~> 0.3", only: :dev}, 45 | {:benchee, "~> 0.4", only: :dev}, 46 | {:porcelain, "~> 2.0", only: [:dev, :test]}, 47 | {:libring, "~> 1.0"}, 48 | {:gen_state_machine, "~> 2.0"} 49 | ] 50 | end 51 | 52 | defp package do 53 | [ 54 | files: ["lib", "src", "mix.exs", "README.md", "LICENSE.md"], 55 | maintainers: ["Paul Schoenfelder"], 56 | licenses: ["MIT"], 57 | links: %{Github: "https://github.com/bitwalker/swarm"} 58 | ] 59 | end 60 | 61 | defp docs do 62 | [ 63 | main: "readme", 64 | formatter_opts: [gfm: true], 65 | extras: [ 66 | "README.md" 67 | ] 68 | ] 69 | end 70 | 71 | defp aliases() do 72 | if System.get_env("SWARM_TEST_DEBUG") do 73 | [test: "test --no-start --trace"] 74 | else 75 | [test: "test --no-start"] 76 | end 77 | end 78 | 79 | defp elixirc_paths(:test), do: ["lib", "test/support"] 80 | defp elixirc_paths(_), do: ["lib"] 81 | end 82 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "benchee": {:hex, :benchee, "0.13.2", "30cd4ff5f593fdd218a9b26f3c24d580274f297d88ad43383afe525b1543b165", [:mix], [{:deep_merge, "~> 0.1", [hex: :deep_merge, repo: "hexpm", optional: false]}], "hexpm"}, 3 | "deep_merge": {:hex, :deep_merge, "0.2.0", "c1050fa2edf4848b9f556fba1b75afc66608a4219659e3311d9c9427b5b680b3", [:mix], [], "hexpm"}, 4 | "dialyxir": {:hex, :dialyxir, "0.5.1", "b331b091720fd93e878137add264bac4f644e1ddae07a70bf7062c7862c4b952", [:mix], [], "hexpm"}, 5 | "earmark": {:hex, :earmark, "1.2.6", "b6da42b3831458d3ecc57314dff3051b080b9b2be88c2e5aa41cd642a5b044ed", [:mix], [], "hexpm"}, 6 | "ex_doc": {:hex, :ex_doc, "0.19.1", "519bb9c19526ca51d326c060cb1778d4a9056b190086a8c6c115828eaccea6cf", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.7", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm"}, 7 | "gen_state_machine": {:hex, :gen_state_machine, "2.0.3", "477ea51b466a749ab23a0d6090e9e84073f41f9aa28c7efc40eac18f3d4a9f77", [:mix], [], "hexpm"}, 8 | "libring": {:hex, :libring, "1.4.0", "41246ba2f3fbc76b3971f6bce83119dfec1eee17e977a48d8a9cfaaf58c2a8d6", [:mix], [], "hexpm"}, 9 | "makeup": {:hex, :makeup, "0.5.1", "966c5c2296da272d42f1de178c1d135e432662eca795d6dc12e5e8787514edf7", [:mix], [{:nimble_parsec, "~> 0.2.2", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"}, 10 | "makeup_elixir": {:hex, :makeup_elixir, "0.8.0", "1204a2f5b4f181775a0e456154830524cf2207cf4f9112215c05e0b76e4eca8b", [:mix], 
[{:makeup, "~> 0.5.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 0.2.2", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"}, 11 | "nimble_parsec": {:hex, :nimble_parsec, "0.2.2", "d526b23bdceb04c7ad15b33c57c4526bf5f50aaa70c7c141b4b4624555c68259", [:mix], [], "hexpm"}, 12 | "poison": {:hex, :poison, "2.2.0", "4763b69a8a77bd77d26f477d196428b741261a761257ff1cf92753a0d4d24a63", [:mix], []}, 13 | "porcelain": {:hex, :porcelain, "2.0.3", "2d77b17d1f21fed875b8c5ecba72a01533db2013bd2e5e62c6d286c029150fdc", [:mix], []}, 14 | } 15 | -------------------------------------------------------------------------------- /src/swarm.erl: -------------------------------------------------------------------------------- 1 | -module(swarm). 2 | 3 | -export([ 4 | start/2, stop/1, 5 | register_name/2, register_name/4, unregister_name/1, whereis_name/1, 6 | join/2, leave/2, members/1, 7 | publish/2, multi_call/2, multi_call/3, send/2 8 | ]). 9 | 10 | -define(SWARM, 'Elixir.Swarm'). 11 | 12 | %% @doc You shouldn't need this if you've added the 13 | %% the `swarm` application to your applications 14 | %% list, but it's here if you need it. 15 | %% @end 16 | start(Type, Args) -> 17 | ?SWARM:start(Type, Args). 18 | 19 | %% Same as above, use if you need it. 20 | stop(State) -> 21 | ?SWARM:stop(State). 22 | 23 | %% @doc Registers a name to a pid. Should not be used directly, 24 | %% should only be used with `{via, swarm, Name}` 25 | -spec register_name(term(), pid()) -> yes | no. 26 | register_name(Name, Pid) -> 27 | ?SWARM:register_name(Name, Pid). 28 | 29 | %% @doc Registers a name to a process started by the provided 30 | %% module/function/args. If the MFA does not start a process, 31 | %% an error will be returned. 32 | %% @end 33 | -spec register_name(term(), atom(), atom(), [term()]) -> {ok, pid()} | {error, term()}. 34 | register_name(Name, Module, Function, Args) -> 35 | ?SWARM:register_name(Name, Module, Function, Args). 36 | 37 | %% @doc Unregisters a name. 38 | -spec unregister_name(term()) -> ok. 39 | unregister_name(Name) -> 40 | ?SWARM:unregister_name(Name). 41 | 42 | %% @doc Get the pid of a registered name. 43 | -spec whereis_name(term()) -> pid() | undefined. 44 | whereis_name(Name) -> 45 | ?SWARM:whereis_name(Name). 46 | 47 | %% @doc Join a process to a group 48 | -spec join(term(), pid()) -> ok. 49 | join(Group, Pid) -> 50 | ?SWARM:join(Group, Pid). 51 | 52 | %% @doc Part a process from a group 53 | -spec leave(term(), pid()) -> ok. 54 | leave(Group, Pid) -> 55 | ?SWARM:leave(Group, Pid). 56 | 57 | %% @doc Get a list of pids which are members of the given group 58 | -spec members(term()) -> [pid()]. 59 | members(Group) -> 60 | ?SWARM:members(Group). 61 | 62 | %% @doc Publish a message to all members of a group 63 | -spec publish(term(), term()) -> ok. 64 | publish(Group, Message) -> 65 | ?SWARM:publish(Group, Message). 66 | 67 | %% @doc Call all members of a group with the given message 68 | %% and return the results as a list. 69 | %% @end 70 | -spec multi_call(term(), term()) -> [any()]. 71 | multi_call(Group, Message) -> 72 | multi_call(Group, Message, 5000). 73 | 74 | %% @doc Same as multi_call/2, but takes a timeout. 75 | %% Any responses not received within that period are 76 | %% ignored. 77 | %% @end 78 | -spec multi_call(term(), term(), pos_integer()) -> [any()]. 79 | multi_call(Group, Message, Timeout) -> 80 | ?SWARM:multi_call(Group, Message, Timeout). 81 | 82 | %% @doc This function sends a message to the process registered to the given name. 
83 | %% It is intended to be used by GenServer when using `GenServer.cast/2`, but you 84 | %% may use it to send any message to the desired process. 85 | %% @end 86 | -spec send(term(), term()) -> ok. 87 | send(Name, Msg) -> 88 | ?SWARM:send(Name, Msg). 89 | -------------------------------------------------------------------------------- /test/crdt_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.IntervalTreeClockTests do 2 | use ExUnit.Case 3 | 4 | alias Swarm.IntervalTreeClock, as: Clock 5 | 6 | setup do 7 | lclock = Clock.seed() 8 | {lclock, rclock} = Clock.fork(lclock) 9 | [lclock: lclock, rclock: rclock] 10 | end 11 | 12 | test "Forked clocks compare equal", %{lclock: lclock, rclock: rclock} do 13 | assert Clock.compare(lclock, rclock) == :eq 14 | assert Clock.leq(lclock, rclock) 15 | assert Clock.leq(rclock, lclock) 16 | end 17 | 18 | test "Peeked Clock has zero identity and still compares equal", %{ 19 | lclock: lclock, 20 | rclock: rclock 21 | } do 22 | rclock = Clock.peek(rclock) 23 | 24 | {0, _} = rclock 25 | assert Clock.compare(lclock, rclock) == :eq 26 | assert Clock.leq(lclock, rclock) 27 | assert Clock.leq(rclock, lclock) 28 | end 29 | 30 | test "Clock with additional event is greater", %{lclock: lclock, rclock: rclock} do 31 | rclock = Clock.event(rclock) 32 | 33 | assert Clock.compare(rclock, lclock) == :gt 34 | assert Clock.compare(lclock, rclock) == :lt 35 | assert Clock.leq(lclock, rclock) 36 | assert not Clock.leq(rclock, lclock) 37 | end 38 | 39 | test "Peeked Clock with additional event is greater", %{lclock: lclock, rclock: rclock} do 40 | rclock = Clock.peek(Clock.event(rclock)) 41 | 42 | assert Clock.compare(rclock, lclock) == :gt 43 | assert Clock.compare(lclock, rclock) == :lt 44 | assert Clock.leq(lclock, rclock) 45 | assert not Clock.leq(rclock, lclock) 46 | end 47 | 48 | test "Clocks with two parallel events are concurrent", %{lclock: lclock, rclock: rclock} do 49 | lclock = Clock.event(lclock) 50 | rclock = Clock.event(rclock) 51 | 52 | assert not Clock.leq(lclock, rclock) 53 | assert not Clock.leq(rclock, lclock) 54 | assert Clock.compare(lclock, rclock) == :concurrent 55 | end 56 | 57 | test "Peeked Clocks with two parallel events are concurrent", %{lclock: lclock, rclock: rclock} do 58 | lclock = Clock.peek(Clock.event(lclock)) 59 | rclock = Clock.peek(Clock.event(rclock)) 60 | 61 | assert not Clock.leq(lclock, rclock) 62 | assert not Clock.leq(rclock, lclock) 63 | assert Clock.compare(lclock, rclock) == :concurrent 64 | end 65 | 66 | test "Clock with additional event can be joined", %{lclock: lclock, rclock: rclock} do 67 | rclock = Clock.event(rclock) 68 | joined_clock = Clock.join(lclock, rclock) 69 | 70 | assert Clock.compare(joined_clock, rclock) == :eq 71 | assert Clock.compare(joined_clock, lclock) == :gt 72 | end 73 | 74 | test "Peeked clock with additional event can be joined", %{lclock: lclock, rclock: rclock} do 75 | rclock = Clock.peek(Clock.event(rclock)) 76 | joined_clock = Clock.join(lclock, rclock) 77 | 78 | assert Clock.compare(joined_clock, rclock) == :eq 79 | assert Clock.compare(joined_clock, lclock) == :gt 80 | end 81 | 82 | test "Concurrent clocks can be joined and the joined clock contains events from both", %{ 83 | lclock: lclock, 84 | rclock: rclock 85 | } do 86 | rclock = Clock.event(rclock) 87 | lclock = Clock.event(lclock) 88 | joined_clock = Clock.join(lclock, rclock) 89 | 90 | assert Clock.compare(joined_clock, rclock) == :gt 91 | assert Clock.compare(joined_clock, 
lclock) == :gt 92 | end 93 | 94 | test "Concurrent clocks can be joined and new event is not concurrent anymore", %{ 95 | lclock: lclock, 96 | rclock: rclock 97 | } do 98 | rclock = Clock.event(rclock) 99 | lclock = Clock.event(lclock) 100 | 101 | joined_lclock = Clock.join(lclock, rclock) 102 | joined_rclock = Clock.join(rclock, lclock) 103 | 104 | joined_lclock = Clock.event(joined_lclock) 105 | 106 | assert Clock.compare(joined_lclock, joined_rclock) == :gt 107 | end 108 | end 109 | -------------------------------------------------------------------------------- /test/distributed_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.DistributedTests do 2 | use ExUnit.Case, async: false 3 | 4 | alias Swarm.Nodes 5 | 6 | @moduletag timeout: 120_000 7 | @moduletag :capture_log 8 | @moduletag :distributed 9 | 10 | setup_all do 11 | :rand.seed(:exs64) 12 | :net_kernel.stop() 13 | {:ok, _} = :net_kernel.start([:swarm_master, :shortnames]) 14 | Node.set_cookie(:swarm_test) 15 | 16 | on_exit(fn -> 17 | :net_kernel.stop() 18 | exclude = Keyword.get(ExUnit.configuration(), :exclude, []) 19 | 20 | unless :clustered in exclude do 21 | :net_kernel.start([:"primary@127.0.0.1"]) 22 | end 23 | end) 24 | 25 | :ok 26 | end 27 | 28 | setup do 29 | whitelist = [~r/^[a-z]@.*$/] 30 | {:ok, node1, _node1pid} = Nodes.start(:a, debug: true, node_whitelist: whitelist) 31 | {:ok, node2, _node2pid} = Nodes.start(:b, debug: true, node_whitelist: whitelist) 32 | 33 | on_exit(fn -> 34 | Nodes.stop(node1) 35 | Nodes.stop(node2) 36 | end) 37 | 38 | [nodes: [node1, node2]] 39 | end 40 | 41 | test "correct redistribution of processes", %{nodes: [node1, node2 | _]} do 42 | # connect nodes 43 | :rpc.call(node2, :net_kernel, :connect_node, [node1]) 44 | # start swarm 45 | {:ok, _} = :rpc.call(node1, Application, :ensure_all_started, [:swarm]) 46 | {:ok, _} = :rpc.call(node2, Application, :ensure_all_started, [:swarm]) 47 | # give time to warm up 48 | Process.sleep(1_000) 49 | 50 | # start 5 processes from node2 to be distributed between node1 and node2 51 | worker_count = 10 52 | 53 | procs = 54 | for n <- 1..worker_count do 55 | name = {:"worker#{n}", n} 56 | # name = :"worker#{n}" 57 | {:ok, pid} = 58 | :rpc.call(node2, Swarm, :register_name, [name, MyApp.Worker, :start_link, []]) 59 | 60 | {node(pid), name, pid} 61 | end 62 | 63 | # IO.puts "workers started" 64 | 65 | # give time to sync 66 | Process.sleep(5_000) 67 | 68 | :rpc.call(node1, :net_kernel, :disconnect, [node2]) 69 | :rpc.call(node2, :net_kernel, :disconnect, [node1]) 70 | # Nodes.stop(node2) 71 | 72 | # IO.puts "node2 disconnected" 73 | 74 | # give time to sync 75 | Process.sleep(5_000) 76 | 77 | # node1_members = :rpc.call(node1, Swarm, :registered, [], :infinity) 78 | # node2_members = :rpc.call(node2, Swarm, :registered, [], :infinity) 79 | # node1_ms = Enum.map(node1_members, fn {k, _} -> k end) 80 | # node2_ms = Enum.map(node2_members, fn {k, _} -> k end) 81 | # IO.inspect {:node1, length(node1_members)} 82 | # IO.inspect {:node2, length(node2_members)} 83 | # missing = Enum.reject(node1_members, fn v -> MapSet.member?(node2_ms, v) end) 84 | # IO.inspect node2_ms -- node1_ms 85 | 86 | # check to see if the processes were moved as expected to node1 87 | node2procs = 88 | procs 89 | |> Enum.filter(fn 90 | {^node2, _, _} -> true 91 | _ -> false 92 | end) 93 | 94 | # IO.inspect {:node2procs, length(node2procs)} 95 | node2procs 96 | |> Enum.map(fn {_, name, _} -> 97 | case :rpc.call(node1, Swarm, 
:whereis_name, [name]) do 98 | :undefined -> 99 | assert :undefined == node1 100 | 101 | pid -> 102 | assert node(pid) == node1 103 | end 104 | end) 105 | 106 | # restore node2 to cluster 107 | # IO.puts "node2 reconnecting" 108 | :rpc.call(node1, :net_kernel, :connect_node, [node2]) 109 | :rpc.call(node2, :net_kernel, :connect_node, [node1]) 110 | # IO.puts "node2 reconnected" 111 | 112 | # give time to sync 113 | Process.sleep(5_000) 114 | 115 | # make sure processes are back in the correct place 116 | procs 117 | |> Enum.filter(fn 118 | {^node2, _, _} -> true 119 | _ -> false 120 | end) 121 | |> Enum.filter(fn {target, name, _} -> 122 | pid = :rpc.call(node1, Swarm, :whereis_name, [name]) 123 | node(pid) == target 124 | end) 125 | 126 | node1_members = :rpc.call(node1, Swarm, :registered, [], :infinity) 127 | node2_members = :rpc.call(node2, Swarm, :registered, [], :infinity) 128 | n1ms = MapSet.new(node1_members) 129 | n2ms = MapSet.new(node2_members) 130 | empty_ms = MapSet.new([]) 131 | # IO.inspect {:node1_members, length(node1_members)} 132 | # IO.inspect {:node2_members, length(node2_members)} 133 | # IO.inspect {:union, MapSet.size(MapSet.union(n1ms, n2ms))} 134 | assert length(node1_members) == worker_count 135 | assert length(node2_members) == worker_count 136 | assert ^empty_ms = MapSet.difference(n1ms, n2ms) 137 | end 138 | end 139 | -------------------------------------------------------------------------------- /test/distribution/static_quorum_ring_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Distribution.StaticQuorumRingTests do 2 | use ExUnit.Case, async: false 3 | 4 | @moduletag :capture_log 5 | 6 | alias Swarm.Distribution.StaticQuorumRing 7 | 8 | test "key to node should return `:undefined` until quorum size reached" do 9 | quorum = 10 | StaticQuorumRing.create() 11 | |> StaticQuorumRing.add_node("node1") 12 | 13 | assert StaticQuorumRing.key_to_node(quorum, :key1) == :undefined 14 | 15 | quorum = StaticQuorumRing.add_node(quorum, "node2") 16 | assert StaticQuorumRing.key_to_node(quorum, :key1) != :undefined 17 | 18 | quorum = StaticQuorumRing.add_node(quorum, "node3") 19 | assert StaticQuorumRing.key_to_node(quorum, :key1) != :undefined 20 | end 21 | 22 | test "quorum size should be set by binary setting" do 23 | static_quorum_size = Application.get_env(:swarm, :static_quorum_size) 24 | Application.put_env(:swarm, :static_quorum_size, "5") 25 | 26 | assert StaticQuorumRing.create() == %StaticQuorumRing{ 27 | ring: %HashRing{}, 28 | static_quorum_size: 5 29 | } 30 | 31 | Application.put_env(:swarm, :static_quorum_size, static_quorum_size) 32 | end 33 | 34 | test "creating StaticQuorumRing should raise if the setting is not a positive integer" do 35 | static_quorum_size = Application.get_env(:swarm, :static_quorum_size) 36 | 37 | Application.put_env(:swarm, :static_quorum_size, 0) 38 | 39 | assert_raise( 40 | RuntimeError, 41 | "config :static_quorum_size should be a positive integer", 42 | &StaticQuorumRing.create/0 43 | ) 44 | 45 | Application.put_env(:swarm, :static_quorum_size, {:strange, :tuple}) 46 | 47 | assert_raise( 48 | RuntimeError, 49 | "config :static_quorum_size should be a positive integer", 50 | &StaticQuorumRing.create/0 51 | ) 52 | 53 | Application.put_env(:swarm, :static_quorum_size, 0.5) 54 | 55 | assert_raise( 56 | RuntimeError, 57 | "config :static_quorum_size should be a positive integer", 58 | &StaticQuorumRing.create/0 59 | ) 60 | 61 | Application.put_env(:swarm, 
:static_quorum_size, "fake") 62 | 63 | assert_raise( 64 | RuntimeError, 65 | "config :static_quorum_size should be a positive integer", 66 | &StaticQuorumRing.create/0 67 | ) 68 | 69 | Application.put_env(:swarm, :static_quorum_size, static_quorum_size) 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /test/integration_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.IntegrationTest do 2 | use Swarm.NodeCase 3 | 4 | @node1 :"node1@127.0.0.1" 5 | @node2 :"node2@127.0.0.1" 6 | @nodes [@node1, @node2] 7 | @worker_count 10 8 | 9 | setup do 10 | on_exit(fn -> 11 | for {_name, pid} <- get_registry(@node1), do: shutdown(pid) 12 | end) 13 | 14 | :ok 15 | end 16 | 17 | test "correct redistribution of processes" do 18 | group_name = :group1 19 | 20 | for n <- 1..@worker_count do 21 | {_, {:ok, _}} = spawn_worker(@node1, {:worker, n}, group_name) 22 | end 23 | 24 | # wait for process registration 25 | Process.sleep(1000) 26 | 27 | node1_registry = get_registry(@node1) 28 | node2_registry = get_registry(@node2) 29 | 30 | # each node should have all workers in their registry 31 | assert length(node1_registry) == @worker_count 32 | assert length(node2_registry) == @worker_count 33 | 34 | assert length(get_group_members(@node1, group_name)) == @worker_count 35 | assert length(get_group_members(@node2, group_name)) == @worker_count 36 | 37 | assert length(workers_for(@node1)) < @worker_count 38 | assert length(workers_for(@node2)) < @worker_count 39 | 40 | assert length(members_for(@node1, group_name)) < @worker_count 41 | assert length(members_for(@node2, group_name)) < @worker_count 42 | 43 | # netsplit 44 | disconnect(@node1, from: @node2) 45 | 46 | # wait for process redistribution 47 | Process.sleep(1000) 48 | 49 | ## check to see if the processes were migrated as expected 50 | assert length(workers_for(@node1)) == @worker_count 51 | assert length(workers_for(@node2)) == @worker_count 52 | 53 | assert length(get_group_members(@node1, group_name)) == @worker_count 54 | assert length(get_group_members(@node2, group_name)) == @worker_count 55 | 56 | # restore the cluster 57 | connect(@node1, to: @node2) 58 | 59 | # give time to sync 60 | Process.sleep(1000) 61 | 62 | # make sure processes are back in the correct place 63 | assert length(workers_for(@node1)) < @worker_count 64 | assert length(workers_for(@node2)) < @worker_count 65 | 66 | assert length(get_group_members(@node1, group_name)) == @worker_count 67 | assert length(get_group_members(@node2, group_name)) == @worker_count 68 | 69 | assert length(members_for(@node1, group_name)) < @worker_count 70 | assert length(members_for(@node2, group_name)) < @worker_count 71 | end 72 | 73 | test "redistribute already started process" do 74 | {_, {:ok, pid1}} = spawn_restart_worker(@node1, {:worker, 1}) 75 | {_, {:ok, pid2}} = spawn_restart_worker(@node1, {:worker, 2}) 76 | 77 | Enum.each(@nodes, fn node -> 78 | assert ordered_registry(node) == [{{:worker, 1}, pid1}, {{:worker, 2}, pid2}] 79 | end) 80 | 81 | # netsplit 82 | simulate_disconnect(@node1, @node2) 83 | 84 | # wait for process redistribution 85 | Process.sleep(1_000) 86 | 87 | # both worker processes should be running on each node 88 | assert whereis_name(@node1, {:worker, 1}) != whereis_name(@node2, {:worker, 1}) 89 | assert whereis_name(@node1, {:worker, 2}) != whereis_name(@node2, {:worker, 2}) 90 | 91 | Enum.each(@nodes, fn node -> 92 | # both nodes should be aware of two 
workers 93 | assert node |> get_registry() |> length() == 2 94 | end) 95 | 96 | # restore the cluster 97 | simulate_reconnect(@node1, @node2) 98 | 99 | # give time to sync 100 | Process.sleep(1_000) 101 | 102 | pid1 = whereis_name(@node1, {:worker, 1}) 103 | pid2 = whereis_name(@node1, {:worker, 2}) 104 | 105 | Enum.each(@nodes, fn node -> 106 | assert ordered_registry(node) == [{{:worker, 1}, pid1}, {{:worker, 2}, pid2}] 107 | end) 108 | 109 | shutdown(pid1) 110 | shutdown(pid2) 111 | 112 | # give time to sync 113 | Process.sleep(1_000) 114 | end 115 | 116 | test "don't attempt to redistribute processes started with `Swarm.register_name/2`" do 117 | name = {:agent, 1} 118 | {_, :yes} = spawn_agent(@node1, name, []) 119 | 120 | assert [{^name, pid}] = get_registry(@node1) 121 | 122 | # another node joins cluster 123 | Swarm.Cluster.spawn_node(:"node3@127.0.0.1") 124 | 125 | # ensure process still running on node1 126 | assert whereis_name(@node1, name) == pid 127 | 128 | shutdown(pid) 129 | Swarm.Cluster.stop_node(:"node3@127.0.0.1") 130 | end 131 | 132 | test "remove processes started with `Swarm.register_name/2` when hosting node goes down" do 133 | name = {:agent, 1} 134 | {_, :yes} = spawn_agent(@node1, name, []) 135 | 136 | # give time to sync 137 | Process.sleep(1_000) 138 | 139 | assert [{^name, pid}] = get_registry(@node1) 140 | assert [{^name, ^pid}] = get_registry(@node2) 141 | 142 | # stop agent process 143 | shutdown(pid) 144 | 145 | # ensure process is removed from node registries 146 | assert whereis_name(@node1, name) == :undefined 147 | assert whereis_name(@node2, name) == :undefined 148 | end 149 | 150 | defp disconnect(node, opts) do 151 | from = Keyword.fetch!(opts, :from) 152 | :rpc.call(node, Node, :disconnect, [from]) 153 | end 154 | 155 | defp connect(node, opts) do 156 | to = Keyword.fetch!(opts, :to) 157 | :rpc.call(node, Node, :connect, [to]) 158 | end 159 | 160 | defp get_registry(node) do 161 | :rpc.call(node, Swarm, :registered, [], :infinity) 162 | end 163 | 164 | defp whereis_name(node, name) do 165 | :rpc.call(node, Swarm, :whereis_name, [name], :infinity) 166 | end 167 | 168 | defp ordered_registry(node) do 169 | node 170 | |> get_registry() 171 | |> Enum.sort_by(fn {name, _pid} -> name end) 172 | end 173 | 174 | # simulate a disconnect between two nodes 175 | defp simulate_disconnect(lnode, rnode) do 176 | spawn(fn -> send({Swarm.Tracker, lnode}, {:nodedown, rnode, nil}) end) 177 | spawn(fn -> send({Swarm.Tracker, rnode}, {:nodedown, lnode, nil}) end) 178 | end 179 | 180 | # simulate a reconnect between two nodes 181 | defp simulate_reconnect(lnode, rnode) do 182 | spawn(fn -> send({Swarm.Tracker, lnode}, {:nodeup, rnode, nil}) end) 183 | spawn(fn -> send({Swarm.Tracker, rnode}, {:nodeup, lnode, nil}) end) 184 | end 185 | 186 | defp workers_for(node) do 187 | node 188 | |> get_registry() 189 | |> Enum.filter(fn {_, pid} -> node(pid) == node end) 190 | end 191 | 192 | defp get_group_members(node, group_name) do 193 | :rpc.call(node, Swarm, :members, [group_name]) 194 | end 195 | 196 | defp members_for(node, group_name) do 197 | node 198 | |> get_group_members(group_name) 199 | |> Enum.filter(fn pid -> node(pid) == node end) 200 | end 201 | 202 | def shutdown(nil), do: :ok 203 | 204 | def shutdown(pid) when is_pid(pid) do 205 | ref = Process.monitor(pid) 206 | 207 | Process.unlink(pid) 208 | Process.exit(pid, :shutdown) 209 | 210 | assert_receive {:DOWN, ^ref, _, _, _} 211 | end 212 | end 213 | 
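# Reference sketch (not executed by this suite): the two registration styles
# exercised above, written out directly instead of through the NodeCase helpers.
# `MyApp.Worker` is test/support/example_worker.ex; the names and group are arbitrary.
#
#     # MFA-based registration: Swarm controls placement, hand-off and restarts
#     {:ok, pid} = Swarm.register_name({:worker, 1}, MyApp.Worker, :start_link, [])
#     Swarm.join(:group1, pid)
#
#     # pid-based registration: the caller owns the process, so Swarm does not
#     # hand it off or restart it on topology changes (see the last two tests)
#     {:ok, agent} = Agent.start_link(fn -> :ok end)
#     :yes = Swarm.register_name({:agent, 1}, agent)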
-------------------------------------------------------------------------------- /test/quorum_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.QuorumTests do 2 | use ExUnit.Case, async: false 3 | 4 | @moduletag :capture_log 5 | 6 | @node1 :"node1@127.0.0.1" 7 | @node2 :"node2@127.0.0.1" 8 | @node3 :"node3@127.0.0.1" 9 | @node4 :"node4@127.0.0.1" 10 | @node5 :"node5@127.0.0.1" 11 | @nodes [@node1, @node2, @node3, @node4, @node5] 12 | @names [{:test, 1}, {:test, 2}, {:test, 3}, {:test, 4}, {:test, 5}] 13 | 14 | alias Swarm.Cluster 15 | alias Swarm.Distribution.{Ring, StaticQuorumRing} 16 | 17 | setup_all do 18 | :rand.seed(:exs64) 19 | 20 | Application.put_env(:swarm, :static_quorum_size, 3) 21 | restart_cluster_using_strategy(StaticQuorumRing, []) 22 | 23 | MyApp.WorkerSup.start_link() 24 | 25 | on_exit(fn -> 26 | Application.delete_env(:swarm, :static_quorum_size) 27 | 28 | nodes = Application.get_env(:swarm, :nodes, []) 29 | restart_cluster_using_strategy(Ring, nodes) 30 | end) 31 | 32 | :ok 33 | end 34 | 35 | setup do 36 | on_exit(fn -> 37 | # stop any started nodes after each test 38 | @nodes 39 | |> Enum.map(&Task.async(fn -> Cluster.stop_node(&1) end)) 40 | |> Enum.map(&Task.await(&1, 30_000)) 41 | end) 42 | end 43 | 44 | describe "without quorum cluster" do 45 | setup [:form_two_node_cluster] 46 | 47 | test "should error on name registration" do 48 | assert {:error, :no_node_available} = 49 | register_name(@node1, {:test, 1}, MyApp.WorkerSup, :register, []) 50 | 51 | Enum.each([@node1, @node2], fn node -> 52 | assert whereis_name(node, {:test, 1}) == :undefined 53 | assert get_registry(node) == [] 54 | end) 55 | end 56 | 57 | test "should optionally timeout a track call" do 58 | case register_name(@node1, {:test, 1}, MyApp.WorkerSup, :register, [], 0) do 59 | {:error, {:EXIT, {:timeout, _}}} -> :ok 60 | reply -> flunk("expected timeout, instead received: #{inspect(reply)}") 61 | end 62 | end 63 | end 64 | 65 | describe "with quorum cluster" do 66 | setup [:form_three_node_cluster] 67 | 68 | test "should immediately start registered process" do 69 | assert {:ok, _pid} = register_name(@node1, {:test, 1}, MyApp.WorkerSup, :register, []) 70 | assert :ok = unregister_name(@node1, {:test, 1}) 71 | end 72 | 73 | test "should kill process after topology change results in too few nodes to host" do 74 | {:ok, pid} = register_name(@node1, {:test, 1}, MyApp.WorkerSup, :register, []) 75 | 76 | ref = Process.monitor(pid) 77 | 78 | # stopping a node means not enough nodes for a quorum, running processes must be stopped 79 | Cluster.stop_node(@node3) 80 | 81 | assert_receive {:DOWN, ^ref, _, _, _} 82 | 83 | :timer.sleep(1_000) 84 | 85 | Enum.each([@node1, @node2], fn node -> 86 | assert whereis_name(node, {:test, 1}) == :undefined 87 | assert get_registry(node) == [] 88 | end) 89 | end 90 | 91 | test "should kill all processes after topology change results in too few nodes to host" do 92 | refs = start_named_processes() 93 | 94 | :timer.sleep(1_000) 95 | 96 | # stopping one node means not enough nodes for a quorum, running processes must be stopped 97 | Cluster.stop_node(@node3) 98 | 99 | :timer.sleep(1_000) 100 | 101 | # ensure all processes have been stopped 102 | Enum.each(refs, fn ref -> 103 | assert_receive {:DOWN, ^ref, _, _, _} 104 | end) 105 | 106 | # ensure all nodes have an empty process registry 107 | Enum.each(@names, fn name -> 108 | assert whereis_name(@node1, name) == :undefined 109 | assert whereis_name(@node2, name) == 
:undefined 110 | assert get_registry(@node2) == [] 111 | assert get_registry(@node2) == [] 112 | end) 113 | end 114 | 115 | test "should unregister name" do 116 | {:ok, _pid} = register_name(@node1, {:test, 1}, MyApp.WorkerSup, :register, []) 117 | 118 | :timer.sleep(1_000) 119 | 120 | assert :ok = unregister_name(@node1, {:test, 1}) 121 | 122 | :timer.sleep(1_000) 123 | 124 | Enum.each([@node1, @node2, @node3], fn node -> 125 | assert whereis_name(node, {:test, 1}) == :undefined 126 | assert get_registry(node) == [] 127 | end) 128 | end 129 | end 130 | 131 | describe "net split" do 132 | setup [:form_five_node_cluster] 133 | 134 | test "should redistribute processes from smaller to larger partition" do 135 | # start worker for each name 136 | Enum.each(@names, fn name -> 137 | {:ok, _pid} = register_name(@node1, name, MyApp.WorkerSup, :register, []) 138 | end) 139 | 140 | :timer.sleep(1_000) 141 | 142 | # simulate net split (1, 2, 3) and (4, 5) 143 | simulate_disconnect([@node1, @node2, @node3], [@node4, @node5]) 144 | 145 | :timer.sleep(1_000) 146 | 147 | # ensure processes are redistributed onto nodes 1, 2, or 3 (quorum) 148 | @names 149 | |> Enum.map(&whereis_name(@node1, &1)) 150 | |> Enum.each(fn pid -> 151 | refute pid == :undefined 152 | 153 | case node(pid) do 154 | @node4 -> flunk("process still running on node4") 155 | @node5 -> flunk("process still running on node5") 156 | _ -> :ok 157 | end 158 | end) 159 | 160 | Enum.each(@names, fn name -> 161 | assert :ok = unregister_name(@node1, name) 162 | end) 163 | end 164 | 165 | # simulate a disconnect between the two node partitions 166 | defp simulate_disconnect(lpartition, rpartition) do 167 | Enum.each(lpartition, fn lnode -> 168 | Enum.each(rpartition, fn rnode -> 169 | send({Swarm.Tracker, lnode}, {:nodedown, rnode, nil}) 170 | send({Swarm.Tracker, rnode}, {:nodedown, lnode, nil}) 171 | end) 172 | end) 173 | end 174 | end 175 | 176 | defp get_registry(node) do 177 | :rpc.call(node, Swarm, :registered, [], :infinity) 178 | end 179 | 180 | defp register_name(node, name, m, f, a, timeout \\ :infinity) 181 | 182 | defp register_name(node, name, m, f, a, timeout) do 183 | case :rpc.call(node, Swarm, :register_name, [name, m, f, a, timeout], :infinity) do 184 | {:badrpc, reason} -> {:error, reason} 185 | reply -> reply 186 | end 187 | end 188 | 189 | defp unregister_name(node, name) do 190 | :rpc.call(node, Swarm, :unregister_name, [name], :infinity) 191 | end 192 | 193 | defp whereis_name(node, name) do 194 | :rpc.call(node, Swarm, :whereis_name, [name], :infinity) 195 | end 196 | 197 | defp form_two_node_cluster(_context) do 198 | with {:ok, _node1} <- Cluster.spawn_node(@node1), 199 | {:ok, _node2} <- Cluster.spawn_node(@node2) do 200 | :ok 201 | end 202 | end 203 | 204 | defp form_three_node_cluster(_context) do 205 | with {:ok, _node1} <- Cluster.spawn_node(@node1), 206 | {:ok, _node2} <- Cluster.spawn_node(@node2), 207 | {:ok, _node3} <- Cluster.spawn_node(@node3) do 208 | Process.sleep(2000) 209 | :ok 210 | end 211 | end 212 | 213 | defp form_five_node_cluster(_context) do 214 | with {:ok, _node1} <- Cluster.spawn_node(@node1), 215 | {:ok, _node2} <- Cluster.spawn_node(@node2), 216 | {:ok, _node3} <- Cluster.spawn_node(@node3), 217 | {:ok, _node4} <- Cluster.spawn_node(@node4), 218 | {:ok, _node5} <- Cluster.spawn_node(@node5) do 219 | Process.sleep(2000) 220 | :ok 221 | end 222 | end 223 | 224 | # start worker for each name 225 | def start_named_processes do 226 | Enum.map(@names, fn name -> 227 | with {:ok, pid} <- 
register_name(@node1, name, MyApp.WorkerSup, :register, []) do 228 | Process.monitor(pid) 229 | end 230 | end) 231 | end 232 | 233 | defp restart_cluster_using_strategy(strategy, nodes) do 234 | Cluster.stop() 235 | 236 | Application.put_env(:swarm, :distribution_strategy, strategy) 237 | Application.stop(:swarm) 238 | 239 | Cluster.spawn(nodes) 240 | 241 | Application.ensure_all_started(:swarm) 242 | end 243 | end 244 | -------------------------------------------------------------------------------- /test/registry_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.RegistryTests do 2 | use ExUnit.Case, async: false 3 | 4 | import Swarm.Entry 5 | @moduletag :capture_log 6 | 7 | setup do 8 | :rand.seed(:exs64) 9 | Application.ensure_all_started(:swarm) 10 | on_exit(fn -> Application.stop(:swarm) end) 11 | {:ok, _} = MyApp.WorkerSup.start_link() 12 | :ok 13 | end 14 | 15 | test "register_name/4" do 16 | {:ok, pid1} = Swarm.register_name({:test, 1}, MyApp.WorkerSup, :register, []) 17 | {:ok, pid2} = Swarm.register_name({:test, 2}, MyApp.WorkerSup, :register, []) 18 | 19 | Process.sleep(1_000) 20 | 21 | assert ^pid1 = Swarm.Registry.whereis({:test, 1}) 22 | assert ^pid2 = Swarm.Registry.whereis({:test, 2}) 23 | 24 | all = Swarm.Registry.all() 25 | assert Enum.member?(all, {{:test, 1}, pid1}) 26 | assert Enum.member?(all, {{:test, 2}, pid2}) 27 | 28 | assert entry(name: _, pid: ^pid1, ref: ref1, meta: _, clock: _) = 29 | Swarm.Registry.get_by_name({:test, 1}) 30 | 31 | assert entry(name: _, pid: ^pid2, ref: ref2, meta: _, clock: _) = 32 | Swarm.Registry.get_by_name({:test, 2}) 33 | 34 | assert [entry(name: {:test, 1}, pid: _, ref: _, meta: _, clock: _)] = 35 | Swarm.Registry.get_by_pid(pid1) 36 | 37 | assert [entry(name: {:test, 2}, pid: _, ref: _, meta: _, clock: _)] = 38 | Swarm.Registry.get_by_pid(pid2) 39 | 40 | assert entry(name: _, pid: _, ref: ^ref1, meta: _, clock: _) = 41 | Swarm.Registry.get_by_pid_and_name(pid1, {:test, 1}) 42 | 43 | assert entry(name: _, pid: _, ref: ^ref2, meta: _, clock: _) = 44 | Swarm.Registry.get_by_pid_and_name(pid2, {:test, 2}) 45 | 46 | assert entry(name: _, pid: ^pid1, ref: _, meta: _, clock: _) = Swarm.Registry.get_by_ref(ref1) 47 | assert entry(name: _, pid: ^pid2, ref: _, meta: _, clock: _) = Swarm.Registry.get_by_ref(ref2) 48 | 49 | meta_enum = Swarm.Registry.get_by_meta(:mfa, {MyApp.WorkerSup, :register, []}) 50 | 51 | assert Enum.find( 52 | meta_enum, 53 | &match?(entry(name: _, pid: ^pid1, ref: _, meta: _, clock: _), &1) 54 | ) != nil 55 | 56 | assert Enum.find( 57 | meta_enum, 58 | &match?(entry(name: _, pid: ^pid2, ref: _, meta: _, clock: _), &1) 59 | ) != nil 60 | 61 | assert [entry(name: _, pid: ^pid1, ref: _, meta: _, clock: _)] = 62 | :ets.lookup(:swarm_registry, {:test, 1}) 63 | 64 | assert [entry(name: _, pid: ^pid2, ref: _, meta: _, clock: _)] = 65 | :ets.lookup(:swarm_registry, {:test, 2}) 66 | end 67 | 68 | test "join/2 (joining a group does not create race conditions)" do 69 | # https://github.com/bitwalker/swarm/issues/14 70 | {:ok, pid} = Agent.start_link(fn -> "testing" end) 71 | Swarm.register_name(:agent, pid) 72 | Swarm.join(:agents, pid) 73 | assert [my_agent] = Swarm.members(:agents) 74 | assert "testing" == Agent.get(my_agent, fn s -> s end) 75 | end 76 | 77 | test "whereis_or_register_name/4" do 78 | # lookup test 79 | {:ok, pid3} = Swarm.register_name({:test, 3}, MyApp.WorkerSup, :register, []) 80 | 81 | assert {:ok, ^pid3} = 82 | Swarm.whereis_or_register_name({:test, 
3}, MyApp.WorkerSup, :register, []) 83 | 84 | # transparrent registration 85 | {:ok, pid4} = Swarm.whereis_or_register_name({:test, 4}, MyApp.WorkerSup, :register, []) 86 | assert ^pid4 = Swarm.Registry.whereis({:test, 4}) 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /test/support/cluster.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Cluster do 2 | def spawn(nodes \\ Application.get_env(:swarm, :nodes, [])) do 3 | # Turn node into a distributed node with the given long name 4 | :net_kernel.start([:"primary@127.0.0.1"]) 5 | 6 | # Allow spawned nodes to fetch all code from this node 7 | :erl_boot_server.start([]) 8 | allow_boot(to_charlist("127.0.0.1")) 9 | 10 | case Application.load(:swarm) do 11 | :ok -> :ok 12 | {:error, {:already_loaded, :swarm}} -> :ok 13 | end 14 | 15 | nodes 16 | |> Enum.map(&Task.async(fn -> spawn_node(&1) end)) 17 | |> Enum.map(&Task.await(&1, 30_000)) 18 | end 19 | 20 | def spawn_node(node_host) do 21 | {:ok, node} = :slave.start(to_charlist("127.0.0.1"), node_name(node_host), slave_args()) 22 | add_code_paths(node) 23 | transfer_configuration(node) 24 | ensure_applications_started(node) 25 | {:ok, node} 26 | end 27 | 28 | def stop do 29 | nodes = Node.list(:connected) 30 | 31 | nodes 32 | |> Enum.map(&Task.async(fn -> stop_node(&1) end)) 33 | |> Enum.map(&Task.await(&1, 30_000)) 34 | end 35 | 36 | def stop_node(node) do 37 | :ok = :slave.stop(node) 38 | end 39 | 40 | defp rpc(node, module, fun, args) do 41 | :rpc.block_call(node, module, fun, args) 42 | end 43 | 44 | defp slave_args do 45 | log_level = "-logger level #{Logger.level()}" 46 | to_charlist("-loader inet -hosts 127.0.0.1 -setcookie #{:erlang.get_cookie()} " <> log_level) 47 | end 48 | 49 | defp allow_boot(host) do 50 | {:ok, ipv4} = :inet.parse_ipv4_address(host) 51 | :erl_boot_server.add_slave(ipv4) 52 | end 53 | 54 | defp add_code_paths(node) do 55 | rpc(node, :code, :add_paths, [:code.get_path()]) 56 | end 57 | 58 | @blacklist [~r/^primary@.*$/, ~r/^remsh.*$/, ~r/^.+_upgrader_.+$/, ~r/^.+_maint_.+$/] 59 | 60 | defp transfer_configuration(node) do 61 | for {app_name, _, _} <- Application.loaded_applications() do 62 | for {key, val} <- Application.get_all_env(app_name) do 63 | rpc(node, Application, :put_env, [app_name, key, val]) 64 | end 65 | end 66 | 67 | # Our current node might be blacklisted ourself; overwrite config with default 68 | rpc(node, Application, :put_env, [:swarm, :node_blacklist, @blacklist]) 69 | end 70 | 71 | defp ensure_applications_started(node) do 72 | rpc(node, Application, :ensure_all_started, [:mix]) 73 | rpc(node, Mix, :env, [Mix.env()]) 74 | 75 | for {app_name, _, _} <- Application.loaded_applications() do 76 | rpc(node, Application, :ensure_all_started, [app_name]) 77 | end 78 | 79 | rpc(node, MyApp.WorkerSup, :start_link, []) 80 | end 81 | 82 | defp node_name(node_host) do 83 | node_host 84 | |> to_string 85 | |> String.split("@") 86 | |> Enum.at(0) 87 | |> String.to_atom() 88 | end 89 | end 90 | -------------------------------------------------------------------------------- /test/support/example_sup.ex: -------------------------------------------------------------------------------- 1 | defmodule MyApp.WorkerSup do 2 | use Supervisor 3 | 4 | def start_link() do 5 | Supervisor.start_link(__MODULE__, [], name: __MODULE__) 6 | end 7 | 8 | def init(_) do 9 | children = [ 10 | worker(MyApp.Worker, [], restart: :transient) 11 | ] 12 | 13 | supervise(children, 
strategy: :simple_one_for_one) 14 | end 15 | 16 | def register() do 17 | {:ok, _pid} = Supervisor.start_child(__MODULE__, []) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/support/example_worker.ex: -------------------------------------------------------------------------------- 1 | defmodule MyApp.Worker do 2 | def start_link(), do: GenServer.start_link(__MODULE__, []) 3 | 4 | def init(_name) do 5 | # IO.inspect "starting #{inspect self()} on #{Node.self}" 6 | {:ok, {:rand.uniform(5_000), 0}, 0} 7 | end 8 | 9 | def handle_call({:swarm, :begin_handoff}, _from, {delay, count}) do 10 | {:reply, {:resume, {delay, count}}, {delay, count}} 11 | end 12 | 13 | def handle_call(:ping, _from, state) do 14 | {:reply, {:pong, self()}, state} 15 | end 16 | 17 | def handle_cast({:swarm, :end_handoff, {delay, count}}, {_, _}) do 18 | {:noreply, {delay, count}} 19 | end 20 | 21 | def handle_cast(_, state) do 22 | {:noreply, state} 23 | end 24 | 25 | def handle_info(:timeout, {delay, count}) do 26 | Process.send_after(self(), :timeout, delay) 27 | {:noreply, {delay, count + 1}} 28 | end 29 | 30 | # this message is sent when this process should die 31 | # because it's being moved, use this as an opportunity 32 | # to clean up 33 | def handle_info({:swarm, :die}, state) do 34 | {:stop, :shutdown, state} 35 | end 36 | 37 | def handle_info(_, state), do: {:noreply, state} 38 | 39 | def terminate(_reason, _state) do 40 | # IO.inspect "stopping #{inspect self()} on #{Node.self}" 41 | :ok 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/support/node_case.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.NodeCase do 2 | @timeout 5000 3 | @heartbeat 100 4 | @permdown 1500 5 | 6 | defmacro __using__(_opts) do 7 | quote do 8 | use ExUnit.Case, async: true 9 | import unquote(__MODULE__) 10 | @moduletag :clustered 11 | 12 | @timeout unquote(@timeout) 13 | @heartbeat unquote(@heartbeat) 14 | @permdown unquote(@permdown) 15 | end 16 | end 17 | 18 | def start_tracker(node) do 19 | call_node(node, fn -> 20 | {:ok, result} = Application.ensure_all_started(:swarm) 21 | result 22 | end) 23 | end 24 | 25 | def spawn_worker(node, name, group_name \\ nil) do 26 | call_node(node, fn -> 27 | result = Swarm.register_name(name, MyApp.Worker, :start_link, []) 28 | 29 | case result do 30 | {:ok, pid} when group_name != nil -> Swarm.join(group_name, pid) 31 | end 32 | 33 | result 34 | end) 35 | end 36 | 37 | def spawn_restart_worker(node, name) do 38 | call_node(node, fn -> 39 | Swarm.register_name(name, MyApp.RestartWorker, :start_link, [name]) 40 | end) 41 | end 42 | 43 | def spawn_agent(node, name, initial_state) do 44 | call_node(node, fn -> 45 | {:ok, pid} = Agent.start(fn -> initial_state end) 46 | 47 | Swarm.register_name(name, pid) 48 | end) 49 | end 50 | 51 | def flush() do 52 | receive do 53 | _ -> flush() 54 | after 55 | 0 -> :ok 56 | end 57 | end 58 | 59 | defp call_node(node, func) do 60 | parent = self() 61 | ref = make_ref() 62 | 63 | pid = 64 | Node.spawn_link(node, fn -> 65 | result = func.() 66 | send(parent, {ref, result}) 67 | ref = Process.monitor(parent) 68 | 69 | receive do 70 | {:DOWN, ^ref, :process, _, _} -> :ok 71 | end 72 | end) 73 | 74 | receive do 75 | {^ref, result} -> {pid, result} 76 | after 77 | @timeout -> {pid, {:error, :timeout}} 78 | end 79 | end 80 | end 81 | 
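# Usage sketch only (mirrors test/integration_test.exs; assumes the cluster nodes
# spawned from test_helper.exs are running):
#
#     defmodule MyClusteredTest do
#       use Swarm.NodeCase
#
#       test "registers a worker on a cluster node and joins a group" do
#         {_caller, {:ok, pid}} = spawn_worker(:"node1@127.0.0.1", {:worker, 1}, :group1)
#         assert is_pid(pid)
#       end
#     end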
-------------------------------------------------------------------------------- /test/support/nodes.ex: -------------------------------------------------------------------------------- 1 | defmodule Swarm.Nodes do 2 | alias Porcelain.Process, as: Proc 3 | require Logger 4 | 5 | def start(name, config \\ nil) when is_atom(name) do 6 | Application.ensure_started(:porcelain) 7 | 8 | config_file = 9 | case System.get_env("SWARM_DEBUG") do 10 | "true" -> "sys_debug.config" 11 | _ -> "sys.config" 12 | end 13 | 14 | "swarm_master@" <> hostname = "#{Node.self()}" 15 | node_name = :"#{name}@#{hostname}" 16 | 17 | node_pid = 18 | spawn_link(fn -> 19 | Process.flag(:trap_exit, true) 20 | code_paths = :code.get_path() 21 | 22 | base_args = [ 23 | "-noshell", 24 | "-connect_all false", 25 | "-hidden", 26 | "-sname #{node_name}", 27 | "-setcookie swarm_test", 28 | "-config #{Path.join(__DIR__, config_file)}", 29 | "-eval 'io:format(\"ok\", []).'" 30 | ] 31 | 32 | args = 33 | Enum.reduce(code_paths, Enum.join(base_args, " "), fn path, acc -> 34 | acc <> " -pa #{path}" 35 | end) 36 | 37 | _proc = 38 | %Proc{pid: pid} = 39 | Porcelain.spawn_shell("erl " <> args, in: :receive, out: {:send, self()}) 40 | 41 | :ok = wait_until_started(node_name, pid) 42 | true = :net_kernel.hidden_connect_node(node_name) 43 | receive_loop(node_name, pid) 44 | end) 45 | 46 | :ok = block_until_nodeup(node_pid) 47 | 48 | case config do 49 | nil -> 50 | :ok 51 | 52 | _ -> 53 | for {k, v} <- config do 54 | :rpc.call(node(), Application, :put_env, [:swarm, k, v]) 55 | end 56 | end 57 | 58 | {:ok, _} = :rpc.call(node(), Application, :ensure_all_started, [:elixir]) 59 | {:ok, node_name, node_pid} 60 | end 61 | 62 | def stop(name) when is_atom(name) do 63 | :abcast = :rpc.eval_everywhere([name], :init, :stop, []) 64 | end 65 | 66 | defp block_until_nodeup(pid) do 67 | case GenServer.call(pid, :ready) do 68 | true -> 69 | :ok 70 | 71 | false -> 72 | block_until_nodeup(pid) 73 | end 74 | end 75 | 76 | defp wait_until_started(node_name, pid) do 77 | receive do 78 | {^pid, :data, :out, _data} -> 79 | # IO.inspect {node_name, data} 80 | :ok 81 | 82 | {^pid, :result, %{status: status}} -> 83 | {:error, status} 84 | 85 | {:"$gen_call", from, :ready} -> 86 | GenServer.reply(from, false) 87 | wait_until_started(node_name, pid) 88 | end 89 | end 90 | 91 | defp receive_loop(node_name, pid) do 92 | receive do 93 | {^pid, :data, :out, data} -> 94 | case Application.get_env(:logger, :level, :warn) do 95 | l when l in [:debug, :info] -> 96 | IO.puts("#{node_name} =>\n" <> data) 97 | 98 | _ -> 99 | :ok 100 | end 101 | 102 | receive_loop(node_name, pid) 103 | 104 | {^pid, :result, %{status: status}} -> 105 | IO.inspect({:exit, node_name, status}) 106 | 107 | {:EXIT, parent, reason} when parent == self() -> 108 | Process.exit(pid, reason) 109 | 110 | {:"$gen_call", from, :ready} -> 111 | GenServer.reply(from, true) 112 | receive_loop(node_name, pid) 113 | 114 | :die -> 115 | Process.exit(pid, :normal) 116 | end 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /test/support/restart_worker.ex: -------------------------------------------------------------------------------- 1 | defmodule MyApp.RestartWorker do 2 | @moduledoc false 3 | 4 | # A worker process that requests to be restarted during Swarm hand-off 5 | 6 | def start_link(name), do: GenServer.start_link(__MODULE__, name) 7 | 8 | def init(name), do: {:ok, name} 9 | 10 | def handle_call({:swarm, :begin_handoff}, _from, state) do 11 | 
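    # Replying :restart (rather than {:resume, state} as MyApp.Worker does) asks
    # Swarm to drop the current state and start a fresh process from its registered
    # MFA on the destination node; test/integration_test.exs relies on this behaviour.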
{:reply, :restart, state} 12 | end 13 | 14 | def handle_info({:swarm, :die}, state) do 15 | {:stop, :shutdown, state} 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /test/support/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mix compile 4 | 5 | erl \ 6 | -connect_all false \ 7 | -hidden \ 8 | -sname "$1" \ 9 | -setcookie swarm_test \ 10 | -config ./test/support/sys.config \ 11 | -pa /Users/paulschoenfelder/erlang/19.1/*/ebin \ 12 | -pa /Users/paulschoenfelder/src/github.com/bitwalker/swarm/_build/test/consolidated \ 13 | -pa /Users/paulschoenfelder/src/github.com/bitwalker/swarm/_build/test/lib/*/ebin \ 14 | -user Elixir.IEx.CLI \ 15 | -extra --no-halt +iex 16 | -------------------------------------------------------------------------------- /test/support/sys.config: -------------------------------------------------------------------------------- 1 | [{swarm, [{node_blacklist, [<<"swarm_master@.*">>]}]}, 2 | {logger, [{level, warn}]}]. 3 | -------------------------------------------------------------------------------- /test/support/sys_debug.config: -------------------------------------------------------------------------------- 1 | [{swarm, [{debug, true}, {node_blacklist, [<<"swarm_master@.*">>]}, {debug_opts, [trace]}]}, 2 | {logger, [{level, debug}]}]. 3 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | exclude = Keyword.get(ExUnit.configuration(), :exclude, []) 2 | 3 | unless :clustered in exclude do 4 | Swarm.Cluster.spawn() 5 | end 6 | 7 | ExUnit.start() 8 | -------------------------------------------------------------------------------- /test/tracker_replica_event_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.TrackerReplicaEventTests do 2 | use ExUnit.Case, async: false 3 | 4 | import Swarm.Entry 5 | alias Swarm.IntervalTreeClock, as: Clock 6 | alias Swarm.Registry, as: Registry 7 | 8 | @moduletag :capture_log 9 | 10 | setup_all do 11 | :rand.seed(:exs64) 12 | Application.ensure_all_started(:swarm) 13 | {:ok, _} = MyApp.WorkerSup.start_link() 14 | :ok 15 | end 16 | 17 | setup do 18 | :ets.delete_all_objects(:swarm_registry) 19 | 20 | {:ok, pid} = MyApp.WorkerSup.register() 21 | meta = %{mfa: {MyApp.WorkerSup, :register, []}} 22 | name = :rand.uniform() 23 | {lclock, rclock} = Clock.fork(Clock.seed()) 24 | 25 | [name: name, pid: pid, meta: meta, lclock: lclock, rclock: rclock] 26 | end 27 | 28 | test "handle_replica_event :track should add registration", %{ 29 | name: name, 30 | pid: pid, 31 | meta: meta, 32 | rclock: rclock 33 | } do 34 | send_replica_event(rclock, {:track, name, pid, meta}) 35 | 36 | assert ^pid = Registry.whereis(name) 37 | assert Enum.member?(Registry.all(), {name, pid}) 38 | assert entry(name: _, pid: ^pid, ref: ref, meta: ^meta, clock: _) = Registry.get_by_name(name) 39 | assert [entry(name: ^name, pid: _, ref: _, meta: _, clock: _)] = Registry.get_by_pid(pid) 40 | 41 | assert entry(name: _, pid: _, ref: ^ref, meta: _, clock: _) = 42 | Registry.get_by_pid_and_name(pid, name) 43 | 44 | assert entry(name: _, pid: ^pid, ref: _, meta: _, clock: _) = Registry.get_by_ref(ref) 45 | 46 | assert [entry(name: _, pid: ^pid, ref: _, meta: _, clock: _)] = 47 | Registry.get_by_meta(:mfa, {MyApp.WorkerSup, :register, []}) 48 | 49 | assert 
[entry(name: _, pid: ^pid, ref: _, meta: _, clock: _)] = 50 | :ets.lookup(:swarm_registry, name) 51 | end 52 | 53 | test "handle_replica_event :track with existing registration should ignore the event", %{ 54 | name: name, 55 | pid: pid, 56 | meta: meta, 57 | lclock: lclock, 58 | rclock: rclock 59 | } do 60 | send_replica_event(lclock, {:track, name, pid, meta}) 61 | 62 | send_replica_event(rclock, {:track, name, pid, meta}) 63 | 64 | assert entry(name: _, pid: ^pid, ref: _, meta: _, clock: _) = Registry.get_by_name(name) 65 | end 66 | 67 | test "handle_replica_event :track with conflicting metadata and remote clock dominates should update the metadata", 68 | %{name: name, pid: pid, meta: meta, lclock: lclock, rclock: rclock} do 69 | send_replica_event(lclock, {:track, name, pid, meta}) 70 | 71 | rclock = Clock.event(rclock) 72 | remote_meta = %{other: "meta"} 73 | send_replica_event(rclock, {:track, name, pid, remote_meta}) 74 | 75 | assert entry(name: _, pid: ^pid, ref: _, meta: ^remote_meta, clock: _) = 76 | Registry.get_by_name(name) 77 | end 78 | 79 | test "handle_replica_event :track with conflicting metadata and local clock dominates should keep the metadata", 80 | %{name: name, pid: pid, meta: meta, lclock: lclock, rclock: rclock} do 81 | lclock = Clock.event(lclock) 82 | send_replica_event(lclock, {:track, name, pid, meta}) 83 | 84 | remote_meta = %{other: "meta"} 85 | send_replica_event(rclock, {:track, name, pid, remote_meta}) 86 | 87 | assert entry(name: _, pid: ^pid, ref: _, meta: ^meta, clock: _) = Registry.get_by_name(name) 88 | end 89 | 90 | test "handle_replica_event :track with conflicting pid and remote clock dominates should kill the locally registered pid", 91 | %{name: name, pid: pid, meta: meta, lclock: lclock, rclock: rclock} do 92 | send_replica_event(lclock, {:track, name, pid, meta}) 93 | 94 | rclock = Clock.event(rclock) 95 | {:ok, other_pid} = MyApp.WorkerSup.register() 96 | send_replica_event(rclock, {:track, name, other_pid, meta}) 97 | 98 | assert entry(name: _, pid: ^other_pid, ref: _, meta: ^meta, clock: _) = 99 | Registry.get_by_name(name) 100 | 101 | refute Process.alive?(pid) 102 | end 103 | 104 | test "handle_replica_event :track with conflicting pid and local clock dominates should ignore the event", 105 | %{name: name, pid: pid, meta: meta, lclock: lclock, rclock: rclock} do 106 | 107 | lclock = Clock.event(lclock) 108 | send_replica_event(lclock, {:track, name, pid, meta}) 109 | 110 | {:ok, other_pid} = MyApp.WorkerSup.register() 111 | send_replica_event(rclock, {:track, name, other_pid, meta}) 112 | 113 | assert entry(name: _, pid: ^pid, ref: _, meta: ^meta, clock: _) = Registry.get_by_name(name) 114 | assert Process.alive?(pid) 115 | end 116 | 117 | test "handle_replica_event :untrack when remote clock dominates should remove registration", %{ 118 | name: name, 119 | pid: pid, 120 | meta: meta, 121 | lclock: lclock, 122 | rclock: rclock 123 | } do 124 | send_replica_event(lclock, {:track, name, pid, meta}) 125 | 126 | rclock = Clock.event(rclock) 127 | send_replica_event(rclock, {:untrack, pid}) 128 | 129 | assert :undefined = Registry.get_by_name(name) 130 | end 131 | 132 | test "handle_replica_event :untrack when local clock dominates should ignore event", %{ 133 | name: name, 134 | pid: pid, 135 | meta: meta, 136 | lclock: lclock, 137 | rclock: rclock 138 | } do 139 | lclock = Clock.event(lclock) 140 | send_replica_event(lclock, {:track, name, pid, meta}) 141 | 142 | send_replica_event(rclock, {:untrack, pid}) 143 | 144 | assert entry(name: _, 
pid: ^pid, ref: _, meta: ^meta, clock: _) = Registry.get_by_name(name) 145 | end 146 | 147 | test "handle_replica_event :untrack for unknown pid should ignore the event", %{ 148 | name: name, 149 | pid: pid, 150 | meta: meta, 151 | lclock: lclock, 152 | rclock: rclock 153 | } do 154 | send_replica_event(lclock, {:track, name, pid, meta}) 155 | 156 | {:ok, other_pid} = MyApp.WorkerSup.register() 157 | send_replica_event(rclock, {:untrack, other_pid}) 158 | 159 | assert entry(name: _, pid: ^pid, ref: _, meta: ^meta, clock: _) = Registry.get_by_name(name) 160 | end 161 | 162 | test "handle_replica_event :update_meta for unknown pid should ignore the event", %{ 163 | name: name, 164 | pid: pid, 165 | meta: meta, 166 | lclock: lclock, 167 | rclock: rclock 168 | } do 169 | send_replica_event(lclock, {:track, name, pid, meta}) 170 | 171 | {:ok, other_pid} = MyApp.WorkerSup.register() 172 | other_meta = %{other: "meta"} 173 | send_replica_event(rclock, {:update_meta, other_meta, other_pid}) 174 | 175 | assert entry(name: _, pid: ^pid, ref: _, meta: ^meta, clock: _) = Registry.get_by_name(name) 176 | assert :undefined = Registry.get_by_pid(other_pid) 177 | end 178 | 179 | test "handle_replica_event :update_meta when remote dominates should update the registry", %{ 180 | name: name, 181 | pid: pid, 182 | meta: meta, 183 | lclock: lclock, 184 | rclock: rclock 185 | } do 186 | send_replica_event(lclock, {:track, name, pid, meta}) 187 | 188 | rclock = Clock.event(rclock) 189 | new_meta = %{other: "meta"} 190 | send_replica_event(rclock, {:update_meta, new_meta, pid}) 191 | 192 | assert entry(name: _, pid: ^pid, ref: _, meta: ^new_meta, clock: _) = 193 | Registry.get_by_name(name) 194 | end 195 | 196 | test "handle_replica_event :update_meta when local dominates should ignore the event", %{ 197 | name: name, 198 | pid: pid, 199 | meta: meta, 200 | lclock: lclock, 201 | rclock: rclock 202 | } do 203 | lclock = Clock.event(lclock) 204 | send_replica_event(lclock, {:track, name, pid, meta}) 205 | 206 | new_meta = %{other: "meta"} 207 | send_replica_event(rclock, {:update_meta, new_meta, pid}) 208 | 209 | assert entry(name: _, pid: ^pid, ref: _, meta: ^meta, clock: _) = Registry.get_by_name(name) 210 | end 211 | 212 | test "handle_replica_event :update_meta when conflict should merge the meta data", %{ 213 | name: name, 214 | pid: pid, 215 | meta: meta, 216 | lclock: lclock, 217 | rclock: rclock 218 | } do 219 | lclock = Clock.event(lclock) 220 | send_replica_event(lclock, {:track, name, pid, meta}) 221 | 222 | rclock = Clock.event(rclock) 223 | new_meta = %{other: "meta"} 224 | send_replica_event(rclock, {:update_meta, new_meta, pid}) 225 | 226 | assert entry(name: _, pid: ^pid, ref: _, meta: updated_meta, clock: _) = 227 | Registry.get_by_name(name) 228 | 229 | assert updated_meta.mfa == {MyApp.WorkerSup, :register, []} 230 | assert updated_meta.other == "meta" 231 | end 232 | 233 | defp send_replica_event(clock, message) do 234 | send(Swarm.Tracker, {:event, self(), Clock.peek(clock), message}) 235 | # get_state to wait for the replication event to be processed 236 | :sys.get_state(Swarm.Tracker) 237 | end 238 | end 239 | -------------------------------------------------------------------------------- /test/tracker_sync_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Swarm.TrackerSyncTests do 2 | use ExUnit.Case, async: false 3 | 4 | import Swarm.Entry 5 | alias Swarm.IntervalTreeClock, as: Clock 6 | alias Swarm.Registry, as: Registry 7 | 8 | 
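  # These tests drive Swarm.Tracker's sync handshake directly: send_sync_request/2
  # below casts :sync and :sync_ack to the tracker, then uses :sys.get_state/1 to
  # wait until the exchange has been processed.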
@moduletag :capture_log 9 | 10 | setup_all do 11 | Application.put_env(:swarm, :node_whitelist, [~r/primary@/]) 12 | {:ok, _} = Application.ensure_all_started(:swarm) 13 | 14 | on_exit(fn -> 15 | Application.put_env(:swarm, :node_whitelist, []) 16 | end) 17 | 18 | :rand.seed(:exs64) 19 | 20 | {:ok, _} = MyApp.WorkerSup.start_link() 21 | :ok 22 | end 23 | 24 | setup do 25 | {:ok, pid} = MyApp.WorkerSup.register() 26 | meta = %{mfa: {MyApp.WorkerSup, :register, []}} 27 | name = random_name() 28 | 29 | {lclock, rclock} = Clock.fork(Clock.seed()) 30 | send_sync_request(lclock, []) 31 | 32 | entry(name: _, pid: _, ref: _, meta: _, clock: lclock) = call_track(name, pid, meta) 33 | 34 | # fake handle_replica_event which joins the clocks so that they are in sync 35 | rclock = Clock.join(rclock, Clock.peek(lclock)) 36 | 37 | [name: name, pid: pid, meta: meta, lclock: lclock, rclock: rclock] 38 | end 39 | 40 | test ":sync should set sync node and preserve node clock", %{rclock: rclock} do 41 | {_, state_before} = :sys.get_state(Swarm.Tracker) 42 | GenServer.cast(Swarm.Tracker, {:sync, self(), rclock}) 43 | {_, state_after} = :sys.get_state(Swarm.Tracker) 44 | 45 | assert state_after.sync_node == :"primary@127.0.0.1" 46 | assert state_after.sync_ref 47 | assert state_after.clock == state_before.clock 48 | end 49 | 50 | test ":sync should add missing registration", %{pid: pid, meta: meta, rclock: rclock} do 51 | name = random_name() 52 | 53 | remote_registry = [ 54 | entry(name: name, pid: pid, ref: nil, meta: meta, clock: Clock.peek(rclock)) 55 | ] 56 | 57 | send_sync_request(rclock, remote_registry) 58 | 59 | assert entry(name: ^name, pid: ^pid, ref: _, meta: ^meta, clock: _) = 60 | Registry.get_by_name(name) 61 | end 62 | 63 | test ":sync with same pid and remote clock dominates should update the meta", %{ 64 | name: name, 65 | pid: pid, 66 | rclock: rclock 67 | } do 68 | rclock = Clock.event(rclock) 69 | remote_meta = %{new: "meta"} 70 | 71 | remote_registry = [ 72 | entry(name: name, pid: pid, ref: nil, meta: remote_meta, clock: Clock.peek(rclock)) 73 | ] 74 | 75 | send_sync_request(rclock, remote_registry) 76 | 77 | assert entry(name: ^name, pid: ^pid, ref: _, meta: ^remote_meta, clock: _) = 78 | Registry.get_by_name(name) 79 | end 80 | 81 | test ":sync with same pid and local clock dominates should ignore entry", %{ 82 | name: name, 83 | pid: pid, 84 | rclock: rclock 85 | } do 86 | Swarm.Tracker.add_meta(:new_local, "meta_local", pid) 87 | 88 | remote_registry = [ 89 | entry( 90 | name: name, 91 | pid: pid, 92 | ref: nil, 93 | meta: %{new_remote: "remote_meta"}, 94 | clock: Clock.peek(rclock) 95 | ) 96 | ] 97 | 98 | send_sync_request(rclock, remote_registry) 99 | 100 | local_meta = %{mfa: {MyApp.WorkerSup, :register, []}, new_local: "meta_local"} 101 | 102 | assert entry(name: ^name, pid: ^pid, ref: _, meta: ^local_meta, clock: _) = 103 | Registry.get_by_name(name) 104 | end 105 | 106 | test ":sync with same pid and clock should ignore entry", %{ 107 | name: name, 108 | pid: pid, 109 | meta: meta, 110 | rclock: rclock 111 | } do 112 | remote_registry = [ 113 | entry(name: name, pid: pid, ref: nil, meta: %{remote: "meta"}, clock: Clock.peek(rclock)) 114 | ] 115 | 116 | send_sync_request(rclock, remote_registry) 117 | 118 | assert entry(name: ^name, pid: ^pid, ref: _, meta: ^meta, clock: _) = 119 | Registry.get_by_name(name) 120 | end 121 | 122 | test ":sync with same pid and concurrent changes should merge data", %{ 123 | name: name, 124 | pid: pid, 125 | rclock: rclock 126 | } do 127 | 
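    # Make one concurrent change on each replica: a local meta update here and a
    # remote clock event below, so the tracker must merge both sides of the entry.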
128 |     rclock = Clock.event(rclock)
129 |
130 |     remote_registry = [
131 |       entry(
132 |         name: name,
133 |         pid: pid,
134 |         ref: nil,
135 |         meta: %{new_remote: "remote_meta"},
136 |         clock: Clock.peek(Clock.event(rclock))
137 |       )
138 |     ]
139 |
140 |     send_sync_request(rclock, remote_registry)
141 |
142 |     merged_meta = %{
143 |       mfa: {MyApp.WorkerSup, :register, []},
144 |       new_local: "meta_local",
145 |       new_remote: "remote_meta"
146 |     }
147 |
148 |     assert entry(name: ^name, pid: ^pid, ref: _, meta: ^merged_meta, clock: _) =
149 |              Registry.get_by_name(name)
150 |   end
151 |
152 |   test ":sync with different pid and local clock dominates should kill remote pid", %{
153 |     name: name,
154 |     pid: pid,
155 |     meta: meta,
156 |     rclock: rclock
157 |   } do
158 |     Swarm.Tracker.remove_meta(:mfa, pid)
159 |
160 |     {:ok, remote_pid} = MyApp.WorkerSup.register()
161 |
162 |     remote_registry = [
163 |       entry(name: name, pid: remote_pid, ref: nil, meta: meta, clock: Clock.peek(rclock))
164 |     ]
165 |
166 |     send_sync_request(rclock, remote_registry)
167 |
168 |     assert_process_alive?(true, pid)
169 |     assert_process_alive?(false, remote_pid)
170 |   end
171 |
172 |   test ":sync with different pid and remote clock dominates should kill local pid", %{
173 |     name: name,
174 |     pid: pid,
175 |     meta: meta,
176 |     rclock: rclock
177 |   } do
178 |     {:ok, remote_pid} = MyApp.WorkerSup.register()
179 |     rclock = Clock.event(rclock)
180 |
181 |     remote_registry = [
182 |       entry(name: name, pid: remote_pid, ref: nil, meta: meta, clock: Clock.peek(rclock))
183 |     ]
184 |
185 |     send_sync_request(rclock, remote_registry)
186 |
187 |     assert_process_alive?(false, pid)
188 |     assert_process_alive?(true, remote_pid)
189 |   end
190 |
191 |   test ":sync_recv should discard pending sync request from sync_node", %{
192 |     name: name,
193 |     meta: meta,
194 |     rclock: rclock
195 |   } do
196 |     {:ok, remote_pid} = MyApp.WorkerSup.register()
197 |     rclock = Clock.event(rclock)
198 |
199 |     remote_registry = [
200 |       entry(name: name, pid: remote_pid, ref: nil, meta: meta, clock: Clock.peek(rclock))
201 |     ]
202 |
203 |     tracker_data = %{
204 |       elem(:sys.get_state(Swarm.Tracker), 1)
205 |       | sync_node: node(self()),
206 |         pending_sync_reqs: [self()]
207 |     }
208 |
209 |     :sys.replace_state(Swarm.Tracker, fn _ -> {:syncing, tracker_data} end)
210 |
211 |     GenServer.cast(Swarm.Tracker, {:sync_recv, self(), rclock, remote_registry})
212 |
213 |     tracker_pid = Process.whereis(Swarm.Tracker)
214 |     assert_receive({:"$gen_cast", {:sync_ack, ^tracker_pid, _, _}})
215 |     refute_receive({:"$gen_cast", {:sync_recv, ^tracker_pid, _, _}})
216 |
217 |     assert elem(:sys.get_state(Swarm.Tracker), 0) == :tracking
218 |   end
219 |
220 |   defp call_track(name, pid, meta) do
221 |     Swarm.Tracker.track(name, pid)
222 |
223 |     Enum.each(meta, fn {k, v} ->
224 |       Swarm.Tracker.add_meta(k, v, pid)
225 |     end)
226 |
227 |     Registry.get_by_name(name)
228 |   end
229 |
230 |   defp send_sync_request(clock, registry) do
231 |     GenServer.cast(Swarm.Tracker, {:sync, self(), clock})
232 |     GenServer.cast(Swarm.Tracker, {:sync_ack, self(), clock, registry})
233 |
234 |     # get_state to wait for the sync to be completed
235 |     :sys.get_state(Swarm.Tracker)
236 |
237 |     # flush all messages from the sync process
238 |     receive do
239 |       _ -> :ok
240 |     after
241 |       0 -> :ok
242 |     end
243 |   end
244 |
245 |   defp random_name() do
246 |     :rand.uniform()
247 |   end
248 |
249 |   defp assert_process_alive?(alive, pid, tries \\ 10)
250 |
251 |   defp assert_process_alive?(alive, pid, 0) do
252 |     assert Process.alive?(pid) == alive
253 |   end
254 |
255 |   defp assert_process_alive?(alive, pid, tries) do
256 |     if Process.alive?(pid) == alive do
257 |       alive
258 |     else
259 |       # get_state to drain the message queue and retry...
260 |       :sys.get_state(Swarm.Tracker)
261 |       assert_process_alive?(alive, pid, tries - 1)
262 |     end
263 |   end
264 | end
265 |
--------------------------------------------------------------------------------