├── test ├── test.config ├── minishard_test.erl └── minishard_detest.erl ├── elvis ├── .gitignore ├── src ├── minishard.app.src ├── minishard.erl ├── minishard_demo.erl ├── minishard_sup.erl ├── minishard_shard.erl ├── minishard_allocator.erl └── minishard_gen_leader.erl ├── Makefile ├── AUTHORS ├── LICENSE ├── README.md └── elvis.config /test/test.config: -------------------------------------------------------------------------------- 1 | % -*- mode: erlang -*- 2 | [ 3 | ]. 4 | -------------------------------------------------------------------------------- /elvis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yandex/minishard/HEAD/elvis -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | deps/* 2 | ebin 3 | .*.sw? 4 | .erlang.mk* 5 | *.d 6 | erl_crash.dump 7 | test/*.ebin 8 | .detest 9 | log 10 | -------------------------------------------------------------------------------- /src/minishard.app.src: -------------------------------------------------------------------------------- 1 | {application, minishard, [ 2 | {description, ""}, 3 | {vsn, "0.1.0"}, 4 | {id, "git"}, 5 | {modules, []}, 6 | {registered, []}, 7 | {applications, [ 8 | kernel, 9 | stdlib 10 | ]}, 11 | {mod, {minishard, []}}, 12 | {env, []} 13 | ]}. 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = minishard 2 | COMPILE_FIRST = minishard_gen_leader 3 | 4 | NID := 1 5 | SHELL_OPTS = -sname minishard$(NID) -setcookie minishard_demo -s minishard -boot start_sasl -sasl errlog_type error 6 | 7 | BUILD_DEPS = elvis_mk 8 | DEP_PLUGINS = elvis_mk 9 | TEST_DEPS = detest 10 | 11 | dep_elvis_mk = git https://github.com/inaka/elvis.mk.git 784e41bcb91 12 | 13 | include erlang.mk 14 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | The following authors have created the source code of "minishard" 2 | published and distributed by YANDEX LLC as the owner: 3 | 4 | Danil Zagoskin 5 | 6 | 7 | The list of authors and contributors, who created the source code of 8 | "minishard_gen_leader" module ("src/minishard_gen_leader.erl"), which is 9 | a part of "minishard", you may find in a comment at the beginning of that file. 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, YANDEX LLC 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 20 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /src/minishard.erl: -------------------------------------------------------------------------------- 1 | -module(minishard). 2 | -behaviour(application). 3 | 4 | -type score() :: number(). 5 | -export_type([score/0]). 6 | 7 | %%% Behavior callbacks 8 | % Configuration 9 | -callback shard_count(ClusterName :: atom()) -> integer(). 10 | -callback cluster_nodes(ClusterName :: atom()) -> [node()]. 11 | % Life cycle 12 | -callback allocated(ClusterName :: atom(), ShardNum :: integer()) -> {ok, State :: any()}. 13 | -callback score(State :: any()) -> score(). 14 | -callback prolonged(Loser :: pid(), State :: any()) -> {ok, NextState :: any()}. 15 | -callback deallocated(Winner :: pid() | undefined, State :: any()) -> any(). 16 | 17 | % API 18 | -export([start/0]). 19 | -export([join/2, get_node/2, get_manager/2]). 20 | -export([status/1, status/2]). 21 | 22 | % Application callbacks 23 | -export([start/2, stop/1]). 24 | 25 | 26 | start() -> 27 | application:start(?MODULE, permanent). 28 | 29 | 30 | start(_Type, _Args) -> 31 | minishard_sup:start_link(root). 32 | 33 | stop(_State) -> 34 | ok. 35 | 36 | 37 | %% Join the cluster 38 | join(ClusterName, CallbackMod) -> 39 | minishard_sup:join_cluster(ClusterName, CallbackMod). 40 | 41 | %% Resolve a shard number to the shard manager pid 42 | get_manager(ClusterName, ShardNum) -> 43 | minishard_allocator:get_manager(ClusterName, ShardNum). 44 | 45 | %% Resolve a shard number to the node currently hosting it 46 | get_node(ClusterName, ShardNum) -> 47 | minishard_allocator:get_node(ClusterName, ShardNum). 48 | 49 | %% Cluster status 50 | status(ClusterName) -> 51 | minishard_allocator:cluster_status(ClusterName). 52 | 53 | status(ClusterName, _CallbackMod) -> % old API compatibility 54 | {Status, _Counts, NodeMap} = status(ClusterName), 55 | {Status, NodeMap}. 56 | -------------------------------------------------------------------------------- /src/minishard_demo.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_demo). 2 | -behavior(minishard). 3 | 4 | -export([cluster_nodes/1, shard_count/1]). 5 | -export([allocated/2, score/1, prolonged/2, deallocated/2]). 6 | 7 | % Generate a fake node list by changing the number in the local node name 8 | cluster_nodes(_) -> 9 | BinNode = atom_to_binary(node(), latin1), 10 | [make_node(BinNode, N) || N <- lists:seq(1, 5)]. 11 | 12 | % Shard count, needed to monitor the cluster for degradation 13 | shard_count(_) -> 14 | 2.
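% For example (with a hypothetical node name): if the local node is 'minishard1@myhost',
% cluster_nodes/1 above returns ['minishard1@myhost', 'minishard2@myhost', 'minishard3@myhost',
% 'minishard4@myhost', 'minishard5@myhost'], rewriting the digits before the "@" via make_node/2 below.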
15 | 16 | % Helper for cluster node names generation 17 | make_node(BinPattern, N) -> 18 | IOLNode = re:replace(BinPattern, "[0-9]+@", [integer_to_list(N), "@"]), 19 | binary_to_atom(iolist_to_binary(IOLNode), latin1). 20 | 21 | 22 | -record(demo, {name, num, alloc_time}). 23 | 24 | allocated(Cluster, Num) -> 25 | error_logger:info_msg("Woo-hoo!!! Minishard demo cluster (name ~w) has allocated us as shard #~w", [Cluster, Num]), 26 | {ok, #demo{name = Cluster, num = Num, alloc_time = os:timestamp()}}. 27 | 28 | score(#demo{alloc_time = AllocTime}) -> 29 | % Let the score be number of seconds we are active 30 | timer:now_diff(os:timestamp(), AllocTime) div 10000000. 31 | 32 | prolonged(Loser, #demo{name = Cluster, num = Num} = State) -> 33 | error_logger:info_msg("Wheeeeeee!!! We still own minishard cluster ~w shard #~w, and ~w at ~w is loser!", 34 | [Cluster, Num, Loser, node(Loser)]), 35 | {ok, State}. 36 | 37 | deallocated(undefined, #demo{name = Cluster, num = Num}) -> 38 | error_logger:info_msg("Bad news: minishard cluster ~w has degraded, so we lose the shard #~w :(", [Cluster, Num]), 39 | ok; 40 | deallocated(Winner, #demo{name = Cluster, num = Num}) -> 41 | error_logger:info_msg("Bad news: ~w at ~w has won the competition for minishard cluster ~w shard #~w :(", 42 | [Winner, node(Winner), Cluster, Num]), 43 | ok. 44 | -------------------------------------------------------------------------------- /test/minishard_test.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_test). 2 | 3 | -export([set_config/1, set_config/2, start/0, start/1, map/0, map/1]). 4 | 5 | -behavior(minishard). 6 | -export([shard_count/1, cluster_nodes/1, score/1, prolonged/2, allocated/2, deallocated/2]). 7 | 8 | 9 | start() -> 10 | start(test). 11 | 12 | start(Name) -> 13 | application:ensure_all_started(lager), 14 | application:ensure_all_started(minishard), 15 | %error_logger:info_msg("Minishard config: ~120p~n", [application:get_all_env(minishard)]), 16 | {ok, _} = minishard:join(Name, ?MODULE), 17 | ok. 18 | 19 | 20 | map() -> 21 | map(test). 22 | map(Name) -> 23 | minishard_allocator:shard_map(Name). 24 | 25 | 26 | set_config(Config) -> 27 | set_config(test, Config). 28 | 29 | set_config(Name, Config) -> 30 | application:load(minishard), 31 | ClustersConf = application:get_env(minishard, clusters, []), 32 | NewClustersConf = lists:ukeymerge(1, [{Name, Config}], ClustersConf), 33 | application:set_env(minishard, clusters, NewClustersConf), 34 | ok. 35 | 36 | get_config(Name) -> 37 | ClustersConf = application:get_env(minishard, clusters, []), 38 | proplists:get_value(Name, ClustersConf, []). 39 | 40 | get_conf_value(Name, Key, Default) -> 41 | MyConf = get_config(Name), 42 | proplists:get_value(Key, MyConf, Default). 43 | 44 | shard_count(Name) -> 45 | get_conf_value(Name, shard_count, 3). 46 | cluster_nodes(Name) -> 47 | get_conf_value(Name, nodes, [node()]). 48 | 49 | -record(shaman, { 50 | name, 51 | shard, 52 | started_at 53 | }). 54 | 55 | allocated(Name, Shard) -> 56 | {ok, #shaman{name = Name, shard = Shard, started_at = os:timestamp()}}. 57 | 58 | score(#shaman{started_at = Started}) -> 59 | timer:now_diff(os:timestamp(), Started)/1000000. 60 | 61 | prolonged(_, State) -> 62 | {ok, State}. 63 | 64 | deallocated(_, State) -> 65 | {ok, State}. 
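%% Typical use from a test node's shell (a sketch; the node names and shard count below are
%% arbitrary examples, not part of this module): configure the cluster, join it, inspect the map.
%%   1> minishard_test:set_config(test, [{shard_count, 3}, {nodes, ['mst_1@host', 'mst_2@host', 'mst_3@host']}]).
%%   2> minishard_test:start(test).
%%   3> minishard_test:map().
%% map/0 returns the allocator's shard map, e.g. #{1 => 'mst_1@host', 2 => 'mst_2@host', 3 => 'mst_3@host'}
%% once every shard is allocated; shards not yet allocated map to 'undefined'.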
66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Minishard — lightweight sharding for distributed Erlang applications 2 | ======= 3 | 4 | Goal 5 | ----- 6 | Sometimes you need to store a large amount of temporary data that is available from any node of an Erlang cluster. 7 | Storing all the data on a single node may cause memory problems and makes that node a single point of failure. 8 | Replication is sometimes even worse: with an unreliable network (e.g. multiple datacenters) you get 9 | inconsistencies and merge conflicts. 10 | 11 | Minishard keeps a configured number of unique shards allocated on the nodes of your cluster, restores cluster connectivity, 12 | resolves possible conflicts after a netsplit, and notifies your application when the cluster degrades. 13 | 14 | How minishard is supposed to work 15 | ----------- 16 | A minishard node is started with two arguments: ```ClusterName :: atom()``` and ```CallbackMod :: module()```. 17 | 18 | Before joining the cluster, minishard first needs to know the list of nodes it consists of. This is done by calling ```CallbackMod:cluster_nodes(ClusterName) -> [node()]```. 19 | 20 | Minishard also gets the number of required shards by calling ```CallbackMod:shard_count(ClusterName) -> integer()```. 21 | 22 | Minishard uses a modified version of ```gen_leader``` for leader election. 23 | The leader is responsible for all shard allocations and deallocations. 24 | 25 | After a manager has been selected as a shard owner, ```CallbackMod:allocated(ClusterName, ShardId) -> {ok, State}``` is called. 26 | Any actions needed to initialize a shard should be performed in this function. 27 | ```State``` is any term you want; it will be passed to the other callbacks. 28 | 29 | When a conflict occurs, minishard decides which shard instance should be shut down. To do that, each instance is queried for its score by calling ```CallbackMod:score(State) -> number()```. 30 | The instance with the highest score is the winner and remains allocated; the loser is deallocated. The corresponding callbacks are called: 31 | * ```CallbackMod:prolonged(LoserPid, State) -> {ok, NewState}``` 32 | * ```CallbackMod:deallocated(WinnerPid, State) -> any()``` (final cleanup; the return value is ignored). If deallocation happens due to cluster degradation or shutdown, ```WinnerPid``` is ```undefined```. 33 | 34 | If you need a complex migration algorithm, implement it yourself, e.g. by calling ```gen_server:enter_loop``` on deallocation. A complete example callback module can be found in ```src/minishard_demo.erl```. 35 | 36 | -------------------------------------------------------------------------------- /src/minishard_sup.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/1]). 5 | -export([join_cluster/2, get_pid/2]). 6 | -export([init/1]). 7 | 8 | %% For embedding a minishard cluster in any supervision tree 9 | -export([cluster_child_spec/2, cluster_internal_specs/2]). 10 | 11 | sup_name(root) -> 12 | minishard; 13 | sup_name({cluster, ClusterName, _}) -> 14 | list_to_atom("minishard_" ++ atom_to_list(ClusterName) ++ "_sup").
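%% For example (hypothetical cluster name): sup_name({cluster, my_cluster, my_callback})
%% returns 'minishard_my_cluster_sup', while sup_name(root) returns 'minishard'.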
15 | 16 | % Helper: get pid of started infrastructure part 17 | get_pid(undefined, _) -> 18 | throw(undefined_cluster); 19 | get_pid(ClusterName, shard) when is_atom(ClusterName) -> 20 | strict_whereis(minishard_shard:name(ClusterName)); 21 | get_pid(ClusterName, PartName) when is_atom(ClusterName), is_atom(PartName) -> 22 | Sup = sup_name({cluster, ClusterName, undefined}), 23 | Children = supervisor:which_children(Sup), 24 | case lists:keyfind(PartName, 1, Children) of 25 | {PartName, Pid, _, _} -> Pid; 26 | false -> undefined 27 | end. 28 | 29 | strict_whereis(ProcessName) when is_atom(ProcessName) -> 30 | Pid = whereis(ProcessName), 31 | Pid == undefined andalso error(no_cluster), 32 | Pid. 33 | 34 | start_link(Arg) -> 35 | supervisor:start_link({local, sup_name(Arg)}, ?MODULE, Arg). 36 | 37 | cluster_child_spec(ClusterName, CallbackMod) when is_atom(ClusterName), is_atom(CallbackMod) -> 38 | {ClusterName, 39 | {?MODULE, start_link, [{cluster, ClusterName, CallbackMod}]}, 40 | permanent, 10000, supervisor, []}. 41 | 42 | allocator_spec(ClusterName, CallbackMod) -> 43 | {allocator, 44 | {minishard_allocator, start_link, [ClusterName, CallbackMod]}, 45 | permanent, 1000, worker, [minishard_allocator]}. 46 | 47 | shard_spec(ClusterName, CallbackMod) -> 48 | {shard, 49 | {minishard_shard, start_link, [ClusterName, CallbackMod]}, 50 | permanent, 1000, worker, [minishard_shard]}. 51 | 52 | cluster_internal_specs(ClusterName, CallbackMod) when is_atom(ClusterName), is_atom(CallbackMod) -> 53 | [allocator_spec(ClusterName, CallbackMod), shard_spec(ClusterName, CallbackMod)]. 54 | 55 | 56 | join_cluster(ClusterName, CallbackMod) -> 57 | supervisor:start_child(sup_name(root), cluster_child_spec(ClusterName, CallbackMod)). 58 | 59 | 60 | 61 | init(root) -> 62 | {ok, {{one_for_one, 1, 5}, []}}; 63 | 64 | init({cluster, ClusterName, CallbackMod}) -> 65 | {ok, {{one_for_all, 5, 10}, cluster_internal_specs(ClusterName, CallbackMod)}}. 
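%% Example of embedding a minishard cluster into an application's own supervision tree
%% (a sketch; my_cluster, my_callback and the restart intensity are hypothetical). The child
%% spec returned by cluster_child_spec/2 can be added to any supervisor's child list:
%%
%%   init([]) ->
%%       MinishardSpec = minishard_sup:cluster_child_spec(my_cluster, my_callback),
%%       {ok, {{one_for_one, 5, 10}, [MinishardSpec]}}.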
66 | -------------------------------------------------------------------------------- /elvis.config: -------------------------------------------------------------------------------- 1 | % -*- mode: erlang -*- 2 | [ 3 | { 4 | elvis, 5 | [ 6 | {config, 7 | [#{dirs => ["src"], 8 | filter => "*.erl", 9 | rules => [{elvis_style, line_length, #{limit => 120, 10 | skip_comments => false}}, 11 | {elvis_style, no_tabs}, 12 | {elvis_style, no_trailing_whitespace}, 13 | {elvis_style, macro_names}, 14 | {elvis_style, operator_spaces, #{ 15 | rules => [{right, ","}, {right, "++"}, {left, "++"}]}}, 16 | {elvis_style, nesting_level, #{level => 3, ignore => [minishard_gen_leader]}}, 17 | {elvis_style, god_modules, #{limit => 40}}, 18 | {elvis_style, no_if_expression}, 19 | {elvis_style, invalid_dynamic_call, #{ignore => [minishard_allocator, minishard_shard, elvis, minishard_gen_leader]}}, 20 | {elvis_style, used_ignored_variable}, 21 | {elvis_style, no_behavior_info}, 22 | { 23 | elvis_style, 24 | module_naming_convention, 25 | #{regex => "^([a-z][a-z0-9]*_?)*(_SUITE)?$", 26 | ignore => []} 27 | }, 28 | {elvis_style, no_spec_with_records}, 29 | {elvis_style, dont_repeat_yourself, #{min_complexity => 10, ignore => [minishard_gen_leader]}} 30 | ] 31 | }, 32 | #{dirs => ["."], 33 | filter => "Makefile", 34 | rules => [{elvis_project, no_deps_master_erlang_mk, #{ignore => []}}, 35 | {elvis_project, protocol_for_deps_erlang_mk, #{ignore => []}}] 36 | }, 37 | #{dirs => ["."], 38 | filter => "rebar.config", 39 | rules => [{elvis_project, no_deps_master_rebar, #{ignore => []}}, 40 | {elvis_project, git_for_deps_rebar, #{ignore => []}}] 41 | }, 42 | #{dirs => ["."], 43 | filter => "elvis.config", 44 | rules => [{elvis_project, old_configuration_format}] 45 | } 46 | ] 47 | } 48 | ] 49 | } 50 | ]. 51 | -------------------------------------------------------------------------------- /test/minishard_detest.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_detest). 2 | 3 | % mandatory detest functions 4 | -export([cfg/1,run/1,setup/1,cleanup/1]). 5 | 6 | cfg(_TestArgs) -> 7 | % Compile test callback module 8 | compile:file("test/minishard_test", [{outdir, "test"}]), 9 | [ 10 | % {global_cfg,[{"test/nodes.yaml",[{fixedvals,KV}]},"test/withoutparams.yaml"]}, 11 | {per_node_cfg, ["test/test.config"]}, 12 | {cmd , "-pa test -s minishard -config test/test.config"}, 13 | {wait_for_app, minishard}, 14 | {nodes , []}, 15 | {erlenv , [{"ERL_LIBS","deps"}]} 16 | ]. 17 | 18 | 19 | setup(_Param) -> 20 | ok. 21 | 22 | cleanup(_Param) -> 23 | ok. 24 | 25 | 26 | run(Param) -> 27 | random:seed(os:timestamp()), 28 | lager:info("Script params: ~p", [Param]), 29 | ClusterSize = proplists:get_value(cluster_size, Param, 10), 30 | 31 | 32 | IdsToStart = lists:seq(1, ClusterSize), 33 | % Warning: do not start new nodes with pmap because they will get a same address 34 | NodeMap = maps:from_list([{Id, detest:add_node(node_spec(Id))} || Id <- IdsToStart]), 35 | 36 | lager:info("Started nodes, map: ~120p", [NodeMap]), 37 | 38 | Nodes = maps:values(NodeMap), 39 | MST_Config = [{shard_count, 3}, {nodes, Nodes}], 40 | 41 | configure_and_start(test, Nodes, MST_Config), 42 | 43 | timer:sleep(1200 + 50*ClusterSize), % Let the leader allocate all shards 44 | 45 | lager:info("initial shard map: ~120p", [get_validate_map(Nodes, allocated)]), 46 | 47 | kill_standby_nodes_test(test, Nodes, MST_Config, 10), 48 | 49 | kill_leader_test(test, Nodes, MST_Config, 10), 50 | 51 | ok. 
52 | 53 | 54 | configure_and_start(Name, Nodes, MST_Config) -> 55 | ConfigResults = multicall(Nodes, minishard_test, set_config, [Name, MST_Config]), 56 | [{_, ok}] = lists:ukeysort(2, ConfigResults), 57 | StartResults = multicall(Nodes, minishard_test, start, [Name]), 58 | [{_, ok}] = lists:ukeysort(2, StartResults), 59 | ok. 60 | 61 | %% Helper: parallel map for faster cluster startup 62 | pmap(Function, List) -> 63 | S = self(), 64 | Pids = [spawn_link(fun() -> execute(S, Function, El) end) || El <- List], 65 | gather(Pids). 66 | 67 | execute(Recv, Function, Element) -> 68 | Recv ! {self(), Function(Element)}. 69 | 70 | gather([]) -> []; 71 | gather([H|T]) -> 72 | receive 73 | {H, Ret} -> [Ret|gather(T)] 74 | end. 75 | 76 | %% This multicall is not compatible with rpc:multicall: 77 | %% it takes only MFA and returns a tuplelist where results are tagged with node names 78 | multicall(Nodes, M, F, A) -> 79 | pmap(fun(Node) -> 80 | {Node, rpc:call(Node, M, F, A)} 81 | end, Nodes). 82 | 83 | 84 | get_validate_map(Nodes, ExpectedState) -> 85 | Map = get_same_map(Nodes), 86 | ok = validate_map(Map, ExpectedState), 87 | Map. 88 | 89 | get_same_map(Nodes) -> 90 | NodeMaps = multicall(Nodes, minishard_test, map, [test]), 91 | case lists:ukeysort(2, NodeMaps) of 92 | [{_, #{} = Map}] -> 93 | Map; 94 | _Other -> 95 | error({different_maps, NodeMaps}) 96 | end. 97 | 98 | validate_map(Map, allocated) -> 99 | case missing_shards(Map) of 100 | [] -> ok; 101 | [_|_] = Missing -> {error, {missing, Missing}} 102 | end. 103 | 104 | missing_shards(Map) -> 105 | maps:fold(fun collect_missing_shards/3, [], Map). 106 | 107 | collect_missing_shards(Shard, undefined, Acc) -> [Shard|Acc]; 108 | collect_missing_shards(_Shard, _, Acc) -> Acc. 109 | 110 | 111 | kill_standby_nodes_test(_Name, _Nodes, _Config, 0) -> 112 | ok; 113 | kill_standby_nodes_test(Name, Nodes, MST_Config, Iterations) -> 114 | ClusterSize = length(Nodes), 115 | Map0 = get_validate_map(Nodes, allocated), 116 | BusyNodes = maps:values(Map0), 117 | KillCandidates = Nodes -- BusyNodes, 118 | NodesToKill = lists:filter(fun(_) -> crypto:rand_uniform(0, 2) == 1 end, KillCandidates), 119 | RemainingNodes = Nodes -- NodesToKill, 120 | 121 | lager:info("kill_standby_nodes_test: (~w iters left) killing ~120p", [Iterations, NodesToKill]), 122 | pmap(fun(Node) -> detest:stop_node(Node) end, NodesToKill), 123 | 124 | timer:sleep(1200), 125 | Map0 = get_validate_map(RemainingNodes, allocated), 126 | 127 | lager:info("kill_standby_nodes_test: (~w iters left) starting back ~120p", [Iterations, NodesToKill]), 128 | pmap(fun(Node) -> detest:add_node(node_spec(Node)) end, NodesToKill), 129 | configure_and_start(Name, NodesToKill, MST_Config), 130 | 131 | timer:sleep(1200 + 50*ClusterSize), % Let the leader allocate all shards 132 | Map0 = get_validate_map(RemainingNodes, allocated), 133 | 134 | kill_standby_nodes_test(Name, Nodes, MST_Config, Iterations - 1). 
135 | 136 | 137 | kill_leader_test(_Name, _Nodes, _Config, 0) -> 138 | ok; 139 | kill_leader_test(Name, Nodes, MST_Config, Iterations) -> 140 | SeenLeaders = multicall(Nodes, minishard_allocator, leader, [Name]), 141 | [{_, Leader}] = lists:ukeysort(2, SeenLeaders), 142 | RemainingNodes = Nodes -- [Leader], 143 | 144 | lager:info("kill_leader_test: (~w iters left) killing ~120p", [Iterations, Leader]), 145 | detest:stop_node(Leader), 146 | 147 | timer:sleep(1200), 148 | Map0 = get_validate_map(RemainingNodes, allocated), 149 | NewSeenLeaders = multicall(RemainingNodes, minishard_allocator, leader, [Name]), 150 | [{_, _NewLeader}] = lists:ukeysort(2, NewSeenLeaders), 151 | 152 | lager:info("kill_leader_test: (~w iters left) starting back ~120p", [Iterations, Leader]), 153 | detest:add_node(node_spec(Leader)), 154 | configure_and_start(Name, [Leader], MST_Config), 155 | 156 | timer:sleep(1200), % Let the leader allocate all shards 157 | Map0 = get_validate_map(RemainingNodes, allocated), 158 | 159 | kill_leader_test(Name, Nodes, MST_Config, Iterations - 1). 160 | 161 | node_spec(Node) when is_atom(Node) -> 162 | {ok, [N], "@" ++ _} = io_lib:fread("mst_~d", atom_to_list(Node)), 163 | node_spec(N); 164 | node_spec(N) when is_integer(N) -> 165 | [{id, N}, {name, list_to_atom("mst_" ++ integer_to_list(N))}]. 166 | 167 | -------------------------------------------------------------------------------- /src/minishard_shard.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_shard). 2 | -behavior(gen_server). 3 | 4 | -export([start_link/2, name/1, status/1, info/1]). 5 | -export([set_status/2]). 6 | 7 | %% gen_server callbacks 8 | -export([init/1, handle_info/2, handle_cast/2, handle_call/3, code_change/3, terminate/2]). 9 | 10 | %% Conflict resolution: get score 11 | -export([get_score_or_kill/1]). 12 | 13 | name(ClusterName) when is_atom(ClusterName) -> 14 | list_to_atom("minishard_" ++ atom_to_list(ClusterName) ++ "_shard"). 15 | 16 | 17 | start_link(ClusterName, CallbackMod) when is_atom(ClusterName), is_atom(CallbackMod) -> 18 | State = seed_state(ClusterName, CallbackMod), 19 | gen_server:start_link({local, name(ClusterName)}, ?MODULE, State, []). 20 | 21 | 22 | %% Set shard status (for use by allocator) 23 | set_status(ShardPid, Status) when is_pid(ShardPid) -> 24 | gen_server:call(ShardPid, {set_status, Status}). 25 | 26 | 27 | %% Get shard status 28 | status(ClusterOrShard) when ClusterOrShard /= undefined -> 29 | {dictionary, Dict} = process_info(local_pid(ClusterOrShard), dictionary), 30 | proplists:get_value(status, Dict, undefined). 31 | 32 | info(ClusterOrShard) when ClusterOrShard /= undefined -> 33 | {dictionary, Dict} = process_info(local_pid(ClusterOrShard), dictionary), 34 | case proplists:get_value(status, Dict, undefined) of 35 | active -> 36 | {active, #{ 37 | since => proplists:get_value(active_since, Dict), 38 | shard => proplists:get_value(shard, Dict) 39 | }}; 40 | Inactive -> 41 | {Inactive, #{}} 42 | end. 43 | 44 | 45 | local_pid(ManagerPid) when is_pid(ManagerPid) -> 46 | ManagerPid; 47 | local_pid(ClusterName) when is_atom(ClusterName), ClusterName /= undefined -> 48 | whereis(name(ClusterName)). 49 | 50 | 51 | -record(shard, { 52 | cluster_name, 53 | callback_mod, 54 | callback_state, 55 | max_number, 56 | my_number, 57 | monitors, 58 | recheck_timer, 59 | status 60 | }). 
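%% The status field is one of: starting (before the initial join), idle, standby or active;
%% the last three are driven by the allocator via minishard_allocator:bind/1 and set_status/2.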
61 | 62 | seed_state(ClusterName, CallbackMod) -> 63 | #shard{ 64 | cluster_name = ClusterName, 65 | callback_mod = CallbackMod, 66 | max_number = CallbackMod:shard_count(ClusterName), 67 | my_number = undefined, 68 | monitors = #{}, 69 | status = starting 70 | }. 71 | 72 | 73 | init(#shard{} = State) -> 74 | {ok, export_status(State), 0}. 75 | 76 | 77 | %% Initial status discovery. Later watcher will notify us about status changes 78 | handle_info(timeout, #shard{status = starting} = State0) -> 79 | {noreply, schedule_recheck(join_cluster(State0#shard{status = idle}))}; 80 | 81 | handle_info({timeout, Timer, recheck_ownership}, #shard{recheck_timer = Timer, cluster_name = ClusterName} = State) -> 82 | % Ensure our allocator feels OK and responds to calls (did not stall) 83 | _ = minishard_allocator:leader(ClusterName), 84 | % OK, we did not crash, so allocator is running. 85 | % Now let's see if we missed deallocation 86 | handle_ownership_recheck(State#shard{recheck_timer = undefined}); 87 | 88 | handle_info(Unexpected, #shard{cluster_name = ClusterName} = State) -> 89 | error_logger:warning_msg("Minishard shard ~w got unexpected message: ~9999p", [ClusterName, Unexpected]), 90 | {noreply, State}. 91 | 92 | 93 | handle_call(score, _From, #shard{status = active, 94 | callback_mod = CallbackMod, callback_state = CallbackState} = State) -> 95 | Score = CallbackMod:score(CallbackState), 96 | {reply, Score, State}; 97 | 98 | handle_call({set_status, {active, ShardNum}}, _From, #shard{status = active, my_number = ShardNum} = State) -> 99 | {reply, ok, State}; 100 | handle_call({set_status, {active, OtherShardNum}}, _From, #shard{status = active, my_number = ShardNum} = State) -> 101 | {stop, {wont_change_shard, ShardNum, OtherShardNum}, {error, shard_change}, State}; 102 | handle_call({set_status, {active, ShardNum}}, _From, #shard{} = State) -> 103 | {reply, ok, activate(ShardNum, State)}; 104 | handle_call({set_status, Inactive}, _From, #shard{status = active} = State) 105 | when Inactive == idle; Inactive == standby -> 106 | % This should not happen - allocator should send an allocation event 107 | NewState = callback_deallocate(undefined, State), 108 | {stop, {shutdown, suddenly_deallocated}, ok, idle(NewState)}; 109 | handle_call({set_status, idle}, _From, #shard{} = State) -> 110 | {reply, ok, idle(State)}; 111 | handle_call({set_status, standby}, _From, #shard{} = State) -> 112 | {reply, ok, standby(State)}; 113 | handle_call({set_status, Status}, _From, #shard{} = State) -> 114 | {reply, {error, {bad_status, Status}}, State}; 115 | 116 | handle_call(_, _From, #shard{} = State) -> 117 | {reply, {error, not_implemented}, State}. 118 | 119 | 120 | handle_cast({allocation, Action, Challenger}, #shard{} = State) -> 121 | handle_allocation(Action, Challenger, State); 122 | 123 | handle_cast(Unexpected, #shard{cluster_name = ClusterName} = State) -> 124 | error_logger:warning_msg("Minishard shard ~w got unexpected cast: ~9999p", [ClusterName, Unexpected]), 125 | {noreply, State}. 126 | 127 | 128 | code_change(_, #shard{} = State, _) -> 129 | {ok, State}. 130 | 131 | terminate(_, #shard{}) -> 132 | ok. 133 | 134 | 135 | 136 | %% Allocation notification on conflict 137 | handle_allocation(prolong, Loser, #shard{} = State) -> 138 | {noreply, callback_prolong(Loser, State)}; 139 | handle_allocation(cancel, Winner, #shard{} = State) -> 140 | NewState = callback_deallocate(Winner, State), 141 | % Gracefully shutdown for cleanup 142 | {stop, {shutdown, cluster_degraded}, idle(NewState)}. 
143 | 144 | 145 | %% Shard ownership recheck 146 | handle_ownership_recheck(#shard{status = active, cluster_name = ClusterName, my_number = MyNum} = State0) -> 147 | Owner = minishard:get_manager(ClusterName, MyNum), 148 | State = schedule_recheck(State0), 149 | case (Owner == self()) of 150 | true -> % OK, we still own the shard 151 | {noreply, State}; 152 | false -> %% Oops... 153 | error_logger:error_msg("Minishard: cluster ~w shard #~w ownership lost!", [ClusterName, MyNum]), 154 | handle_allocation(cancel, undefined, State) 155 | end; 156 | handle_ownership_recheck(#shard{} = State) -> 157 | {noreply, schedule_recheck(State)}. 158 | 159 | 160 | %%% 161 | %%% Internals 162 | %%% 163 | 164 | export_status(#shard{status = active = Status, my_number = MyNum} = State) -> 165 | put(shard, MyNum), 166 | put(active_since, os:timestamp()), 167 | put(status, Status), 168 | State; 169 | export_status(#shard{status = Status} = State) -> 170 | put(status, Status), 171 | erase(active_since), 172 | erase(shard), 173 | State. 174 | 175 | 176 | %% Try to join a cluster and take a free shard if possible 177 | join_cluster(#shard{status = standby} = State) -> 178 | % Already waiting for free shard number. This may happen after transition 179 | State; 180 | join_cluster(#shard{status = active} = State) -> 181 | % Already active, do nothing. This may happen after transition 182 | State; 183 | join_cluster(#shard{status = idle, cluster_name = ClusterName} = State) -> 184 | case minishard_allocator:bind(ClusterName) of 185 | {active, MyNumber} -> 186 | activate(MyNumber, State); 187 | standby -> 188 | export_status(State#shard{status = standby, my_number = undefined}) 189 | end. 190 | 191 | %% Perform all activation stuff when we capture a shard number 192 | activate(MyNumber, #shard{} = State) -> 193 | Allocated = callback_allocate(State#shard{status = active, my_number = MyNumber}), 194 | export_status(Allocated). 195 | 196 | 197 | %% Leave degraded cluster 198 | idle(#shard{status = idle} = State) -> 199 | % Nothing to do 200 | State; 201 | idle(#shard{status = standby} = State) -> 202 | export_status(State#shard{status = idle}); 203 | idle(#shard{status = active} = State) -> 204 | export_status(State#shard{status = idle, my_number = undefined}). 205 | 206 | standby(#shard{status = standby} = State) -> 207 | % Nothing to do 208 | State; 209 | standby(#shard{status = idle} = State) -> 210 | export_status(State#shard{status = standby}). 211 | 212 | 213 | %% Due to some troubles allocator may have after netsplit, we need to periodically ensure we still own the shard number 214 | schedule_recheck(#shard{cluster_name = minishard_demo} = State) -> 215 | State; 216 | schedule_recheck(#shard{} = State) -> 217 | Timer = erlang:start_timer(100, self(), recheck_ownership), 218 | State#shard{recheck_timer = Timer}. 219 | 220 | 221 | %% Callback management 222 | callback_allocate(#shard{cluster_name = ClusterName, callback_mod = CallbackMod, my_number = MyNumber} = State) -> 223 | {ok, CallbackState} = CallbackMod:allocated(ClusterName, MyNumber), 224 | State#shard{callback_state = CallbackState}. 225 | 226 | callback_prolong(Loser, #shard{callback_mod = CallbackMod, callback_state = CallbackState} = State) -> 227 | {ok, NewCallbackState} = CallbackMod:prolonged(Loser, CallbackState), 228 | State#shard{callback_state = NewCallbackState}. 
229 | 230 | callback_deallocate(Winner, #shard{callback_mod = CallbackMod, callback_state = CallbackState} = State) -> 231 | _ = CallbackMod:deallocated(Winner, CallbackState), 232 | State#shard{callback_state = undefined}. 233 | 234 | 235 | %% We perform score getting in separate process to ensure allocator does not get garbage messages 236 | get_score_or_kill(ShardPid) -> 237 | ScoreGetResult = rpc:call(node(), gen_server, call, [ShardPid, score, 1000]), 238 | handle_score_result(ShardPid, ScoreGetResult). 239 | 240 | handle_score_result(_Pid, Score) when is_number(Score) -> 241 | Score; 242 | handle_score_result(ShardPid, _) -> 243 | % We don't care what exactly goes wrong, we just kill it 244 | exit(ShardPid, kill), 245 | undefined. 246 | -------------------------------------------------------------------------------- /src/minishard_allocator.erl: -------------------------------------------------------------------------------- 1 | %%% Minishard allocator 2 | %%% 3 | %%% This module is a callback module for gen_leader (well, local version of it) 4 | %%% which tracks other members status and decides who runs which shard. 5 | %%% 6 | %%% Leader tracks the cluster status in a map. Each cluster node has a key in the map, 7 | %%% corresponding value is its status. Possible statuses are: 8 | %%% * down -- allocator on the node is down 9 | %%% * #transition{} -- allocator has recently went down, waiting for it to come back 10 | %%% * #request{} -- allocator is up, waiting for it to send its status 11 | %%% * #conflict{} -- allocator is up and hosts a conflicting shard. Waiting for it to send its score 12 | %%% * idle -- allocator is up, but shard manager has not been bound 13 | %%% * standby -- allocator is up with a bound shard manager without a shard 14 | %%% * #active{} -- allocator is up with a bound shard manager hosting shard N 15 | %%% 16 | -module(minishard_allocator). 17 | -define(GEN_LEADER, minishard_gen_leader). 18 | 19 | -behavior(?GEN_LEADER). 20 | 21 | 22 | %% API 23 | -export([name/1]). 24 | -export([start_link/2, cluster_status/1, shard_map/1]). 25 | -export([bind/1]). 26 | -export([get_manager/2, get_node/2]). 27 | 28 | %% Testing/debugging 29 | -export([seed_state/3, set_hacks/2, leader/1]). 30 | 31 | %% gen_leader callbacks 32 | -export([ 33 | init/1, 34 | handle_cast/3, 35 | handle_call/4, 36 | handle_info/3, 37 | handle_leader_call/4, 38 | handle_leader_cast/3, 39 | handle_DOWN/3, 40 | elected/3, 41 | surrendered/3, 42 | from_leader/3, 43 | code_change/4, 44 | terminate/2]). 45 | 46 | 47 | %% Candidate status request 48 | -record(request, { 49 | ref :: reference() % Request reference 50 | }). 51 | -record(status_update, { 52 | ref :: reference(), % Request reference 53 | node :: node(), % Reporting node 54 | status :: node_status(), % Reported status 55 | manager :: undefined | pid() % Current node's shard manager 56 | }). 57 | 58 | %% Conflict resolution status 59 | -record(conflict, { 60 | shard :: integer(), % Conflicting shard number 61 | ref :: reference(), % Resolution reference 62 | score :: undefined | minishard:score() % Score reported by a member 63 | }). 64 | %% Conflict score report 65 | -record(score_report, { 66 | ref :: reference(), % Resolution reference 67 | node :: node(), % Reporting node 68 | score :: minishard:score() % Reported score 69 | }). 70 | 71 | %% Temporary state for nodes going down. 
Without this shard is reallocated even on interconnect socket reset 72 | -record(transition, { 73 | shard :: integer(), % Shard number just before disconnect 74 | ref :: reference() % transition reference 75 | }). 76 | 77 | %% Active status 78 | -record(active, { 79 | shard :: integer() % Active shard number 80 | }). 81 | 82 | -type request() :: #request{}. 83 | -type conflict() :: #conflict{}. 84 | -type transition() :: #transition{}. 85 | -type active() :: #active{}. 86 | -type node_status() :: down | request() | conflict() | transition() | idle | standby | active(). 87 | -type allocation_map() :: #{node() => node_status()}. 88 | -type manager_map() :: #{node() => undefined | pid()}. 89 | 90 | %% gen_leader callback state 91 | -record(allocator, { 92 | name :: atom(), 93 | callback_mod :: module(), 94 | my_status :: node_status(), 95 | last_response :: reference(), 96 | shard_manager :: undefined | pid(), 97 | shard_count :: integer(), 98 | map :: allocation_map(), 99 | managers :: manager_map(), 100 | hacks :: #{atom() => any()} 101 | }). 102 | 103 | -type state() :: #allocator{}. 104 | 105 | 106 | %% ETS data model for shard information 107 | -define(ETS_SHARD_KEY(Shard), {shard, Shard}). 108 | -define(ETS_SHARD_NODE_POS, 2). 109 | -define(ETS_SHARD_MANAGER_POS, 3). 110 | -define(ETS_SHARD_RECORD(Shard, Node, Manager), {?ETS_SHARD_KEY(Shard), Node, Manager}). 111 | 112 | %% Generate a process/ets name for a cluster name 113 | name(ClusterName) -> 114 | list_to_atom("minishard_" ++ atom_to_list(ClusterName) ++ "_allocator"). 115 | 116 | %% API: Resolve a shard number to the shard manager pid 117 | get_manager(ClusterName, Shard) -> 118 | ets:lookup_element(name(ClusterName), ?ETS_SHARD_KEY(Shard), ?ETS_SHARD_MANAGER_POS). 119 | 120 | %% API: Resolve a shard number to the node currently hosting it 121 | get_node(ClusterName, Shard) -> 122 | ets:lookup_element(name(ClusterName), ?ETS_SHARD_KEY(Shard), ?ETS_SHARD_NODE_POS). 123 | 124 | %% API: start the allocator for given cluster 125 | start_link(ClusterName, CallbackMod) -> 126 | start_link(ClusterName, CallbackMod, #{}). 127 | 128 | start_link(ClusterName, CallbackMod, #{} = Hacks) when is_atom(ClusterName), is_atom(CallbackMod) -> 129 | Name = name(ClusterName), 130 | State0 = #allocator{map = Map} = seed_state(ClusterName, CallbackMod, Hacks), 131 | Nodes = maps:keys(Map), 132 | Options = leader_worker_options(Nodes) ++ [{heartbeat, 5}, {bcast_type, all}, {seed_node, none}], 133 | ?GEN_LEADER:start_link(Name, Nodes, Options, ?MODULE, State0, []). 134 | 135 | leader_worker_options(Nodes) -> 136 | case lists:member(node(), Nodes) of 137 | true -> []; 138 | false -> [{workers, [node()]}] 139 | end. 140 | 141 | %% Test/debug API: set hacks for a running allocator 142 | set_hacks(ClusterName, #{} = Hacks) when is_atom(ClusterName) -> 143 | ?GEN_LEADER:call(name(ClusterName), {set_hacks, Hacks}). 
144 | 145 | %% Seed state for a starting allocator 146 | seed_state(ClusterName, CallbackMod, Hacks) -> 147 | Nodes = CallbackMod:cluster_nodes(ClusterName), 148 | MyStatus = case lists:member(node(), Nodes) of 149 | true -> idle; 150 | false -> worker 151 | end, 152 | SeedMap = maps:from_list([{N, down} || N <- Nodes]), 153 | SeedManagers = maps:from_list([{N, undefined} || N <- Nodes]), 154 | #allocator{ 155 | name = name(ClusterName), 156 | callback_mod = CallbackMod, 157 | shard_manager = undefined, 158 | my_status = MyStatus, 159 | shard_count = CallbackMod:shard_count(ClusterName), 160 | map = SeedMap, 161 | managers = SeedManagers, 162 | hacks = Hacks }. 163 | 164 | 165 | %% Register a shard manager ready to host a shard 166 | bind(ClusterName) -> 167 | ?GEN_LEADER:call(name(ClusterName), {bind, self()}, 120000). 168 | 169 | %% Helper for possible asynchronous manager reply 170 | manager_reply(undefined, _) -> 171 | ok; 172 | manager_reply(From, Reply) -> 173 | ?GEN_LEADER:reply(From, Reply). 174 | 175 | 176 | %% Return cluster status in form {OverallStatusAtom, NodeStatusMap} 177 | cluster_status(ClusterName) when is_atom(ClusterName) -> 178 | ?GEN_LEADER:call(name(ClusterName), cluster_status). 179 | 180 | %% Return current leader node 181 | leader(ClusterName) when is_atom(ClusterName) -> 182 | ?GEN_LEADER:call(name(ClusterName), get_leader). 183 | 184 | %% Return shard allocation map 185 | -spec shard_map(ClusterName :: atom()) -> #{Shard :: integer() => node()}. 186 | shard_map(ClusterName) -> 187 | %ets:foldl(fun collect_shard_map/2, #{}, name(ClusterName)). 188 | maps:from_list([{Shard, Node} || ?ETS_SHARD_RECORD(Shard, Node, _) <- ets:tab2list(name(ClusterName))]). 189 | 190 | %% Init: nothing special, we start with an empty map 191 | init(#allocator{name = Name} = State) -> 192 | Name = ets:new(Name, [protected, named_table, set, {read_concurrency, true}]), 193 | ok = export_shard_map(State), 194 | {ok, State}. 195 | 196 | 197 | handle_cast(Msg, #allocator{name = Name} = State, _Election) -> 198 | error_logger:warning_msg("Minishard allocator ~w got unexpected cast ~9999p", [Name, Msg]), 199 | {noreply, State}. 200 | 201 | handle_info(Msg, #allocator{name = Name} = State, _Election) -> 202 | error_logger:warning_msg("Minishard allocator ~w got unexpected info ~9999p", [Name, Msg]), 203 | {noreply, State}. 204 | 205 | handle_call({bind, ShardManager}, _From, #allocator{name = Name} = State, _Election) -> 206 | NewState = State#allocator{shard_manager = ShardManager}, 207 | ok = ?GEN_LEADER:leader_cast(Name, {bind_manager, node(), ShardManager, undefined}), 208 | {reply, standby, NewState}; 209 | handle_call(cluster_status, _From, #allocator{} = State, Election) -> 210 | {reply, make_cluster_status(State, Election), State}; 211 | handle_call(get_leader, _From, #allocator{} = State, Election) -> 212 | {reply, ?GEN_LEADER:leader_node(Election), State}; 213 | handle_call({set_hacks, Hacks}, _From, #allocator{} = State, _Election) -> 214 | {reply, ok, State#allocator{hacks = Hacks}}; 215 | handle_call(_Request, _From, #allocator{} = State, _Election) -> 216 | {reply, {error, not_implemented}, State}. 217 | 218 | 219 | 220 | %% We are elected. 
Propagate our allocation map 221 | elected(#allocator{name = Name} = State, Election, Loser) -> 222 | error_logger:info_msg("Minishard allocator ~w elected, ~w surrendered", [Name, Loser]), 223 | StateRequestsRestarted = restart_requests(State), 224 | NewState = handle_new_election(Election, StateRequestsRestarted), 225 | {ok, NewState, NewState}. 226 | 227 | 228 | %% Node goes down. Deallocate its shard and remove from pool 229 | handle_DOWN(Node, #allocator{name = Name} = State, Election) -> 230 | error_logger:info_msg("Minishard allocator ~w has seen ~w's death", [Name, Node]), 231 | NewState = handle_new_election(Election, State), 232 | {ok, NewState, NewState}. 233 | 234 | 235 | 236 | get_allocation(Node, #{} = Map) -> 237 | case maps:find(Node, Map) of 238 | {ok, Status} -> Status; 239 | error -> undefined 240 | end. 241 | 242 | %% We have surrendered. Inherit a new allocation map 243 | surrendered(#allocator{name = Name} = State, #allocator{} = Synch, Election) -> 244 | error_logger:info_msg("Minishard allocator ~w surrendered, forwarding Synch to from_leader/3", [Name]), 245 | from_leader(Synch, State, Election); 246 | surrendered(#allocator{name = Name} = State, _Synch, _Election) -> 247 | error_logger:info_msg("Minishard allocator ~w surrendered", [Name]), 248 | {ok, State}. 249 | 250 | 251 | handle_leader_call(_Request, _From, State, _Election) -> 252 | {reply, {error, not_implemented}, State}. 253 | 254 | handle_leader_cast({bind_manager, Node, ShardManager, From}, #allocator{map = Map, name = Name} = State, _Election) -> 255 | case maps:is_key(Node, Map) of 256 | true -> 257 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** adds ~w as good node", [Name, Node]), 258 | StateWithManager = set_manager(Node, ShardManager, State), 259 | NewState = #allocator{map = NewMap} = set_realloc_install([Node], standby, StateWithManager), 260 | _ = manager_reply(From, get_allocation(Node, NewMap)), 261 | {ok, NewState, NewState}; 262 | false -> 263 | _ = manager_reply(From, not_my_cluster), 264 | {noreply, State} 265 | end; 266 | 267 | handle_leader_cast({request_timeout, RequestRef}, #allocator{name = Name} = State, _Election) -> 268 | case handle_request_timeout(RequestRef, State) of 269 | {updated, NewState} -> 270 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** status update ~w timeout", [Name, RequestRef]), 271 | {ok, NewState, NewState}; 272 | unchanged -> 273 | {noreply, State} 274 | end; 275 | 276 | handle_leader_cast(#status_update{ref = RequestRef, node = Node, status = Status, manager = Manager}, 277 | #allocator{name = Name, map = Map} = State, _Election) -> 278 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** got a status update from ~w (~w)", 279 | [Name, Node, Status]), 280 | case get_allocation(Node, Map) of 281 | #request{ref = RequestRef} -> 282 | StateWithManager = set_manager(Node, Manager, State), 283 | NewState = handle_possible_conflicts(Node, Status, StateWithManager), 284 | {ok, NewState, NewState}; 285 | _ -> 286 | {noreply, State} 287 | end; 288 | 289 | handle_leader_cast({conflict_timeout, ConflictRef}, #allocator{name = Name, map = Map} = State, _Election) -> 290 | {Shard, NodeScores} = conflict_shard_and_scores(ConflictRef, Map), 291 | Shard /= undefined andalso error_logger:info_msg( 292 | "Minishard allocator ~w *** LEADER *** conflict ~w (shard ~w) timeout", [Name, ConflictRef, Shard]), 293 | NewState = resolve_conflict(Shard, NodeScores, State), 294 | {ok, NewState, NewState}; 295 | 296 | 
handle_leader_cast(#score_report{ref = ReportRef, node = Node, score = Score}, 297 | #allocator{name = Name, map = Map} = State, _Election) -> 298 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** got a score report from ~w (~w)", [Name, Node, Score]), 299 | NewMap = case get_allocation(Node, Map) of 300 | #conflict{ref = ReportRef} = Conflict -> 301 | set_statuses([Node], Conflict#conflict{score = Score}, Map); 302 | _ -> 303 | Map 304 | end, 305 | NewState = install_new_map(NewMap, State), 306 | {Shard, NodeScores} = conflict_shard_and_scores(ReportRef, NewMap), 307 | case lists:keymember(undefined, 2, NodeScores) of 308 | true -> % Still have pending score requests, no action needed 309 | {noreply, NewState}; 310 | false -> % All nodes have reported their scores, ok to resolve conflict now 311 | ResolvedState = resolve_conflict(Shard, NodeScores, State), 312 | {ok, ResolvedState, ResolvedState} 313 | end; 314 | 315 | handle_leader_cast({transition_timeout, TransRef}, #allocator{name = Name} = State, _Election) -> 316 | case handle_transition_timeout(TransRef, State) of 317 | {updated, NewState} -> 318 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** transition ~w finished", [Name, TransRef]), 319 | {ok, NewState, NewState}; 320 | unchanged -> 321 | {noreply, State} 322 | end; 323 | 324 | handle_leader_cast(Msg, #allocator{name = Name} = State, _Election) -> 325 | error_logger:warning_msg("Minishard allocator ~w got unexpected leader cast ~9999p", [Name, Msg]), 326 | {noreply, State}. 327 | 328 | 329 | from_leader(#allocator{map = NewMap, managers = ManagerMap}, #allocator{name = Name} = State, _Election) -> 330 | error_logger:info_msg("Minishard allocator ~w got update from the leader.", [Name]), 331 | {ok, install_new_map(NewMap, State#allocator{managers = ManagerMap})}; 332 | 333 | from_leader(Msg, #allocator{name = Name} = State, _Election) -> 334 | error_logger:info_msg("Minishard allocator ~w got a message from the leader: ~9999p", [Name, Msg]), 335 | {ok, State}. 336 | 337 | 338 | 339 | terminate(_Reason, _State) -> 340 | ok. 341 | 342 | code_change(_OldVsn, #allocator{} = State, _Election, _Extra) -> 343 | {ok, State}. 344 | 345 | 346 | 347 | 348 | %% When status request times out, we mark nodes which did not send their status update as idle 349 | handle_request_timeout(RequestRef, #allocator{map = Map} = State) -> 350 | StalledNodes = maps:fold(fun 351 | (Node, #request{ref = NodeRef}, Acc) when NodeRef == RequestRef -> 352 | [Node|Acc]; 353 | (_Node, _Status, Acc) -> 354 | Acc 355 | end, [], Map), 356 | case StalledNodes of 357 | [] -> 358 | unchanged; 359 | [_|_] -> 360 | NewState = set_realloc_install(StalledNodes, idle, State), 361 | {updated, NewState} 362 | end. 363 | 364 | %% Transition timeout: here we mark nodes as really down 365 | handle_transition_timeout(TransRef, #allocator{map = Map} = State) -> 366 | ReallyDownNodes = maps:fold(fun 367 | (Node, #transition{ref = NodeRef}, Acc) when NodeRef == TransRef -> 368 | [Node|Acc]; 369 | (_Node, _Status, Acc) -> 370 | Acc 371 | end, [], Map), 372 | case ReallyDownNodes of 373 | [] -> 374 | unchanged; 375 | [_|_] -> 376 | NewState = set_realloc_install(ReallyDownNodes, down, State), 377 | {updated, NewState} 378 | end. 379 | 380 | %% The leader has been elected. He has Election from a gen_leader and an outdated map. 
381 | %% Here we mark nodes going down as down and request a status from nodes going up 382 | handle_new_election(Election, #allocator{name = Name} = State) -> 383 | % Determine which nodes need a status request 384 | OldAlive = alive_nodes(State), 385 | Alive = ?GEN_LEADER:alive(Election), 386 | BecameAlive = Alive -- OldAlive, 387 | 388 | % Determine which nodes should be marked as down 389 | OldDown = down_nodes(State), 390 | Down = ?GEN_LEADER:down(Election), 391 | BecameDown = Down -- OldDown, 392 | 393 | % Apply status changes 394 | StateDownMarked = lists:foldl(fun handle_node_down/2, State, BecameDown), 395 | 396 | AliveStatus = case BecameAlive of 397 | [] -> % No status will be set, so here we may return any one 398 | idle; 399 | [_|_] -> 400 | % Request statuses 401 | {Request, _Timer} = make_status_request(Name), 402 | Request 403 | end, 404 | 405 | set_realloc_install(BecameAlive, AliveStatus, StateDownMarked). 406 | 407 | 408 | -spec make_status_request(Name :: atom()) -> {request(), timer:tref()}. 409 | make_status_request(Name) -> 410 | % Set timer to handle possible troubles during status request 411 | RequestRef = make_ref(), 412 | Request = #request{ref = RequestRef}, 413 | {ok, Timer} = timer:apply_after(2000, ?GEN_LEADER, leader_cast, [Name, {request_timeout, RequestRef}]), 414 | {Request, Timer}. 415 | 416 | -spec make_conflict_request(Name :: atom(), Shard :: integer()) -> {conflict(), timer:tref()}. 417 | make_conflict_request(Name, Shard) -> 418 | CRef = make_ref(), 419 | Conflict = #conflict{shard = Shard, ref = CRef, score = undefined}, 420 | {ok, Timer} = timer:apply_after(2000, ?GEN_LEADER, leader_cast, [Name, {conflict_timeout, CRef}]), 421 | {Conflict, Timer}. 422 | 423 | -spec make_transition(Name :: atom(), Shard :: integer()) -> {transition(), timer:tref()}. 424 | make_transition(Name, Shard) -> 425 | TransRef = make_ref(), 426 | Request = #transition{ref = TransRef, shard = Shard}, 427 | {ok, Timer} = timer:apply_after(5000, ?GEN_LEADER, leader_cast, [Name, {transition_timeout, TransRef}]), 428 | {Request, Timer}. 429 | 430 | 431 | %% Restart all running requests. When leader changes during request, response may be lost. 432 | %% So we search for all status and score requests, then generate new references for them, starting corresponding timers 433 | restart_requests(#allocator{name = Name, map = Map} = State) -> 434 | {Name, NewMap, _RefMigration} = maps:fold(fun restart_request/3, {Name, #{}, #{}}, Map), 435 | State#allocator{map = NewMap}. 
436 | 437 | restart_request(Node, #request{ref = OldRef} = OldRequest, {Name, Map, RefMigration}) -> 438 | NewRequest = #request{ref = NewRef} = case maps:get(OldRef, RefMigration, undefined) of 439 | undefined -> 440 | {NewRequest_, _} = make_status_request(Name), 441 | NewRequest_; 442 | ExistingRef -> 443 | OldRequest#request{ref = ExistingRef} 444 | end, 445 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration); 446 | restart_request(Node, #conflict{ref = OldRef, shard = Shard} = OldRequest, {Name, Map, RefMigration}) -> 447 | NewRequest = #conflict{ref = NewRef} = case maps:get(OldRef, RefMigration, undefined) of 448 | undefined -> 449 | {NewRequest_, _} = make_conflict_request(Name, Shard), 450 | NewRequest_; 451 | ExistingRef -> 452 | OldRequest#conflict{ref = ExistingRef, score = undefined} 453 | end, 454 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration); 455 | restart_request(Node, #transition{ref = OldRef, shard = Shard} = OldRequest, {Name, Map, RefMigration}) -> 456 | NewRequest = #transition{ref = NewRef} = case maps:get(OldRef, RefMigration, undefined) of 457 | undefined -> 458 | {NewRequest_, _} = make_transition(Name, Shard), 459 | NewRequest_; 460 | ExistingRef -> 461 | OldRequest#transition{ref = ExistingRef} 462 | end, 463 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration); 464 | restart_request(Node, NotRequest, {Name, Map, RefMigration}) -> 465 | NewMap = maps:put(Node, NotRequest, Map), 466 | {Name, NewMap, RefMigration}. 467 | 468 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration) -> 469 | NewMap = maps:put(Node, NewRequest, Map), 470 | NewRefMigration = maps:put(OldRef, NewRef, RefMigration), 471 | {Name, NewMap, NewRefMigration}. 472 | 473 | 474 | handle_node_down(Node, #allocator{name = Name, map = Map} = State) -> 475 | NewStatus = case get_allocation(Node, Map) of 476 | #active{shard = Shard} -> 477 | {Transition, _} = make_transition(Name, Shard), 478 | Transition; 479 | _ -> 480 | down 481 | end, 482 | 483 | NewMap = set_statuses([Node], NewStatus, Map), 484 | set_manager(Node, undefined, State#allocator{map = NewMap}). 485 | 486 | %% Remember node's bound manager 487 | -spec set_manager(Node :: node(), Manager :: undefined | pid(), state()) -> state(). 488 | set_manager(Node, ShardManager, #allocator{managers = ManMap} = State) -> 489 | true = maps:is_key(Node, ManMap), 490 | NewManMap = maps:update(Node, ShardManager, ManMap), 491 | State#allocator{managers = NewManMap}. 492 | 493 | 494 | %% Set given status for a given list of nodes, reallocate shards, install the updated map 495 | -spec set_realloc_install(Nodes :: [node()], Status :: node_status(), state()) -> state(). 496 | set_realloc_install(Nodes, Status, #allocator{map = Map, shard_count = ShardCount} = State) -> 497 | MapUpdated = set_statuses(Nodes, Status, Map), 498 | NewMap = reallocate(ShardCount, MapUpdated), 499 | install_new_map(NewMap, State). 500 | 501 | %% batch set status for given list of nodes 502 | -spec set_statuses(Nodes :: [node()], Status :: node_status(), Map :: allocation_map()) -> allocation_map(). 503 | set_statuses(Nodes, Status, Map) -> 504 | [] = Nodes -- maps:keys(Map), 505 | OverrideMap = maps:from_list([{N, Status} || N <- Nodes]), 506 | maps:merge(Map, OverrideMap). 507 | 508 | 509 | %% Perform shard allocation when possible 510 | -spec reallocate(ShardCount :: integer(), Map :: allocation_map()) -> allocation_map(). 
511 | reallocate(ShardCount, #{} = Map) when is_integer(ShardCount) -> 512 | HaveQuorum = (length(alive_nodes(Map)) >= ShardCount), 513 | HavePendingReq = lists:any(fun is_request/1, maps:values(Map)), 514 | case {HaveQuorum, HavePendingReq} of 515 | {true, false} -> % Require quorum and no status requests for reallocation 516 | do_reallocate(ShardCount, Map); 517 | {_, _} -> 518 | Map 519 | end. 520 | 521 | do_reallocate(ShardCount, #{} = Map) -> 522 | Shards = lists:seq(1, ShardCount), 523 | AllocatedShards = shards_in_use(Map), 524 | 525 | ShardsToAllocate = Shards -- AllocatedShards, 526 | StandbyNodes = maps:fold(fun collect_standby_nodes/3, [], Map), 527 | 528 | Allocations = safe_zip(StandbyNodes, ShardsToAllocate), 529 | 530 | MapOverride = maps:from_list([{Node, #active{shard = Shard}} || {Node, Shard} <- Allocations]), 531 | maps:merge(Map, MapOverride). 532 | 533 | 534 | %% Install a new allocation map and perform all needed actions 535 | -spec install_new_map(Map :: allocation_map(), state()) -> state(). 536 | install_new_map(NewMap, #allocator{name = Name} = State) -> 537 | error_logger:info_msg("Minishard allocator ~w: installing new map ~9999p", [Name, NewMap]), 538 | MyNewStatus = get_allocation(node(), NewMap), 539 | NewState = handle_my_new_status(MyNewStatus, State#allocator{map = NewMap}), 540 | ok = export_shard_map(NewState), 541 | NewState. 542 | 543 | 544 | %% Leader has possibly changed our status. Let's see what we should do 545 | -spec handle_my_new_status(node_status(), state()) -> state(). 546 | handle_my_new_status(undefined, #allocator{my_status = worker} = State) -> 547 | % I am worker, so my status is always worker, and I don't appear in a map 548 | State; 549 | handle_my_new_status(OldStatus, #allocator{my_status = OldStatus} = State) -> 550 | % Unchanged, pass 551 | State; 552 | handle_my_new_status(#request{ref = Ref}, #allocator{last_response = Ref} = State) -> 553 | % We have already sent a status update for this request 554 | State; 555 | handle_my_new_status(#request{ref = Ref}, #allocator{ 556 | name = Name, my_status = MyStatus, shard_manager = Manager, hacks = Hacks} = State) -> 557 | apply_hack(on_status_request, Hacks), 558 | % New status request. Send an update and wait 559 | Report = #status_update{ref = Ref, node = node(), status = MyStatus, manager = Manager}, 560 | ?GEN_LEADER:leader_cast(Name, Report), 561 | State#allocator{last_response = Ref}; 562 | handle_my_new_status(#conflict{ref = Ref}, #allocator{last_response = Ref} = State) -> 563 | % We have already sent a score for this conflict 564 | State; 565 | handle_my_new_status(#conflict{ref = Ref}, #allocator{name = Name, shard_manager = ShManager, hacks = Hacks} = State) -> 566 | apply_hack(on_conflict_request, Hacks), 567 | % New score request. 
Send an update and wait 568 | case minishard_shard:get_score_or_kill(ShManager) of 569 | undefined -> 570 | throw({stop, shard_score_timeout, State}); 571 | Score when is_integer(Score) -> 572 | Report = #score_report{ref = Ref, node = node(), score = Score}, 573 | ?GEN_LEADER:leader_cast(Name, Report), 574 | State#allocator{last_response = Ref} 575 | end; 576 | handle_my_new_status(down, #allocator{shard_manager = Manager} = State) -> 577 | ok = set_manager_status(Manager, idle), 578 | throw({stop, {shutdown, shut_down_by_leader}, State}); 579 | handle_my_new_status(#active{shard = NewShard}, #allocator{my_status = #active{shard = OldShard}} = State) 580 | when NewShard /= OldShard -> 581 | throw({stop, {shard_suddenly_changed, OldShard, NewShard}, State}); 582 | handle_my_new_status(#active{shard = OldShard} = Status, #allocator{my_status = #active{shard = OldShard}} = State) -> 583 | State#allocator{my_status = Status}; 584 | handle_my_new_status(Status, #allocator{shard_manager = Manager} = State) 585 | when Status == idle; Status == standby; is_record(Status, active) -> 586 | ok = set_manager_status(Manager, Status), 587 | State#allocator{my_status = Status}. 588 | 589 | 590 | 591 | 592 | %% Export a shard map to the ETS 593 | export_shard_map(#allocator{name = Name, managers = ManagerMap} = State) -> 594 | ShardNodeMap = shard_node_map(State), 595 | _ = maps:fold(fun export_shard_info/3, {Name, ManagerMap}, ShardNodeMap), 596 | ok. 597 | 598 | shard_node_map(#allocator{shard_count = ShardCount, map = NodeMap}) -> 599 | SeedShardMap = maps:from_list([{Shard, undefined} || Shard <- lists:seq(1, ShardCount)]), 600 | maps:fold(fun collect_active_shards/3, SeedShardMap, NodeMap). 601 | 602 | collect_active_shards(Node, #active{shard = Shard}, ShardNodeMap) -> 603 | maps:update(Shard, Node, ShardNodeMap); 604 | collect_active_shards(_Node, _Status, ShardNodeMap) -> 605 | ShardNodeMap. 606 | 607 | export_shard_info(Shard, Node, {Name, ManagerMap}) -> 608 | Manager = maps:get(Node, ManagerMap, undefined), 609 | true = ets:insert(Name, ?ETS_SHARD_RECORD(Shard, Node, Manager)), 610 | {Name, ManagerMap}. 611 | 612 | 613 | %% Set shard manager status when leader updates it 614 | -spec set_manager_status(Manager :: undefined | pid(), Status :: idle | standby | active()) -> ok. 615 | set_manager_status(undefined, _Status) -> % No manager, pass 616 | ok; 617 | set_manager_status(Manager, Status) -> 618 | minishard_shard:set_status(Manager, node_status_for_manager(Status)). 619 | 620 | node_status_for_manager(idle) -> idle; 621 | node_status_for_manager(standby) -> standby; 622 | node_status_for_manager(#active{shard = Shard}) -> {active, Shard}. 623 | 624 | 625 | %% A node has sent a valid status update. We should check the updated map for conflicts and maybe start resolution 626 | -spec handle_possible_conflicts(Node :: node(), Status :: node_status(), state()) -> state(). 
627 | handle_possible_conflicts(Node, #active{shard = Shard} = Status, 628 | #allocator{name = Name, map = Map, hacks = Hacks} = State) -> 629 | case shard_nodes(Shard, Map) of 630 | [] -> % no other candidates for this shard 631 | set_realloc_install([Node], Status, State); 632 | [_|_] = ConflictingNodes -> % Oops, we have a conflict 633 | apply_hack(on_conflict_detected, Hacks), 634 | {Conflict, _} = make_conflict_request(Name, Shard), 635 | set_realloc_install([Node|ConflictingNodes], Conflict, State) 636 | end; 637 | 638 | handle_possible_conflicts(Node, Status, #allocator{} = State) -> 639 | set_realloc_install([Node], Status, State). 640 | 641 | 642 | resolve_conflict(undefined, [], #allocator{} = State) -> 643 | % No conflict, pass 644 | State; 645 | resolve_conflict(Shard, [_|_] = NodeScores, #allocator{map = Map} = State) -> 646 | {Winner, _} = select_winner(NodeScores), 647 | MapWithWinner = case Winner of 648 | undefined -> Map; 649 | _RealWinner -> set_statuses([Winner], #active{shard = Shard}, Map) 650 | end, 651 | Losers = [Node || {Node, _} <- NodeScores, Node /= Winner], 652 | NewMap = set_statuses(Losers, down, MapWithWinner), 653 | install_new_map(NewMap, State). 654 | 655 | 656 | %% Helper: list nodes marked as alive in a map 657 | alive_nodes(#allocator{map = Map}) -> 658 | alive_nodes(Map); 659 | alive_nodes(#{} = Map) -> 660 | maps:fold(fun collect_alive/3, [], Map). 661 | 662 | collect_alive(_Node, down, Acc) -> Acc; 663 | collect_alive(_Node, #transition{}, Acc) -> Acc; 664 | collect_alive(Node, _, Acc) -> [Node|Acc]. 665 | 666 | %% Helper: list nodes marked as down in a map 667 | down_nodes(#allocator{map = Map}) -> 668 | maps:fold(fun collect_down/3, [], Map). 669 | 670 | collect_down(Node, down, Acc) -> [Node|Acc]; 671 | collect_down(Node, #transition{}, Acc) -> [Node|Acc]; 672 | collect_down(_Node, _, Acc) -> Acc. 673 | 674 | 675 | %% Helper: check if node status in map is 'status requested' 676 | is_request(#request{}) -> true; 677 | is_request(_) -> false. 678 | 679 | %% Helper: return a list of shards in use 680 | shards_in_use(Map) -> 681 | maps:fold(fun collect_shards_in_use/3, [], Map). 682 | 683 | %% Helper: add active and conflicting shards to the accumulator 684 | collect_shards_in_use(_Node, #active{shard = Shard}, Acc) -> [Shard|Acc]; 685 | collect_shards_in_use(_Node, #conflict{shard = Shard}, Acc) -> [Shard|Acc]; 686 | collect_shards_in_use(_Node, #transition{shard = Shard}, Acc) -> [Shard|Acc]; 687 | collect_shards_in_use(_Node, _, Acc) -> Acc. 688 | 689 | %% Get a list of nodes pretending to host the shard 690 | shard_nodes(Shard, Map) -> 691 | {Shard, Nodes} = maps:fold(fun collect_nodes_by_shard/3, {Shard, []}, Map), 692 | Nodes. 693 | 694 | %% Helper: when node wants to host the shard, add it to the accumulator 695 | collect_nodes_by_shard(Node, #active{shard = Shard}, {Shard, Acc}) -> {Shard, [Node|Acc]}; 696 | collect_nodes_by_shard(Node, #conflict{shard = Shard}, {Shard, Acc}) -> {Shard, [Node|Acc]}; 697 | collect_nodes_by_shard(_Node, _, {Shard, Acc}) -> {Shard, Acc}. 698 | 699 | %% Helper: add standby nodes to the accumulator 700 | collect_standby_nodes(Node, standby, Acc) -> [Node|Acc]; 701 | collect_standby_nodes(_Node, _, Acc) -> Acc. 702 | 703 | %% Helper: do the same as lists:zip/2, but stop when any list ends 704 | safe_zip([H1|L1], [H2|L2]) -> 705 | [{H1, H2}|safe_zip(L1, L2)]; 706 | safe_zip(_L1, _L2) -> 707 | []. 
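%% Example (illustrative): unlike lists:zip/2, which exits on lists of unequal
%% length, safe_zip/2 simply stops at the end of the shorter list:
%%   1> safe_zip([n1, n2, n3], [1, 2]).
%%   [{n1,1},{n2,2}]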
708 | 709 | %% Helper: get conflict shard and node scores by a conflict ref 710 | conflict_shard_and_scores(Ref, Map) -> 711 | {Ref, Shard, NodeScores} = maps:fold(fun collect_conflict_shard_scores/3, {Ref, undefined, []}, Map), 712 | {Shard, NodeScores}. 713 | 714 | collect_conflict_shard_scores(Node, #conflict{ref = Ref, shard = Shard, score = Score}, {Ref, _, NodeScores}) -> 715 | {Ref, Shard, [{Node, Score}|NodeScores]}; 716 | collect_conflict_shard_scores(_Node, _, {Ref, Shard, NodeScores}) -> 717 | {Ref, Shard, NodeScores}. 718 | 719 | %% Helper: select a winner from given {Node, Score} list when possible 720 | -spec select_winner([{node(), number()}]) -> {Winner :: undefined | node(), BestScore :: undefined | number()}. 721 | select_winner(NodeScores) -> 722 | lists:foldl(fun find_best_score/2, {undefined, undefined}, NodeScores). 723 | 724 | find_best_score({BestNode, BestScore}, {_Node, undefined}) -> 725 | {BestNode, BestScore}; 726 | find_best_score({_BestNode, undefined}, {Node, Score}) -> 727 | {Node, Score}; 728 | find_best_score({BestNode, BestScore}, {_Node, Score}) when BestScore >= Score -> 729 | {BestNode, BestScore}; 730 | find_best_score({_Node, _PrevBestScore}, {BetterNode, BetterScore}) -> 731 | {BetterNode, BetterScore}. 732 | 733 | 734 | %% Build cluster status report 735 | make_cluster_status(#allocator{shard_count = ShardCount, map = NodeMap} = State, _Election) -> 736 | TotalNodeCnt = length(maps:keys(NodeMap)), 737 | AliveNodesCnt = length(alive_nodes(State)), 738 | ExportNodeMap = maps:map(fun export_node_status/2, NodeMap), 739 | ShardNodeMap = shard_node_map(State), 740 | NodeStatusMap = maps:fold(fun allocation_to_node_status/3, ExportNodeMap, ShardNodeMap), 741 | AllocatedShardCnt = length(lists:filter(fun(N) -> N /= undefined end, maps:values(ShardNodeMap))), 742 | OverallStatus = overall_status(ShardCount, AllocatedShardCnt, AliveNodesCnt), 743 | 744 | Counts = #{shards => {ShardCount, AllocatedShardCnt}, nodes => {TotalNodeCnt, AliveNodesCnt}}, 745 | {OverallStatus, Counts, NodeStatusMap}. 746 | 747 | -spec overall_status(ShardCount :: integer(), AllocatedShardCount :: integer(), AliveNodesCount :: integer()) -> atom(). 748 | overall_status(ShardCount, ShardCount, _AliveNodesCnt) -> 749 | available; 750 | overall_status(ShardCount, _AllocCnt, AliveNodesCnt) when AliveNodesCnt < ShardCount -> 751 | degraded; 752 | overall_status(_ShardCount, AllocatedShardCnt, AliveNodesCnt) when AllocatedShardCnt < AliveNodesCnt -> 753 | allocation_pending; 754 | overall_status(_ShardCount, _AllocatedShardCnt, _AliveNodesCount) -> 755 | transition. 756 | 757 | allocation_to_node_status(ShardNum, undefined, NodeStatuses) -> 758 | maps:put({not_allocated, ShardNum}, undefined, NodeStatuses); 759 | allocation_to_node_status(ShardNum, ShardNode, NodeStatuses) -> 760 | maps:put(ShardNode, {active, ShardNum}, NodeStatuses). 761 | 762 | %% Translate node statuses to minishard external status format 763 | export_node_status(_Node, down) -> unavailable; 764 | export_node_status(_Node, idle) -> not_my_cluster; 765 | export_node_status(_Node, standby) -> available; 766 | export_node_status(_Node, #active{shard = Shard}) -> {active, Shard}; 767 | export_node_status(_Node, _) -> transition. 768 | 769 | 770 | %% Hacks: apply a hack 771 | apply_hack(HackName, Hacks) -> 772 | case maps:get(HackName, Hacks, undefined) of 773 | undefined -> ok; 774 | Fun when is_function(Fun, 0) -> Fun(); 775 | {M, F, A} when is_atom(M), is_atom(F), is_list(A) -> apply(M, F, A) 776 | end. 
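%% Example (illustrative): a hacks map binds hook names to either a zero-arity
%% fun or an {M, F, A} triple; unknown hook names are silently skipped:
%%   Hacks = #{on_conflict_detected => fun() -> ok end},
%%   ok = apply_hack(on_conflict_detected, Hacks),  % runs the fun
%%   ok = apply_hack(no_such_hook, Hacks).          % not registered -> ok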
777 | 778 | 779 | %collect_shard_map(?ETS_SHARD_RECORD(Shard, Node, _), Map) -> 780 | % maps:put(Shard, Node, Map); 781 | %collect_shard_map(_, Map) -> 782 | % Map. 783 | -------------------------------------------------------------------------------- /src/minishard_gen_leader.erl: -------------------------------------------------------------------------------- 1 | %%% ``The contents of this file are subject to the Erlang Public License, 2 | %%% Version 1.1, (the "License"); you may not use this file except in 3 | %%% compliance with the License. You should have received a copy of the 4 | %%% Erlang Public License along with this software. If not, it can be 5 | %%% retrieved via the world wide web at http://www.erlang.org/. 6 | %%% 7 | %%% Software distributed under the License is distributed on an "AS IS" 8 | %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 9 | %%% the License for the specific language governing rights and limitations 10 | %%% under the License. 11 | %%% 12 | %%% The Initial Developer of the Original Code is Ericsson Utvecklings AB. 13 | %%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings 14 | %%% AB. All Rights Reserved.'' 15 | %%% 16 | %%% 17 | %%% $Id: gen_leader.erl, v 1.4 2008/09/19 07:40:15 hanssv Exp $ 18 | %%% 19 | %%% @author Hans Svensson 20 | %%% @author Thomas Arts 21 | %%% @author Ulf Wiger 22 | %%% @author (contributor: Serge Aleynikov ) 23 | %%% @author (contributor: Danil Zagoskin ) 24 | %%% 25 | %%% @doc Leader election behavior. 26 | %%%

%%% This application implements a leader election behavior modeled after
%%% gen_server. This behavior intends to make it reasonably
%%% straightforward to implement a fully distributed server with
%%% master-slave semantics.
%%%
%%% The gen_leader behavior supports nearly everything that gen_server
%%% does (some functions, such as multicall() and the internal timeout,
%%% have been removed), and adds a few callbacks and API functions to
%%% support leader election etc.
%%%
%%% Also included is an example program, a global dictionary, based
%%% on the modules gen_leader and dict. The callback implementing the
%%% global dictionary is called 'test_cb', for no particularly logical
%%% reason.
%%%
%%% New version: The internal leader election algorithm was faulty
%%% and has been replaced with a new version based on a different leader
%%% election algorithm. As a consequence of this, the query functions
%%% alive and down can no longer be provided.
%%% The new algorithm also makes use of an incarnation parameter, by
%%% default written to disk in the function incarnation. This
%%% implies that only one gen_leader per node is permitted; if
%%% used in a diskless environment, incarnation must be adapted.
%%%
%%% Modifications contributed by Serge Aleynikov:
%%%   1. Added configurable startup options (see leader_options() type)
%%%   2. Implemented handle_DOWN/3 callback with propagation of the
%%%      leader's state via broadcast to all connected candidates.
%%%   3. Fixed population of the #election.down member so that down/1 query
%%%      can be used in the behavior's implementation
%%%   4. Rewrote implementation of the tau timer to prevent the leader
%%%      looping on the timer timeout event when all candidates are connected.
%%%
%%% Modifications done by Danil Zagoskin:
%%%   1. Renamed gen_leader to minishard_gen_leader to avoid name clashes
%%%   2. Timestamp is used as incarnation value to avoid disk access
%%%   3. Made initial discovery messages sending fast and unreliable.
%%%      This prevents blocking on start.
%%%   4. Added node pinger so that cluster recovers after network problems
%%%   5. Handled some corner cases which may appear during network problems
%%%
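%%%
%%% A minimal callback-module sketch (illustrative only; the module name
%%% my_leader_cb is hypothetical and the clauses do the least possible work):
%%%   -module(my_leader_cb).
%%%   -export([init/1, elected/3, surrendered/3, handle_leader_call/4,
%%%            handle_leader_cast/3, from_leader/3, handle_call/4,
%%%            handle_cast/3, handle_DOWN/3, handle_info/3,
%%%            terminate/2, code_change/4]).
%%%   init(Arg) -> {ok, Arg}.
%%%   elected(State, _Election, _Node) -> {ok, State, State}.
%%%   surrendered(State, _Synch, _Election) -> {ok, State}.
%%%   handle_leader_call(_Request, _From, State, _Election) -> {reply, ok, State}.
%%%   handle_leader_cast(_Msg, State, _Election) -> {noreply, State}.
%%%   from_leader(_Synch, State, _Election) -> {ok, State}.
%%%   handle_call(_Request, _From, State, _Election) -> {reply, ok, State}.
%%%   handle_cast(_Msg, State, _Election) -> {noreply, State}.
%%%   handle_DOWN(_Node, State, _Election) -> {ok, State}.
%%%   handle_info(_Msg, State, _Election) -> {noreply, State}.
%%%   terminate(_Reason, _State) -> ok.
%%%   code_change(_OldVsn, State, _Election, _Extra) -> {ok, State}.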

70 | %%% @end 71 | %%% 72 | %%% 73 | -module(minishard_gen_leader). 74 | 75 | %% Time between rounds of query from the leader 76 | -define(TAU, 5000). 77 | 78 | -export([start/6, 79 | start_link/6, 80 | leader_call/2, leader_call/3, leader_cast/2, 81 | call/2, call/3, cast/2, 82 | reply/2]). 83 | 84 | %% Query functions 85 | -export([alive/1, 86 | down/1, 87 | candidates/1, 88 | workers/1, 89 | broadcast/3, 90 | leader_node/1]). 91 | 92 | -export([system_continue/3, 93 | system_terminate/4, 94 | system_code_change/4, 95 | format_status/2, 96 | worker_announce/2 97 | ]). 98 | 99 | %% Internal exports 100 | -export([real_loop/4, real_safe_loop/4, real_mon_loop/2]). 101 | -export([init_ping_loop/2, ping_loop/3]). 102 | -export([init_it/6, 103 | print_event/3, 104 | send_checkleads/4 105 | ]). 106 | 107 | 108 | %% Notification control of candidate membership changes. `all' 109 | %% means that returns from the handle_DOWN/3 and elected/3 leader's events 110 | %% will be broadcast to all candidates. 111 | -type bcast_type() :: 'all' | 'sender'. 112 | 113 | -type option() :: {'workers', Workers::[node()]} 114 | | {'vardir', Dir::string()} 115 | | {'bcast_type', Type::bcast_type()} 116 | | {'heartbeat', Seconds::integer()} 117 | | {'seed_node', Seed::node()} 118 | . 119 | 120 | -type options() :: [option()]. 121 | 122 | -type status() :: 'elec1' | 'elec2' | 'wait' | 'joining' | 'worker' | 123 | 'waiting_worker' | 'norm'. 124 | 125 | %% A locally registered name 126 | -type name() :: atom(). 127 | 128 | %% A monitor ref 129 | -type mon_ref() :: reference(). 130 | 131 | -type server_ref() :: name() | {name(), node()} | pid(). 132 | 133 | %% Incarnation number 134 | -type incarn() :: non_neg_integer(). 135 | 136 | %% Logical clock 137 | -type lclock() :: non_neg_integer(). 138 | 139 | %% Node priority in the election 140 | -type priority() :: integer(). 141 | 142 | %% Election id 143 | -type elid() :: {priority(), incarn(), lclock()}. 144 | 145 | %% See gen_server. 146 | -type caller_ref() :: {pid(), reference()}. 147 | 148 | %% Opaque state of the gen_leader behaviour. 149 | -record(election, { 150 | leader = none :: 'none' | pid(), 151 | previous_leader = none :: 'none' | pid(), 152 | name :: name(), 153 | leadernode = none :: 'none' | node(), 154 | candidate_nodes = [] :: [node()], 155 | worker_nodes = [] :: [node()], 156 | down = [] :: [node()], 157 | monitored = [] :: [{mon_ref(), node()}], 158 | buffered = [] :: [{reference(), caller_ref()}], 159 | seed_node = none :: 'none' | node(), 160 | status :: status(), 161 | elid :: elid(), 162 | acks = [] :: [node()], 163 | work_down = [] :: [node()], 164 | cand_timer_int :: integer(), 165 | cand_timer :: term(), 166 | pendack :: node(), 167 | incarn :: incarn(), 168 | nextel :: integer(), 169 | %% all | one. When `all' each election event 170 | %% will be broadcast to all candidate nodes. 171 | bcast_type :: bcast_type() 172 | }). 173 | 174 | -opaque election() :: #election{}. 175 | 176 | -export_type([election/0]). 177 | 178 | -record(server, { 179 | parent, 180 | mod, 181 | state, 182 | pinger_proc = spawn_pinger_proc(), 183 | monitor_proc = spawn_monitor_proc(), 184 | debug :: [sys:dbg_opt()] 185 | }). 186 | 187 | %%% --------------------------------------------------- 188 | %%% Interface functions. 189 | %%% --------------------------------------------------- 190 | 191 | -callback init(any()) -> 192 | {ok, term()} 193 | | {stop, term()} 194 | | ignore 195 | | {'EXIT', term()} 196 | . 
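%% Illustrative note: a concrete options() value matching the option() type
%% above could be [{workers, ['w@host3']}, {vardir, "/var/lib/myapp"},
%% {bcast_type, all}, {heartbeat, 5}] (node, path and values are hypothetical).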
197 | -callback elected(term(), election(), pid() | undefined) -> 198 | {ok, term(), term()} 199 | | {reply, term(), term()} 200 | . 201 | -callback surrendered(term(), term(), election()) -> 202 | {ok, term()} . 203 | -callback handle_leader_call(term(), pid(), term(), election()) -> 204 | {reply, term(), term()} 205 | | {reply, term(), term(), term()} 206 | | {noreply, term()} 207 | | {stop, term(), term(), term()} 208 | . 209 | -callback handle_leader_cast(term(), term(), election()) -> 210 | {noreply, term()} 211 | | {ok, term(), term()} 212 | . 213 | -callback from_leader(term(), term(), election()) -> 214 | {noreply, term()} 215 | | {ok, term()} 216 | | {stop, term(), term()} 217 | | {'EXIT', term()} 218 | . 219 | -callback handle_call(term(), pid(), term(), election()) -> 220 | {noreply, term()} 221 | | {reply, term(), term()} 222 | | {ok, term()} 223 | | {stop, term(), term()} 224 | | {'EXIT', term()} 225 | . 226 | -callback handle_cast(term(), term(), election()) -> 227 | {noreply, term()} 228 | | {ok, term()} 229 | | {stop, term(), term()} 230 | | {'EXIT', term()} 231 | . 232 | -callback handle_DOWN(node(), term(), election()) -> 233 | {ok, term()} 234 | | {ok, term(), term()} 235 | . 236 | -callback handle_info(term(), term(), election()) -> 237 | {noreply, term()} 238 | | {ok, term()} 239 | | {stop, term(), term()} 240 | | {'EXIT', term()} 241 | . 242 | -callback terminate(term(), term()) -> 243 | any() . 244 | -callback code_change(term() | {down, term()}, term(), election(), term()) -> 245 | {ok, term()} 246 | | {error, term()} 247 | . 248 | 249 | -on_load(notify_new_code/0). 250 | notify_new_code() -> 251 | [P ! code_reloaded || P <- processes(), erlang:check_process_code(P, ?MODULE) == true], 252 | ok. 253 | 254 | -type start_ret() :: {'ok', pid()} | {'error', term()}. 255 | 256 | %% @doc Starts a gen_leader process without linking to the parent. 257 | %% @see start_link/6 258 | -spec start(Name::atom(), CandidateNodes::[node()], OptArgs::options(), 259 | Mod::module(), Arg::term(), Options::list()) -> start_ret(). 260 | start(Name, CandidateNodes, OptArgs, Mod, Arg, Options) 261 | when is_atom(Name), is_list(CandidateNodes), is_list(OptArgs) -> 262 | gen:start(?MODULE, nolink, {local, Name}, 263 | Mod, {CandidateNodes, OptArgs, Arg}, Options). 264 | 265 | %% @doc Starts a gen_leader process. 266 | %% 267 | %% 268 | %% 270 | %% 271 | %% 291 | %% 292 | %% 293 | %% 294 | %%
%%   Name           - The locally registered name of the process
%%   CandidateNodes - The names of nodes capable of assuming
%%                    a leadership role
%%   OptArgs        - Optional arguments given to `gen_leader':
%%       {workers, Workers}
%%           The names of nodes that will be part of the "cluster",
%%           but cannot ever assume a leadership role. Default: [].
%%       {vardir, Dir}
%%           Directory name used to store candidate's incarnation cookie.
%%           Default: "."
%%       {bcast_type, Type}
%%           When `Type' is `all' each election event (when a new
%%           candidate becomes visible to the leader) will be broadcast
%%           to all live candidate nodes. Each candidate will get
%%           a from_leader/3 callback. When `Type' is `sender', only
%%           the newly registered candidate will get the surrendered/3
%%           callback. Default: `sender'.
%%       {heartbeat, Seconds}
%%           Heartbeat timeout value used to send ping messages to inactive
%%           candidate nodes.
%%   Mod            - The name of the callback module
%%   Arg            - Argument passed on to Mod:init/1
%%   Options        - Same as gen_server's Options
%%
%% The list of candidates needs to be known from the start. Workers
%% could potentially be added at runtime, but no functionality to do
%% this is provided by this version.
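%%
%% Example call (illustrative; node names and the callback module my_leader_cb
%% are hypothetical):
%%   minishard_gen_leader:start_link(my_leader, ['a@host1', 'b@host2'],
%%                                   [{heartbeat, 5}, {bcast_type, sender}],
%%                                   my_leader_cb, [], []).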

299 | %% @end 300 | -spec start_link(Name::atom(), CandidateNodes::[node()], OptArgs::options(), 301 | Mod::module(), Arg::term(), Options::list()) -> start_ret(). 302 | start_link(Name, CandidateNodes, OptArgs, Mod, Arg, Options) 303 | when is_atom(Name), is_list(CandidateNodes), is_list(OptArgs) -> 304 | gen:start(?MODULE, link, {local, Name}, 305 | Mod, {CandidateNodes, OptArgs, Arg}, Options). 306 | 307 | %% Query functions to be used from the callback module 308 | 309 | %% @doc Returns list of alive nodes. 310 | -spec alive(election()) -> [node()]. 311 | alive(E) -> 312 | candidates(E) -- down(E). 313 | 314 | %% @doc Returns list of down nodes. 315 | -spec down(election()) -> [node()]. 316 | down(#election{down = Down}) -> 317 | Down. 318 | 319 | %% @doc Returns the current leader node. 320 | -spec leader_node(election()) -> node() | 'none'. 321 | leader_node(#election{leadernode=Leader}) -> 322 | Leader. 323 | 324 | %% @doc Returns a list of known candidates. 325 | -spec candidates(election()) -> [node()]. 326 | candidates(#election{candidate_nodes = Cands}) -> 327 | Cands. 328 | 329 | %% @doc Returns a list of known workers. 330 | -spec workers(election()) -> [node()]. 331 | workers(#election{worker_nodes = Workers}) -> 332 | Workers. 333 | 334 | %% Used by dynamically added workers. 335 | %% @hidden 336 | worker_announce(Name, Pid) -> 337 | Name ! {add_worker, Pid}, 338 | Name ! {heartbeat, Pid}. 339 | 340 | %% 341 | %% Make a call to a generic server. 342 | %% If the server is located at another node, that node will 343 | %% be monitored. 344 | %% If the client is trapping exits and is linked server termination 345 | %% is handled here (? Shall we do that here (or rely on timeouts) ?). 346 | %% 347 | %% @doc Equivalent to gen_server:call/2, but with a slightly 348 | %% different exit reason if something goes wrong. This function calls 349 | %% the gen_leader process exactly as if it were a gen_server 350 | %% (which, for practical purposes, it is.) 351 | %% @end 352 | -spec call(server_ref(), term()) -> term(). 353 | call(Name, Request) -> 354 | case catch gen:call(Name, '$gen_call', Request) of 355 | {ok, Res} -> 356 | Res; 357 | {'EXIT', Reason} -> 358 | exit({Reason, {?MODULE, local_call, [Name, Request]}}) 359 | end. 360 | 361 | %% @doc Equivalent to gen_server:call/3, but with a slightly 362 | %% different exit reason if something goes wrong. This function calls 363 | %% the gen_leader process exactly as if it were a gen_server 364 | %% (which, for practical purposes, it is.) 365 | %% @end 366 | -spec call(server_ref(), term(), integer()) -> term(). 367 | call(Name, Request, Timeout) -> 368 | case catch gen:call(Name, '$gen_call', Request, Timeout) of 369 | {ok, Res} -> 370 | Res; 371 | {'EXIT', Reason} -> 372 | exit({Reason, {?MODULE, local_call, [Name, Request, Timeout]}}) 373 | end. 374 | 375 | %% @doc Makes a call (similar to gen_server:call/2) to the 376 | %% leader. The call is forwarded via the local gen_leader instance, if 377 | %% that one isn't actually the leader. The client will exit if the 378 | %% leader dies while the request is outstanding. 379 | %%

%% This function uses gen:call/3, and is subject to the
%% same default timeout as e.g. gen_server:call/2.
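%%
%% Example call (illustrative; the request term is hypothetical and must be
%% handled by the callback module's handle_leader_call/4):
%%   Reply = minishard_gen_leader:leader_call(my_leader, {get, key}).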

381 | %% @end 382 | %% 383 | -spec leader_call(Name::server_ref(), Request::term()) -> term(). 384 | leader_call(Name, Request) -> 385 | case catch gen:call(Name, '$leader_call', Request) of 386 | {ok, {leader, reply, Res}} -> 387 | Res; 388 | {ok, {error, leader_died}} -> 389 | exit({leader_died, {?MODULE, leader_call, [Name, Request]}}); 390 | {'EXIT', Reason} -> 391 | exit({Reason, {?MODULE, leader_call, [Name, Request]}}) 392 | end. 393 | 394 | %% @doc Makes a call (similar to gen_server:call/3) to the 395 | %% leader. The call is forwarded via the local gen_leader instance, if 396 | %% that one isn't actually the leader. The client will exit if the 397 | %% leader dies while the request is outstanding. 398 | %% @end 399 | %% 400 | -spec leader_call(Name::server_ref(), Request::term(), 401 | Timeout::integer()) -> term(). 402 | leader_call(Name, Request, Timeout) -> 403 | case catch gen:call(Name, '$leader_call', Request, Timeout) of 404 | {ok, {leader, reply, Res}} -> 405 | Res; 406 | {'EXIT', Reason} -> 407 | exit({Reason, {?MODULE, leader_call, [Name, Request, Timeout]}}) 408 | end. 409 | 410 | 411 | %% @equiv gen_server:cast/2 412 | -spec cast(Name::server_ref(), Request::term()) -> 'ok'. 413 | cast(Name, Request) -> 414 | catch do_cast('$gen_cast', Name, Request), 415 | ok. 416 | 417 | %% @doc Similar to gen_server:cast/2 but will be forwarded to 418 | %% the leader via the local gen_leader instance. 419 | -spec leader_cast(Name::server_ref(), Request::term()) -> 'ok'. 420 | leader_cast(Name, Request) -> 421 | catch do_cast('$leader_cast', Name, Request), 422 | ok. 423 | 424 | 425 | do_cast(Tag, ServerRef, Request) -> 426 | ServerRef ! {Tag, Request}. 427 | 428 | 429 | %% @equiv gen_server:reply/2 430 | -spec reply(From::caller_ref(), Reply::term()) -> term(). 431 | reply({To, Tag}, Reply) -> 432 | catch To ! {Tag, Reply}. 433 | 434 | 435 | %%% --------------------------------------------------- 436 | %%% Initiate the new process. 437 | %%% Register the name using the Rfunc function 438 | %%% Calls the Mod:init/Args function. 439 | %%% Finally an acknowledge is sent to Parent and the main 440 | %%% loop is entered. 
441 | %%% --------------------------------------------------- 442 | %%% @hidden 443 | init_it(Starter, Parent, {local, Name}, Mod, {CandidateNodes, Workers, Arg}, Options) -> 444 | %% R13B passes {local, Name} instead of just Name 445 | init_it(Starter, Parent, Name, Mod, 446 | {CandidateNodes, Workers, Arg}, Options); 447 | init_it(Starter, self, Name, Mod, {CandidateNodes, OptArgs, Arg}, Options) -> 448 | init_it(Starter, self(), Name, Mod, 449 | {CandidateNodes, OptArgs, Arg}, Options); 450 | init_it(Starter, Parent, Name, Mod, {UnsortedCandidateNodes, OptArgs, Arg}, Options) -> 451 | Workers = proplists:get_value(workers, OptArgs, []), 452 | VarDir = proplists:get_value(vardir, OptArgs, "."), 453 | Interval = proplists:get_value(heartbeat, OptArgs, ?TAU div 1000) * 1000, 454 | BcastType = proplists:get_value(bcast_type, OptArgs, sender), 455 | Seed = proplists:get_value(seed, OptArgs, none), 456 | Debug = debug_options(Name, Options), 457 | CandidateNodes = lists:sort(UnsortedCandidateNodes), 458 | [spawn_link(net_adm, ping, [Node]) || Node <- CandidateNodes], timer:sleep(1000), 459 | AmCandidate = case lists:member(node(), CandidateNodes) of 460 | true -> true; 461 | false -> 462 | case lists:member(node(), Workers) of 463 | true -> false; 464 | false -> 465 | Seed =/= none 466 | end 467 | end, 468 | 469 | Election = #election{ 470 | candidate_nodes = CandidateNodes, 471 | worker_nodes = Workers, 472 | name = Name, 473 | nextel = 0, 474 | cand_timer_int = Interval, 475 | bcast_type = BcastType 476 | }, 477 | 478 | case {AmCandidate, lists:member(node(), Workers)} of 479 | {false, false} -> 480 | %% I am neither a candidate nor a worker - don't start this process 481 | error_logger:warning_msg("~w not started - node is not a candidate/worker\n", [Name]), 482 | proc_lib:init_ack(Starter, ignore), 483 | exit(normal); 484 | _ -> 485 | ok 486 | end, 487 | 488 | case {catch Mod:init(Arg), AmCandidate, Seed =/= none} of 489 | {{stop, Reason}, _, _} -> 490 | proc_lib:init_ack(Starter, {error, Reason}), 491 | exit(Reason); 492 | {ignore, _, _} -> 493 | proc_lib:init_ack(Starter, ignore), 494 | exit(normal); 495 | {{'EXIT', Reason}, _, _} -> 496 | proc_lib:init_ack(Starter, {error, Reason}), 497 | exit(Reason); 498 | {{ok, State}, true, false} -> 499 | Server = #server{parent = Parent, mod = Mod, 500 | state = State, debug = Debug}, 501 | Incarn = incarnation(VarDir, Name, node()), 502 | NewE = startStage1(Election#election{incarn = Incarn}, Server), 503 | proc_lib:init_ack(Starter, {ok, self()}), 504 | 505 | %% handle the case where there's only one candidate worker and we can't 506 | %% rely on DOWN messages to trigger the elected() call because we never get 507 | %% a DOWN for ourselves 508 | case CandidateNodes =:= [node()] of 509 | true -> 510 | %% there's only one candidate leader; us 511 | hasBecomeLeader(NewE, Server, {init}); 512 | false -> 513 | %% more than one candidate worker, continue as normal 514 | safe_loop(Server, candidate, NewE, {init}) 515 | end; 516 | {{ok, State}, true, true} -> 517 | Server = #server{parent = Parent, mod = Mod, 518 | state = State, debug = Debug}, 519 | Incarn = incarnation(VarDir, Name, node()), 520 | NewE1 = Election#election{incarn = Incarn, seed_node = Seed}, 521 | NewE = joinCluster(NewE1, Server), 522 | proc_lib:init_ack(Starter, {ok, self()}), 523 | safe_loop(Server, candidate_joining, NewE, {init}); 524 | {{ok, State}, false, HasSeed} -> 525 | proc_lib:init_ack(Starter, {ok, self()}), 526 | Candidates = case HasSeed of 527 | true -> 528 | {ok, C} 
= call({Name, Seed}, get_candidates), 529 | C; 530 | false -> CandidateNodes 531 | end, 532 | case lists:member(node(), Workers) of 533 | true -> 534 | rpc:multicall(Candidates, ?MODULE, 535 | worker_announce, [Name, node(self())]); 536 | false -> nop 537 | end, 538 | safe_loop(#server{parent = Parent, mod = Mod, 539 | state = State, debug = Debug}, 540 | waiting_worker, Election, {init}); 541 | {Else, _, _} -> 542 | Error = {bad_return_value, Else}, 543 | proc_lib:init_ack(Starter, {error, Error}), 544 | exit(Error) 545 | end. 546 | 547 | 548 | %%% --------------------------------------------------- 549 | %%% The MAIN loops. 550 | %%% --------------------------------------------------- 551 | 552 | % this is the election loop. Only specific messages related 553 | % to the election process are received. User messages, defined 554 | % in e.g. a callback module, are postponed until the (re)election\ 555 | % is complete. 556 | safe_loop(#server{} = Server, Role, #election{} = Election, PrevMsg) -> 557 | ?MODULE:real_safe_loop(Server, Role, Election, PrevMsg). 558 | 559 | real_safe_loop(#server{mod = Mod, state = State} = Server, Role, 560 | #election{name = Name} = E, _PrevMsg) -> 561 | receive 562 | code_reloaded = Msg -> 563 | safe_loop(Server, Role, E, Msg); 564 | {system, From, Req} -> 565 | #server{parent = Parent, debug = Debug} = Server, 566 | sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug, 567 | [safe, Server, Role, E]); 568 | {'EXIT', _, Reason} = Msg -> 569 | terminate(Reason, Msg, Server, Role, E); 570 | {update_candidates, _, _, _} = Msg -> 571 | safe_loop(Server, Role, E, Msg); 572 | {halt, T, From} = Msg -> 573 | NewE = halting(E, T, From, Server), 574 | From ! {ackLeader, T, self()}, 575 | safe_loop(Server, Role, NewE, Msg); 576 | {hasLeader, Ldr, T, _} = Msg when Role == candidate_joining -> 577 | NewE1 = mon_node(E, Ldr, Server), 578 | NewE = NewE1#election{elid = T, leadernode = node(Ldr)}, 579 | Ldr ! {isLeader, T, self()}, 580 | safe_loop(Server, Role, NewE, Msg); 581 | {hasLeader, Ldr, T, _} = Msg -> 582 | NewE1 = mon_node(E, Ldr, Server), 583 | case ( (E#election.status == elec2) and (E#election.acks /= []) ) of 584 | true -> 585 | lists:foreach( 586 | fun(Node) -> 587 | {Name, Node} ! {hasLeader, Ldr, T, self()} 588 | end, E#election.acks); 589 | false -> 590 | ok 591 | end, 592 | NewE = NewE1#election{elid = T, 593 | status = wait, 594 | leadernode = node(Ldr), 595 | down = E#election.down -- [node(Ldr)], 596 | acks = []}, 597 | Ldr ! {isLeader, T, self()}, 598 | safe_loop(Server, Role, NewE, Msg); 599 | {isLeader, T, From} = Msg -> 600 | From ! 
{notLeader, T, self()}, 601 | safe_loop(Server, Role, E, Msg); 602 | {notLeader, T, _} = Msg when Role == candidate_joining -> 603 | NewE = case E#election.elid == T of 604 | true -> 605 | joinCluster(E, Server); 606 | false -> 607 | E 608 | end, 609 | safe_loop(Server, Role, NewE, Msg); 610 | {notLeader, T, _} = Msg -> 611 | NewE = 612 | case ((E#election.status == wait) and (E#election.elid == T)) of 613 | true -> 614 | startStage1(E, Server); 615 | false -> 616 | E 617 | end, 618 | safe_loop(Server, Role, NewE, Msg); 619 | {ackLeader, T, From} = Msg -> 620 | NewE = 621 | case ( (E#election.status == elec2) and (E#election.elid == T) 622 | and (E#election.pendack == node(From)) ) of 623 | true -> 624 | continStage2( 625 | E#election{acks = [node(From)|E#election.acks]}, 626 | Server); 627 | false -> 628 | E 629 | end, 630 | hasBecomeLeader(NewE, Server, Msg); 631 | 632 | {ldr, Synch, T, _, _, From} = Msg when Role == waiting_worker -> 633 | case ( (T == E#election.elid) 634 | and (node(From) == E#election.leadernode)) of 635 | true -> 636 | NewE = E#election{ leader = From, status = worker }, 637 | {ok, NewState} = Mod:surrendered(State, Synch, NewE), 638 | loop(Server#server{state = NewState}, worker, NewE, Msg); 639 | false -> 640 | %% This should be a VERY special case... 641 | %% But doing nothing is the right thing! 642 | %% A DOWN message should arrive to solve this situation 643 | safe_loop(Server, Role, E, Msg) 644 | end; 645 | {ldr, Synch, T, Workers, Candidates, From} = Msg -> 646 | case ( ( (E#election.status == wait) or (E#election.status == joining) ) 647 | and (E#election.elid == T) ) of 648 | true -> 649 | timer:cancel(E#election.cand_timer), 650 | NewE1 = mon_node(E, From, Server), 651 | NewE2 = NewE1#election{leader = From, 652 | leadernode = node(From), 653 | previous_leader = E#election.leader, 654 | worker_nodes = Workers, 655 | candidate_nodes = Candidates, 656 | status = norm, 657 | cand_timer=undefined}, 658 | NewE = case Role == candidate_joining of 659 | true -> 660 | mon_nodes(NewE2, lesser(node(), candidates(NewE2)), Server); 661 | false -> NewE2 662 | end, 663 | {ok, NewState} = Mod:surrendered(State, Synch, NewE), 664 | loop(Server#server{state = NewState}, surrendered, NewE, Msg); 665 | false -> 666 | safe_loop(Server, Role, E, Msg) 667 | end; 668 | {normQ, T, From} = Msg -> 669 | NewE = 670 | case ( (E#election.status == elec1) 671 | or ( (E#election.status == wait) 672 | and (E#election.elid == T) ) ) of 673 | true -> 674 | NE = halting(E, T, From, Server), 675 | From ! {notNorm, T, self()}, 676 | NE; 677 | false -> 678 | E 679 | end, 680 | safe_loop(Server, Role, NewE, Msg); 681 | {notNorm, _, _} = Msg -> 682 | safe_loop(Server, Role, E, Msg); 683 | {workerAlive, T, From} = Msg -> 684 | NewE = 685 | case E#election.leadernode == none of 686 | true -> 687 | %% We should initiate activation, 688 | %% monitor the possible leader! 689 | NE = mon_node(E#election{leadernode = node(From), 690 | elid = T}, 691 | From, Server), 692 | From ! {workerIsAlive, T, self()}, 693 | NE; 694 | false -> 695 | %% We should acutally ignore this, the present activation 696 | %% will complete or abort first... 697 | E 698 | end, 699 | safe_loop(Server, Role, NewE, Msg); 700 | {workerIsAlive, _, _} = Msg -> 701 | %% If this happens, the activation process should abort 702 | %% This process is no longer the leader! 
703 | %% The sender will notice this via a DOWN message 704 | safe_loop(Server, Role, E, Msg); 705 | {election} = Msg -> 706 | %% We're already in an election, so this is likely an old message. 707 | safe_loop(Server, Role, E, Msg); 708 | {heartbeat, _Node} = Msg -> 709 | safe_loop(Server, Role, E, Msg); 710 | {candidate_timer} = Msg -> 711 | Down = E#election.down, 712 | Server#server.pinger_proc ! {set_ping_nodes, Down}, 713 | NewE = 714 | case Down of 715 | [] -> 716 | timer:cancel(E#election.cand_timer), 717 | E#election{cand_timer = undefined}; 718 | Down -> 719 | %% get rid of any queued up candidate_timers, since we just handled one 720 | flush_candidate_timers(), 721 | %% Some of potential master candidate nodes are down. 722 | %% Try to wake them up 723 | F = fun(N) -> 724 | erlang:send({E#election.name, N}, {heartbeat, node()}, [nosuspend, noconnect]) 725 | end, 726 | [F(N) || N <- Down, {ok, up} =/= net_kernel:node_info(N, state)], 727 | E 728 | end, 729 | safe_loop(Server, Role, halt_pendack(NewE), Msg); 730 | {checklead, Node} = Msg -> 731 | %% in the very exceptional case when a candidate comes up when the 732 | %% elected leader is *behind* it in the candidate list *and* all nodes 733 | %% before it in the candidate list are up, the candidate will be stuck in 734 | %% the safe_loop forever. This is because gen_leader relies on either 735 | %% one of the nodes being down, or the nodes responding to the heartbeat 736 | %% sent as part of stage1. However, nodes that are up but are NOT the 737 | %% leader do not respond to heartbeats. In this very exceptional case, 738 | %% we send a heartbeat to the leader in response to the checklead it 739 | %% sent us to bootstrap things and get out of this quagmire. 740 | case lists:member(Node, E#election.candidate_nodes) and 741 | (E#election.status == elec1) of 742 | true -> 743 | case ( pos(Node, E#election.candidate_nodes) > 744 | pos(node(), E#election.candidate_nodes) ) of 745 | true -> 746 | {Name, Node} ! 
{heartbeat, self()}; 747 | _ -> 748 | ok 749 | end; 750 | _ -> 751 | ok 752 | end, 753 | safe_loop(Server, Role, E, Msg); 754 | {ldr, 'DOWN', Node} = Msg when Role == waiting_worker -> 755 | NewE = 756 | case Node == E#election.leadernode of 757 | true -> 758 | E#election{leader = none, leadernode = none, 759 | previous_leader = E#election.leader, 760 | status = waiting_worker, 761 | monitored = []}; 762 | false -> 763 | E 764 | end, 765 | safe_loop(Server, Role, NewE, Msg); 766 | {ldr, 'DOWN', Node} = Msg when Role == candidate_joining -> 767 | Ldr = E#election.leadernode, 768 | Seed = E#election.seed_node, 769 | case Node of 770 | Seed -> 771 | case net_adm:ping(Ldr) of 772 | pong -> noop; 773 | pang -> 774 | terminate(seed_nodes_down, Msg, Server, Role, E) 775 | end; 776 | Ldr -> 777 | case net_adm:ping(Seed) of 778 | pong -> 779 | NewE = joinCluster(E, Server), 780 | safe_loop(Server, Role, NewE, Msg); 781 | pang -> 782 | terminate(seed_nodes_down, Msg, Server, Role, E) 783 | end 784 | end; 785 | {ldr, 'DOWN', Node} = Msg -> 786 | NewMon = lists:keydelete(Node, 2, E#election.monitored), 787 | NewE = 788 | case lists:member(Node, E#election.candidate_nodes) of 789 | true -> 790 | NewDown = [Node | E#election.down], 791 | E1 = E#election{down = NewDown, monitored = NewMon}, 792 | case ( pos(Node, E#election.candidate_nodes) < 793 | pos(node(), E#election.candidate_nodes) ) of 794 | true -> 795 | Lesser = lesser(node(), E#election.candidate_nodes), 796 | LesserIsSubset = (Lesser -- NewDown) == [], 797 | case ((E#election.status == wait) 798 | and (Node == E#election.leadernode)) of 799 | true -> 800 | startStage1(E1, Server); 801 | false -> 802 | case ((E#election.status == elec1) and 803 | LesserIsSubset) of 804 | true -> 805 | startStage2( 806 | E1#election{down = Lesser}, 807 | Server); 808 | false -> 809 | E1 810 | end 811 | end; 812 | false -> 813 | case ( (E#election.status == elec2) 814 | and (Node == E#election.pendack) ) of 815 | true -> 816 | continStage2(E1, Server); 817 | false -> 818 | case ( (E#election.status == wait) 819 | and (Node == E#election.leadernode)) of 820 | true -> 821 | startStage1(E1, Server); 822 | false -> 823 | E1 824 | end 825 | end 826 | end 827 | end, 828 | hasBecomeLeader(NewE, Server, Msg) 829 | end. 830 | 831 | 832 | % this is the regular operation loop. All messages are received, 833 | % unexpected ones are discarded. 834 | loop(#server{} = Server, Role, #election{} = Election, PrevMsg) -> 835 | ?MODULE:real_loop(Server, Role, Election, PrevMsg). 836 | 837 | real_loop(#server{parent = Parent, 838 | mod = Mod, 839 | state = State, 840 | debug = Debug} = Server, Role, 841 | #election{name = Name} = E, _PrevMsg) -> 842 | receive 843 | Msg -> 844 | case Msg of 845 | code_reloaded -> 846 | loop(Server, Role, E, Msg); 847 | {system, From, Req} -> 848 | sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug, 849 | [normal, Server, Role, E]); 850 | {'EXIT', Parent, Reason} -> 851 | terminate(Reason, Msg, Server, Role, E); 852 | 853 | {join, From} -> 854 | From ! 
{hasLeader, E#election.leader, E#election.elid, self()}, 855 | loop(Server, Role, E, Msg); 856 | {update_candidates, T, Candidates, _From} -> 857 | case E#election.elid == T of 858 | true -> 859 | NewE = E#election{candidate_nodes = Candidates}, 860 | loop(Server, Role, NewE, Msg); 861 | false -> 862 | loop(Server, Role, E, Msg) 863 | end; 864 | {halt, _, From} -> 865 | T = E#election.elid, 866 | case E#election.leader of 867 | From -> 868 | % The process we consider to be a leader seems to be in elec1 stage. 869 | % So we downgrade to it too 870 | NewE = startStage1(E, Server), 871 | safe_loop(Server, candidate, NewE, Msg); 872 | OtherLeader -> 873 | From ! {hasLeader, OtherLeader, T, self()}, 874 | loop(Server, Role, E, Msg) 875 | end; 876 | {hasLeader, _, _, _} -> 877 | loop(Server, Role, E, Msg); 878 | {isLeader, T, From} -> 879 | case (self() == E#election.leader) of 880 | true -> 881 | NewCandidates = 882 | case lists:member(node(From), candidates(E)) of 883 | true -> candidates(E); 884 | false -> 885 | NC = candidates(E) ++ [node(From)], 886 | lists:foreach( 887 | fun(Node) -> 888 | {Name, Node} ! 889 | {update_candidates, E#election.elid, 890 | NC, self()} 891 | end, candidates(E) -- lists:flatten([node()], down(E))), 892 | NC 893 | end, 894 | NewDown = E#election.down -- [node(From)], 895 | NewE1 = mon_node(E#election{down = NewDown}, 896 | From, Server), 897 | NewE = NewE1#election{candidate_nodes = NewCandidates}, 898 | NewState = call_elected(Mod, State, NewE, From), 899 | loop(Server#server{state = NewState}, Role, NewE, Msg); 900 | false -> 901 | From ! {notLeader, T, self()}, 902 | loop(Server, Role, E, Msg) 903 | end; 904 | {ackLeader, _, _} -> 905 | loop(Server, Role, E, Msg); 906 | {notLeader, _, _} -> 907 | loop(Server, Role, E, Msg); 908 | {ack, _, _} -> 909 | loop(Server, Role, E, Msg); 910 | {ldr, _, _, _, _} -> 911 | loop(Server, Role, E, Msg); 912 | {normQ, _, _} -> 913 | loop(Server, Role, E, Msg); 914 | {notNorm, T, From} -> 915 | case ( (E#election.leader == self()) 916 | and (E#election.elid == T) ) of 917 | true -> 918 | NewDown = E#election.down -- [node(From)], 919 | NewE = mon_node(E#election{down = NewDown}, 920 | From, Server), 921 | NewState = call_elected(Mod, State, NewE, From), 922 | loop(Server#server{state = NewState}, Role, NewE, Msg); 923 | false -> 924 | loop(Server, Role, E, Msg) 925 | end; 926 | {workerAlive, _, _} -> 927 | %% Do nothing if we get this from a new leader 928 | %% We will soon notice that the prev leader has died, and 929 | %%get the same message again when we are back in safe_loop! 930 | loop(Server, Role, E, Msg); 931 | {activateWorker, _, _, _} -> 932 | %% We ignore this, we are already active... 933 | %% It must be an old message! 934 | loop(Server, Role, E, Msg); 935 | {workerIsAlive, T, From} -> 936 | case ((T == E#election.elid) and (self() == E#election.leader)) of 937 | true -> 938 | NewDown = E#election.work_down -- [node(From)], 939 | NewE = mon_node(E#election{work_down = NewDown}, 940 | From, Server), 941 | NewState = call_elected(Mod, State, NewE, From), 942 | loop(Server#server{state = NewState}, Role, NewE, Msg); 943 | false -> 944 | loop(Server, Role, E, Msg) 945 | end; 946 | {election} -> 947 | %% Told to do an election because of a leader conflict. 
948 | E1 = startStage1(E, Server), 949 | safe_loop(Server, candidate, E1, Msg); 950 | {checklead, Node} -> 951 | case (E#election.leadernode == Node) of 952 | true -> 953 | %% Leaders match, nothing to do 954 | loop(Server, Role, E, Msg); 955 | false when E#election.leader == self() -> 956 | %% We're a leader and we disagree with the other 957 | %% leader. Tell everyone else to have an election. 958 | lists:foreach( 959 | fun(N) -> 960 | {Name, N} ! {election} 961 | end, E#election.candidate_nodes), 962 | %% Start participating in the election ourselves. 963 | E1 = startStage1(E, Server), 964 | safe_loop(Server, candidate, E1, Msg); 965 | false -> 966 | %% Not a leader, just wait to be told to do an 967 | %% election, if applicable. 968 | loop(Server, Role, E, Msg) 969 | end; 970 | {send_checklead} -> 971 | case (E#election.leader == self()) of 972 | true -> 973 | case E#election.down of 974 | [] -> 975 | loop(Server, Role, E, Msg); 976 | Down -> 977 | %% For any nodes which are down, send them 978 | %% a message comparing their leader to our 979 | %% own. This allows us to trigger an 980 | %% election after a netsplit is healed. 981 | spawn(?MODULE, send_checkleads, [Name, E#election.cand_timer_int, self(), Down]), 982 | loop(Server, Role, E, Msg) 983 | end; 984 | false -> 985 | loop(Server, Role, E, Msg) 986 | end; 987 | {heartbeat, _Node} -> 988 | case (E#election.leader == self()) of 989 | true -> 990 | Candidates = E#election.down -- [lists:nth(1, E#election.candidate_nodes)], 991 | lists:foreach( 992 | fun(N) -> 993 | Elid = E#election.elid, 994 | erlang:send({Name, N}, {normQ, Elid, self()}, [nosuspend, noconnect]) 995 | end, Candidates), 996 | lists:foreach( 997 | fun(N) -> 998 | Elid = E#election.elid, 999 | erlang:send({Name, N}, {workerAlive, Elid, self()}, [nosuspend, noconnect]) 1000 | end, E#election.work_down); 1001 | false -> 1002 | ok 1003 | end, 1004 | loop(Server, Role, E, Msg); 1005 | {candidate_timer} = Msg 1006 | when E#election.down =:= [] orelse (Role =/= elected andalso E#election.leadernode =/= none) -> 1007 | timer:cancel(E#election.cand_timer), 1008 | loop(Server, Role, E#election{cand_timer=undefined}, Msg); 1009 | {candidate_timer} = Msg -> 1010 | %% get rid of any queued up candidate_timers, 1011 | %% since we just handled one 1012 | flush_candidate_timers(), 1013 | %% This shouldn't happen in the leader - just ignore 1014 | loop(Server, Role, E, Msg); 1015 | {ldr, 'DOWN', Node} = Msg when Role == worker -> 1016 | case Node == E#election.leadernode of 1017 | true -> 1018 | NewE = E#election{ leader = none, leadernode = none, 1019 | status = waiting_worker, 1020 | monitored = []}, 1021 | safe_loop(Server, waiting_worker, NewE, Msg); 1022 | false -> 1023 | loop(Server, Role, E, Msg) 1024 | end; 1025 | {ldr, 'DOWN', Node} = Msg -> 1026 | NewMon = lists:keydelete(Node, 2, E#election.monitored), 1027 | case lists:member(Node, E#election.candidate_nodes) of 1028 | true -> 1029 | NewDown = [Node | E#election.down], 1030 | E1 = E#election{down = NewDown, monitored = NewMon}, 1031 | case (Node == E#election.leadernode) of 1032 | true -> 1033 | NewE = startStage1(E1, Server), 1034 | safe_loop(Server, candidate, NewE, Msg); 1035 | false when E#election.leadernode =:= node() -> 1036 | %% Serge: call handle_DOWN 1037 | {NewState, NewE} = 1038 | case (Server#server.mod):handle_DOWN(Node, Server#server.state, E1) of 1039 | {ok, NewState1} -> 1040 | {NewState1, E1}; 1041 | {ok, Synch, NewState1} -> 1042 | {NewState1, broadcast({from_leader, Synch}, E1)} 1043 | end, 1044 | %% 
We're the leader and one of our 1045 | %% candidates has gone down. Start sending 1046 | %% out checklead messages to the downed 1047 | %% candidates so we can quickly trigger an 1048 | %% election, if this was a netsplit when 1049 | %% its healed. 1050 | {Name, node()} ! {send_checklead}, 1051 | loop(Server#server{state=NewState}, Role, NewE, Msg); 1052 | false -> 1053 | loop(Server, Role, E1, Msg) 1054 | end; 1055 | false -> 1056 | %% I am the leader, 1057 | %% make sure the dead worker is in work_down. 1058 | E1 = E#election{ 1059 | monitored = NewMon, 1060 | work_down = [Node | 1061 | (E#election.work_down -- [Node])] 1062 | }, 1063 | loop(Server, Role, E1, Msg) 1064 | end; 1065 | {add_worker, WorkerNode} -> 1066 | case lists:member(WorkerNode, E#election.worker_nodes) of 1067 | false -> 1068 | {WNodes, DNodes} = {E#election.worker_nodes, E#election.work_down}, 1069 | 1070 | loop(Server, Role, E#election{worker_nodes=[WorkerNode|WNodes], 1071 | work_down=[WorkerNode|DNodes]}, 1072 | Msg); 1073 | true -> % Redundancy, meet the mirror 1074 | loop(Server, Role, E, Msg) 1075 | end; 1076 | _Msg when Debug == [] -> 1077 | handle_msg(Msg, Server, Role, E); 1078 | _Msg -> 1079 | Debug1 = sys:handle_debug(Debug, fun ?MODULE:print_event/3, 1080 | E#election.name, {in, Msg}), 1081 | handle_msg(Msg, Server#server{debug = Debug1}, Role, E) 1082 | end 1083 | end. 1084 | 1085 | %%----------------------------------------------------------------- 1086 | %% Callback functions for system messages handling. 1087 | %%----------------------------------------------------------------- 1088 | %% @hidden 1089 | system_continue(_Parent, _Debug, [safe, Server, Role, E]) -> 1090 | safe_loop(Server, Role, E, {}); 1091 | system_continue(_Parent, _Debug, [normal, Server, Role, E]) -> 1092 | loop(Server, Role, E, {}). 1093 | 1094 | %% @hidden 1095 | -spec system_terminate(any(), any(), any(), any()) -> no_return() . 1096 | system_terminate(Reason, _Parent, _Debug, [_Mode, Server, Role, E]) -> 1097 | terminate(Reason, [], Server, Role, E). 1098 | 1099 | %% @hidden 1100 | system_code_change([Mode, Server, Role, E], _Module, OldVsn, Extra) -> 1101 | #server{mod = Mod, state = State} = Server, 1102 | case catch Mod:code_change(OldVsn, State, E, Extra) of 1103 | {ok, NewState} -> 1104 | NewServer = Server#server{state = NewState}, 1105 | {ok, [Mode, NewServer, Role, E]}; 1106 | {ok, NewState, NewE} -> 1107 | NewServer = Server#server{state = NewState}, 1108 | {ok, [Mode, NewServer, Role, NewE]}; 1109 | Else -> Else 1110 | end. 1111 | 1112 | %%----------------------------------------------------------------- 1113 | %% Format debug messages. Print them as the call-back module sees 1114 | %% them, not as the real erlang messages. Use trace for that. 
1115 | %%----------------------------------------------------------------- 1116 | %% @hidden 1117 | print_event(Dev, {in, Msg}, Name) -> 1118 | case Msg of 1119 | {'$gen_call', {From, _Tag}, Call} -> 1120 | io:format(Dev, "*DBG* ~p got local call ~p from ~w~n", 1121 | [Name, Call, From]); 1122 | {'$leader_call', {From, _Tag}, Call} -> 1123 | io:format(Dev, "*DBG* ~p got global call ~p from ~w~n", 1124 | [Name, Call, From]); 1125 | {'$gen_cast', Cast} -> 1126 | io:format(Dev, "*DBG* ~p got local cast ~p~n", 1127 | [Name, Cast]); 1128 | {'$leader_cast', Cast} -> 1129 | io:format(Dev, "*DBG* ~p got global cast ~p~n", 1130 | [Name, Cast]); 1131 | _ -> 1132 | io:format(Dev, "*DBG* ~p got ~p~n", [Name, Msg]) 1133 | end; 1134 | print_event(Dev, {out, Msg, To, State}, Name) -> 1135 | io:format(Dev, "*DBG* ~p sent ~p to ~w, new state ~w~n", 1136 | [Name, Msg, To, State]); 1137 | print_event(Dev, {noreply, State}, Name) -> 1138 | io:format(Dev, "*DBG* ~p new state ~w~n", [Name, State]); 1139 | print_event(Dev, Event, Name) -> 1140 | io:format(Dev, "*DBG* ~p dbg ~p~n", [Name, Event]). 1141 | 1142 | 1143 | handle_msg({'$leader_call', From, Request} = Msg, 1144 | #server{mod = Mod, state = State} = Server, elected = Role, E) -> 1145 | case catch Mod:handle_leader_call(Request, From, State, E) of 1146 | {reply, Reply, NState} -> 1147 | NewServer = reply(From, {leader, reply, Reply}, 1148 | Server#server{state = NState}, Role, E), 1149 | loop(NewServer, Role, E, Msg); 1150 | {reply, Reply, Broadcast, NState} -> 1151 | NewE = broadcast({from_leader, Broadcast}, E), 1152 | NewServer = reply(From, {leader, reply, Reply}, 1153 | Server#server{state = NState}, Role, 1154 | NewE), 1155 | loop(NewServer, Role, NewE, Msg); 1156 | {noreply, NState} = Reply -> 1157 | NewServer = handle_debug(Server#server{state = NState}, 1158 | Role, E, Reply), 1159 | loop(NewServer, Role, E, Msg); 1160 | {stop, Reason, Reply, NState} -> 1161 | {'EXIT', R} = 1162 | (catch terminate(Reason, Msg, 1163 | Server#server{state = NState}, 1164 | Role, E)), 1165 | reply(From, Reply), 1166 | exit(R); 1167 | Other -> 1168 | handle_common_reply(Other, Msg, Server, Role, E) 1169 | end; 1170 | handle_msg({from_leader, Cmd} = Msg, 1171 | #server{mod = Mod, state = State} = Server, Role, E) -> 1172 | NewE = check_candidates(E), 1173 | handle_common_reply(catch Mod:from_leader(Cmd, State, NewE), 1174 | Msg, Server, Role, NewE); 1175 | handle_msg({'$leader_call', From, Request} = Msg, Server, Role, 1176 | #election{buffered = Buffered, leader = Leader} = E) -> 1177 | Ref = make_ref(), 1178 | Leader ! 
{'$leader_call', {self(), Ref}, Request}, 1179 | NewBuffered = [{Ref, From}|Buffered], 1180 | loop(Server, Role, E#election{buffered = NewBuffered}, Msg); 1181 | handle_msg({Ref, {leader, reply, Reply}} = Msg, Server, Role, 1182 | #election{buffered = Buffered} = E) -> 1183 | {value, {_, From}} = lists:keysearch(Ref, 1, Buffered), 1184 | El = E#election{buffered = lists:keydelete(Ref, 1, Buffered)}, 1185 | 1186 | NewServer = reply(From, {leader, reply, Reply}, Server, Role, El), 1187 | 1188 | loop(NewServer, Role, El, Msg); 1189 | handle_msg({'$gen_call', From, get_candidates} = Msg, Server, Role, E) -> 1190 | NewServer = reply(From, {ok, candidates(E)}, Server, Role, E), 1191 | loop(NewServer, Role, E, Msg); 1192 | handle_msg({'$gen_call', From, Request} = Msg, 1193 | #server{mod = Mod, state = State} = Server, Role, E) -> 1194 | case catch Mod:handle_call(Request, From, State, E) of 1195 | {reply, Reply, NState} -> 1196 | NewServer = reply(From, Reply, 1197 | Server#server{state = NState}, Role, E), 1198 | loop(NewServer, Role, E, Msg); 1199 | {noreply, NState} = Reply -> 1200 | NewServer = handle_debug(Server#server{state = NState}, 1201 | Role, E, Reply), 1202 | loop(NewServer, Role, E, Msg); 1203 | {stop, Reason, Reply, NState} -> 1204 | {'EXIT', R} = 1205 | (catch terminate(Reason, Msg, Server#server{state = NState}, 1206 | Role, E)), 1207 | reply(From, Reply), 1208 | exit(R); 1209 | Other -> 1210 | handle_common_reply(Other, Msg, Server, Role, E) 1211 | end; 1212 | handle_msg({'$gen_cast', Msg} = Cast, 1213 | #server{mod = Mod, state = State} = Server, Role, E) -> 1214 | handle_common_reply(catch Mod:handle_cast(Msg, State, E), 1215 | Cast, Server, Role, E); 1216 | handle_msg({'$leader_cast', Msg} = Cast, 1217 | #server{mod = Mod, state = State} = Server, elected = Role, E) -> 1218 | case catch Mod:handle_leader_cast(Msg, State, E) of 1219 | {noreply, NState} -> 1220 | NewServer = handle_debug(Server#server{state = NState}, 1221 | Role, E, Cast), 1222 | loop(NewServer, Role, E, Cast); 1223 | {ok, Broadcast, NState} -> 1224 | NewE = broadcast({from_leader, Broadcast}, E), 1225 | NewServer = handle_debug(Server#server{state = NState}, 1226 | Role, E, Cast), 1227 | loop(NewServer, Role, NewE, Cast); 1228 | Other -> 1229 | handle_common_reply(Other, Msg, Server, Role, E) 1230 | end; 1231 | handle_msg({'$leader_cast', Msg} = Cast, Server, Role, 1232 | #election{leader = Leader} = E) -> 1233 | Leader ! {'$leader_cast', Msg}, 1234 | loop(Server, Role, E, Cast); 1235 | 1236 | handle_msg(Msg, #server{mod = Mod, state = State} = Server, Role, E) -> 1237 | handle_common_reply(catch Mod:handle_info(Msg, State, E), 1238 | Msg, Server, Role, E). 1239 | 1240 | 1241 | handle_common_reply(Reply, Msg, Server, Role, E) -> 1242 | case Reply of 1243 | {noreply, NState} -> 1244 | NewServer = handle_debug(Server#server{state = NState}, 1245 | Role, E, Reply), 1246 | loop(NewServer, Role, E, Msg); 1247 | {ok, NState} -> 1248 | NewServer = handle_debug(Server#server{state = NState}, 1249 | Role, E, Reply), 1250 | loop(NewServer, Role, E, Msg); 1251 | {stop, Reason, NState} -> 1252 | terminate(Reason, Msg, Server#server{state = NState}, Role, E); 1253 | {'EXIT', Reason} -> 1254 | terminate(Reason, Msg, Server, Role, E); 1255 | _ -> 1256 | terminate({bad2_return_value, Reply}, Msg, Server, Role, E) 1257 | end. 1258 | 1259 | 1260 | reply({To, Tag}, Reply, #server{state = State} = Server, Role, E) -> 1261 | reply({To, Tag}, Reply), 1262 | handle_debug(Server, Role, E, {out, Reply, To, State}). 
1263 | 1264 | 1265 | handle_debug(#server{debug = []} = Server, _Role, _E, _Event) -> 1266 | Server; 1267 | handle_debug(#server{debug = Debug} = Server, _Role, E, Event) -> 1268 | Debug1 = sys:handle_debug(Debug, fun ?MODULE:print_event/3, 1269 | E#election.name, Event), 1270 | Server#server{debug = Debug1}. 1271 | 1272 | %%% --------------------------------------------------- 1273 | %%% Terminate the server. 1274 | %%% --------------------------------------------------- 1275 | 1276 | terminate(Reason, Msg, #server{mod = Mod, 1277 | state = State, 1278 | debug = Debug} = _Server, _Role, 1279 | #election{name = Name, cand_timer = Timer} = _E) -> 1280 | timer:cancel(Timer), 1281 | case catch Mod:terminate(Reason, State) of 1282 | {'EXIT', R} -> 1283 | error_info(R, Name, Msg, State, Debug), 1284 | exit(R); 1285 | _ -> 1286 | case Reason of 1287 | normal -> 1288 | exit(normal); 1289 | shutdown -> 1290 | exit(shutdown); 1291 | _ -> 1292 | error_info(Reason, Name, Msg, State, Debug), 1293 | exit(Reason) 1294 | end 1295 | end. 1296 | 1297 | %% Maybe we shouldn't do this? We have the crash report... 1298 | error_info(Reason, Name, Msg, State, Debug) -> 1299 | error_logger:format("** Generic leader ~p terminating \n" 1300 | "** Last message in was ~p~n" 1301 | "** When Server state == ~p~n" 1302 | "** Reason for termination == ~n** ~p~n", 1303 | [Name, Msg, State, Reason]), 1304 | sys:print_log(Debug), 1305 | ok. 1306 | 1307 | %%% --------------------------------------------------- 1308 | %%% Misc. functions. 1309 | %%% --------------------------------------------------- 1310 | 1311 | opt(Op, [{Op, Value}|_]) -> 1312 | {ok, Value}; 1313 | opt(Op, [_|Options]) -> 1314 | opt(Op, Options); 1315 | opt(_, []) -> 1316 | false. 1317 | 1318 | debug_options(Name, Opts) -> 1319 | case opt(debug, Opts) of 1320 | {ok, Options} -> dbg_options(Name, Options); 1321 | _ -> dbg_options(Name, []) 1322 | end. 1323 | 1324 | dbg_options(Name, []) -> 1325 | Opts = 1326 | case init:get_argument(generic_debug) of 1327 | error -> 1328 | []; 1329 | _ -> 1330 | [log, statistics] 1331 | end, 1332 | dbg_opts(Name, Opts); 1333 | dbg_options(Name, Opts) -> 1334 | dbg_opts(Name, Opts). 1335 | 1336 | dbg_opts(Name, Opts) -> 1337 | case catch sys:debug_options(Opts) of 1338 | {'EXIT', _} -> 1339 | error_logger:format("~p: ignoring erroneous debug options - ~p~n", 1340 | [Name, Opts]), 1341 | []; 1342 | Dbg -> 1343 | Dbg 1344 | end. 1345 | 1346 | %%----------------------------------------------------------------- 1347 | %% Status information 1348 | %%----------------------------------------------------------------- 1349 | %% @hidden 1350 | format_status(Opt, StatusData) -> 1351 | [PDict, SysState, Parent, Debug, [_Mode, Server, _Role, E]] = StatusData, 1352 | Header = lists:concat(["Status for generic server ", E#election.name]), 1353 | Log = sys:get_debug(log, Debug, []), 1354 | #server{mod = Mod, state = State} = Server, 1355 | Specific = 1356 | case erlang:function_exported(Mod, format_status, 2) of 1357 | true -> 1358 | case catch apply(Mod, format_status, [Opt, [PDict, State]]) of 1359 | {'EXIT', _} -> [{data, [{"State", State}]}]; 1360 | Else -> Else 1361 | end; 1362 | _ -> 1363 | [{data, [{"State", State}]}] 1364 | end, 1365 | [{header, Header}, 1366 | {data, [{"Status", SysState}, 1367 | {"Parent", Parent}, 1368 | {"Logged events", Log}]} | 1369 | Specific]. 
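%% The debug plumbing above follows the standard 'sys' conventions, so a running
%% instance can be inspected like any gen_* process once it was started with a
%% non-empty debug option list (or with the emulator flag handled by
%% dbg_options/2). An illustrative shell session; RegName stands for the locally
%% registered name of the leader process and is a placeholder:
%%
%%   1> sys:get_status(RegName).   %% rendered by format_status/2 above
%%   2> sys:trace(RegName, true).  %% events are printed via print_event/3
%%   3> sys:log(RegName, get).     %% fetch the log kept by handle_debug/4
%%
%% Starting the node with 'erl -generic_debug ...' makes dbg_options/2 enable
%% [log, statistics] for every instance even when no debug option was given.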
1370 | 1371 | 1372 | %%----------------------------------------------------------------- 1373 | %% Leader-election functions 1374 | %%----------------------------------------------------------------- 1375 | 1376 | %% Corresponds to startStage1 in Figure 1 in the Stoller-article 1377 | startStage1(E, Server) -> 1378 | NodePos = pos(node(), E#election.candidate_nodes), 1379 | Elid = {NodePos, E#election.incarn, E#election.nextel}, 1380 | NewE = E#election{ 1381 | elid = Elid, 1382 | nextel = E#election.nextel + 1, 1383 | down = [], 1384 | status = elec1}, 1385 | case NodePos of 1386 | 1 -> 1387 | startStage2(NewE, Server); 1388 | _ -> 1389 | mon_nodes(NewE, lesser(node(), E#election.candidate_nodes), Server) 1390 | end. 1391 | 1392 | %% Corresponds to startStage2 1393 | startStage2(E, Server) -> 1394 | continStage2(E#election{status = elec2, pendack = node(), acks = []}, 1395 | Server). 1396 | 1397 | continStage2(E, Server) -> 1398 | case (pos(E#election.pendack, E#election.candidate_nodes) 1399 | < length(E#election.candidate_nodes)) of 1400 | true -> 1401 | Pendack = next(E#election.pendack, E#election.candidate_nodes), 1402 | NewE = mon_nodes(E, [Pendack], Server), 1403 | halt_pendack(NewE#election{pendack = Pendack}); 1404 | false -> 1405 | %% I am the leader 1406 | E#election{leader = self(), 1407 | leadernode = node(), 1408 | previous_leader = E#election.leader, 1409 | status = norm} 1410 | end. 1411 | 1412 | halt_pendack(#election{pendack = undefined} = E) -> 1413 | E; 1414 | halt_pendack(#election{name = Name, elid = ElId, pendack = Pendack} = E) -> 1415 | erlang:send({Name, Pendack}, {halt, ElId, self()}, [nosuspend, noconnect]), 1416 | E. 1417 | 1418 | %% corresponds to Halting 1419 | halting(E, T, From, Server) -> 1420 | NewE = mon_node(E, From, Server), 1421 | NewE#election{elid = T, 1422 | status = wait, 1423 | leadernode = node(From), 1424 | down = E#election.down -- [node(From)] 1425 | }. 1426 | 1427 | 1428 | joinCluster(E, Server) -> 1429 | Pid = {E#election.name, E#election.seed_node}, 1430 | Pid ! {join, self()}, 1431 | NewE = mon_node(E, Pid, Server), 1432 | NewE#election{status = joining}. 1433 | 1434 | 1435 | %%% checks if the proc has become the leader, if so switch to loop 1436 | hasBecomeLeader(E, Server, Msg) -> 1437 | case ((E#election.status == norm) and (E#election.leader == self())) of 1438 | true -> 1439 | {ok, Synch, NewState} = 1440 | (Server#server.mod):elected(Server#server.state, E, undefined), 1441 | lists:foreach( 1442 | fun(Node) -> 1443 | {E#election.name, Node} ! 1444 | {ldr, Synch, E#election.elid, workers(E), candidates(E), self()} 1445 | end, E#election.acks), 1446 | 1447 | %% Make sure we will try to contact all workers! 1448 | NewE = E#election{work_down = E#election.worker_nodes}, 1449 | 1450 | %% io:format("==> I am the leader! (acks: ~200p)\n", [E#election.acks]), 1451 | %% Set the internal timeout (corresponds to Periodically) 1452 | timer:send_after(E#election.cand_timer_int, {heartbeat, node()}), 1453 | {E#election.name, node()} ! 
{send_checklead}, 1454 | 1455 | %% trigger handle_DOWN callback if previous leader is down 1456 | PrevLeader = E#election.previous_leader, 1457 | {NewState2, NewE2} = 1458 | case PrevLeader of 1459 | none -> {NewState, NewE}; 1460 | Pid when is_pid(Pid) -> 1461 | case lists:member(node(PrevLeader), down(E)) of 1462 | false -> {NewState, NewE}; 1463 | true -> 1464 | case (Server#server.mod):handle_DOWN(node(PrevLeader), NewState, NewE) of 1465 | {ok, NS} -> {NS, NewE}; 1466 | {ok, Synch2, NS} -> 1467 | {NS, broadcast({from_leader, Synch2}, NewE)} 1468 | end 1469 | end 1470 | end, 1471 | 1472 | %% (It's meaningful only when I am the leader!) 1473 | loop(Server#server{state = NewState2}, elected, NewE2, Msg); 1474 | false -> 1475 | safe_loop(Server, candidate, E, Msg) 1476 | end. 1477 | 1478 | 1479 | %%% 1480 | %%% No one checks incarnation type, we just check equality 1481 | %%% So it is OK to just use timestamp here 1482 | %%% 1483 | incarnation(_VarDir, _RegName, _Node) -> 1484 | os:timestamp(). 1485 | 1486 | 1487 | broadcast(Msg, #election{monitored = Monitored} = E) -> 1488 | %% This function is used for broadcasts, 1489 | %% and we make sure only to broadcast to already known nodes. 1490 | ToNodes = [N || {_, N} <- Monitored], 1491 | broadcast(Msg, ToNodes, E). 1492 | 1493 | broadcast({from_leader, Msg}, ToNodes, E) -> 1494 | lists:foreach( 1495 | fun(Node) -> 1496 | {E#election.name, Node} ! {from_leader, Msg} 1497 | end, ToNodes), 1498 | E. 1499 | 1500 | 1501 | lesser(_, []) -> 1502 | []; 1503 | lesser(N, [N|_]) -> 1504 | []; 1505 | lesser(N, [M|Ms]) -> 1506 | [M|lesser(N, Ms)]. 1507 | 1508 | next(_, []) -> 1509 | no_val; 1510 | next(N, [N|Ms]) -> 1511 | lists:nth(1, Ms); 1512 | next(N, [_|Ms]) -> 1513 | next(N, Ms). 1514 | 1515 | pos(_, []) -> 1516 | 100000; 1517 | pos(N1, [N1|_]) -> 1518 | 1; 1519 | pos(N1, [_|Ns]) -> 1520 | 1+pos(N1, Ns). 1521 | 1522 | check_candidates(#election{down = Down} = E) -> 1523 | NewDown = [N || N <- Down, {ok, up} =/= net_kernel:node_info(N, state)], 1524 | E#election{down = NewDown}. 1525 | 1526 | broadcast_candidates(E, Synch, IgnoreNodes) -> 1527 | case E#election.bcast_type of 1528 | all -> 1529 | Nodes = [N || {_, N} <- E#election.monitored] -- IgnoreNodes, 1530 | broadcast({from_leader, Synch}, Nodes, E); 1531 | _ -> 1532 | ok 1533 | end. 1534 | 1535 | call_elected(Mod, State, E, From) when is_pid(From) -> 1536 | case Mod:elected(State, E, node(From)) of 1537 | {ok, Synch, NewState} -> 1538 | From ! {ldr, Synch, E#election.elid, workers(E), candidates(E), self()}, 1539 | broadcast_candidates(E, Synch, [From]), 1540 | NewState; 1541 | {reply, Synch, NewState} -> 1542 | From ! {ldr, Synch, E#election.elid, workers(E), candidates(E), self()}, 1543 | NewState 1544 | end. 1545 | 1546 | 1547 | %% Start monitor a bunch of candidate nodes 1548 | mon_nodes(E, Nodes, Server) -> 1549 | Server#server.pinger_proc ! {set_ping_nodes, Nodes}, 1550 | E1 = 1551 | case E#election.cand_timer of 1552 | undefined -> 1553 | {ok, TRef} = timer:send_interval(E#election.cand_timer_int, {candidate_timer}), 1554 | E#election{cand_timer = TRef}; 1555 | _ -> 1556 | E 1557 | end, 1558 | FromNode = node(), 1559 | lists:foldl( 1560 | fun(ToNode, El) -> 1561 | Pid = {El#election.name, ToNode}, 1562 | erlang:send(Pid, {heartbeat, FromNode}, [nosuspend, noconnect]), 1563 | mon_node(El, Pid, Server) 1564 | end, E1, Nodes -- [node()]). 
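%% The election stages above are driven entirely by the ordering of
%% candidate_nodes: pos/2 gives a node's 1-based priority, lesser/2 lists the
%% higher-priority nodes it must monitor and defer to in stage 1, and next/2
%% yields the node to be acknowledged next in stage 2 (continStage2/2).
%% A worked example on an illustrative candidate list [a@h, b@h, c@h]:
%%
%%   pos(b@h,    [a@h, b@h, c@h])  %% -> 2
%%   lesser(b@h, [a@h, b@h, c@h])  %% -> [a@h]
%%   next(b@h,   [a@h, b@h, c@h])  %% -> c@h
%%
%% so only the first listed candidate enters stage 2 immediately; every other
%% node first monitors the candidates ahead of it, which is exactly the split
%% made on NodePos in startStage1/2.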
1565 | 1566 | %% Start monitoring one Process 1567 | mon_node(E, {_RegName, NodeName} = Proc, Server) -> 1568 | do_mon_node(E, Proc, NodeName, Server); 1569 | 1570 | mon_node(E, Proc, Server) when is_pid(Proc) -> 1571 | do_mon_node(E, Proc, node(Proc), Server). 1572 | 1573 | do_mon_node(E, Proc, NodeName, Server) -> 1574 | case lists:keymember(NodeName, 2, E#election.monitored) of 1575 | true -> E; 1576 | false -> 1577 | {Ref, Node} = do_monitor(Proc, Server), 1578 | E#election{monitored = [{Ref, Node} | E#election.monitored]} 1579 | end. 1580 | 1581 | spawn_monitor_proc() -> 1582 | Parent = self(), 1583 | proc_lib:spawn_link(?MODULE, real_mon_loop, [Parent, []]). 1584 | 1585 | 1586 | do_monitor(Proc, #server{monitor_proc = P}) -> 1587 | P ! {self(), {monitor, Proc}}, 1588 | receive 1589 | {mon_reply, Reply} -> 1590 | Reply 1591 | after 10000 -> % can take quite a while to receive mon_reply if the node is down 1592 | erlang:error(timeout) 1593 | end. 1594 | 1595 | mon_loop(Parent, Refs) -> 1596 | ?MODULE:real_mon_loop(Parent, Refs). 1597 | 1598 | real_mon_loop(Parent, Refs) -> 1599 | receive 1600 | code_reloaded -> 1601 | mon_loop(Parent, Refs); 1602 | {From, Req} -> 1603 | mon_loop(Parent, mon_handle_req(Req, From, Refs)); 1604 | {'DOWN', Ref, _, _, _} -> 1605 | mon_loop(Parent, mon_handle_down(Ref, Parent, Refs)); 1606 | Msg -> 1607 | io:fwrite("mon_loop with parent: ~p refs: ~p received: ~p~n", [Parent, Refs, Msg]), 1608 | mon_loop(Parent, Refs) 1609 | end. 1610 | 1611 | mon_handle_req({monitor, P}, From, Refs) -> 1612 | Node = case P of 1613 | {_Name, N} -> N; 1614 | Pid when is_pid(Pid) -> node(Pid) 1615 | end, 1616 | case lists:keyfind(Node, 2, Refs) of 1617 | {Ref, _} -> 1618 | mon_reply(From, {Ref, Node}), 1619 | Refs; 1620 | false -> 1621 | Ref = erlang:monitor(process, P), 1622 | mon_reply(From, {Ref, Node}), 1623 | [{Ref, Node}|Refs] 1624 | end. 1625 | 1626 | mon_handle_down(Ref, Parent, Refs) -> 1627 | case lists:keytake(Ref, 1, Refs) of 1628 | {value, {_, Node}, Refs1} -> 1629 | Parent ! {ldr, 'DOWN', Node}, 1630 | Refs1; 1631 | false -> 1632 | Refs 1633 | end. 1634 | 1635 | 1636 | mon_reply(From, Reply) -> 1637 | From ! {mon_reply, Reply}. 1638 | 1639 | 1640 | spawn_pinger_proc() -> 1641 | Parent = self(), 1642 | proc_lib:spawn_link(?MODULE, init_ping_loop, [Parent, []]). 1643 | 1644 | init_ping_loop(Parent, NodesToPing) -> 1645 | ping_loop(Parent, set_ping_timer(0), NodesToPing). 1646 | 1647 | set_ping_timer(Timeout) -> 1648 | erlang:start_timer(Timeout, self(), {do_ping}). 1649 | 1650 | %% To avoid leader blocking on message send, we ping nodes here, 1651 | %% and leader sends messages to down nodes with [nosuspend, noconnect] 1652 | ping_loop(Parent, TRef, NodesToPing) -> 1653 | receive 1654 | code_reloaded -> 1655 | ?MODULE:ping_loop(Parent, TRef, NodesToPing); 1656 | {set_ping_nodes, NewNodesToPing} -> 1657 | init_ping_loop(Parent, NewNodesToPing); 1658 | {timeout, TRef, _} -> 1659 | NewTRef = set_ping_timer(1000), 1660 | [net_adm:ping(Node) || Node <- NodesToPing], 1661 | ?MODULE:ping_loop(Parent, NewTRef, NodesToPing); 1662 | {timeout, _, _} -> 1663 | ?MODULE:ping_loop(Parent, TRef, NodesToPing); 1664 | Msg -> 1665 | io:fwrite("ping_loop with parent: ~p nodes: ~p received: ~p~n", [Parent, NodesToPing, Msg]), 1666 | ?MODULE:ping_loop(Parent, TRef, NodesToPing) 1667 | end. 
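%% Both helper processes above exist to keep the gen_leader loop responsive:
%% the monitor proc owns all erlang:monitor/2 references and the pinger proc
%% performs the potentially slow net_adm:ping/1 calls. Their message protocol,
%% sketched with illustrative placeholders (RegName, Node):
%%
%%   MonProc ! {self(), {monitor, {RegName, Node}}},
%%   receive {mon_reply, {Ref, Node}} -> ok end,      %% as in do_monitor/2
%%   %% a 'DOWN' for Ref is forwarded to gen_leader as {ldr, 'DOWN', Node}
%%
%%   PingerProc ! {set_ping_nodes, [Node]},           %% as in mon_nodes/3
%%   %% the pinger then re-pings that list every 1000 ms on its own timer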
1668 | 1669 | 1670 | 1671 | %% The heartbeat messages sent to the downed nodes when the candidate_timer 1672 | %% message is received can take a very long time in the case of a partitioned 1673 | %% network (7 seconds in my testing). Since the candidate_timer is generated 1674 | %% by a send_interval, this means many candidate_timer messages can accumulate 1675 | %% in the mailbox. This function is used to clear them out after handling one 1676 | %% of the candidate_timers, so gen_leader doesn't spend all its time sending 1677 | %% heartbeats. 1678 | flush_candidate_timers() -> 1679 | receive 1680 | {candidate_timer} -> 1681 | flush_candidate_timers() 1682 | after 1683 | 0 -> 1684 | ok 1685 | end. 1686 | 1687 | %% Sending messages to disconnected nodes can take a long time, so instead of 1688 | %% doing this in the gen_leader process, do it here in a new proc so that 1689 | %% gen_leader can remain responsive. 1690 | %% Reschedule the next round of checkleads after this round completes, 1691 | %% since sending the messages can take longer than the time between rounds. 1692 | send_checkleads(Name, Time, GlProc, Down) -> 1693 | Node = node(), 1694 | [{Name, N} ! {checklead, Node} || N <- Down], 1695 | erlang:send_after(Time, GlProc, {send_checklead}). 1696 | 1697 | 1698 | --------------------------------------------------------------------------------