├── test ├── test.config ├── minishard_test.erl └── minishard_detest.erl ├── elvis ├── .gitignore ├── src ├── minishard.app.src ├── minishard.erl ├── minishard_demo.erl ├── minishard_sup.erl ├── minishard_shard.erl ├── minishard_allocator.erl └── minishard_gen_leader.erl ├── Makefile ├── AUTHORS ├── LICENSE ├── README.md └── elvis.config /test/test.config: -------------------------------------------------------------------------------- 1 | % -*- mode: erlang -*- 2 | [ 3 | ]. 4 | -------------------------------------------------------------------------------- /elvis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yandex/minishard/HEAD/elvis -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | deps/* 2 | ebin 3 | .*.sw? 4 | .erlang.mk* 5 | *.d 6 | erl_crash.dump 7 | test/*.ebin 8 | .detest 9 | log 10 | -------------------------------------------------------------------------------- /src/minishard.app.src: -------------------------------------------------------------------------------- 1 | {application, minishard, [ 2 | {description, ""}, 3 | {vsn, "0.1.0"}, 4 | {id, "git"}, 5 | {modules, []}, 6 | {registered, []}, 7 | {applications, [ 8 | kernel, 9 | stdlib 10 | ]}, 11 | {mod, {minishard, []}}, 12 | {env, []} 13 | ]}. 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = minishard 2 | COMPILE_FIRST = minishard_gen_leader 3 | 4 | NID := 1 5 | SHELL_OPTS = -sname minishard$(NID) -setcookie minishard_demo -s minishard -boot start_sasl -sasl errlog_type error 6 | 7 | BUILD_DEPS = elvis_mk 8 | DEP_PLUGINS = elvis_mk 9 | TEST_DEPS = detest 10 | 11 | dep_elvis_mk = git https://github.com/inaka/elvis.mk.git 784e41bcb91 12 | 13 | include erlang.mk 14 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | The following authors have created the source code of "minishard" 2 | published and distributed by YANDEX LLC as the owner: 3 | 4 | Danil Zagoskin 5 | 6 | 7 | The list of authors and contributors, who created the source code of 8 | "minishard_gen_leader" module ("src/minishard_gen_leader.erl"), which is 9 | a part of "minishard", you may find in a comment at the beginning of that file. 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, YANDEX LLC 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 20 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /src/minishard.erl: -------------------------------------------------------------------------------- 1 | -module(minishard). 2 | -behaviour(application). 3 | 4 | -type score() :: number(). 5 | -export_type([score/0]). 6 | 7 | %%% Behavior callbacks 8 | % Configuration 9 | -callback shard_count(ClusterName :: atom()) -> integer(). 10 | -callback cluster_nodes(ClusterName :: atom()) -> [node()]. 11 | % Life cycle 12 | -callback allocated(ClusterName :: atom(), ShardNum :: integer()) -> {ok, State :: any()}. 13 | -callback score(State :: any()) -> score(). 14 | -callback prolonged(Loser :: pid(), State :: any()) -> {ok, NextState :: any()}. 15 | -callback deallocated(Winner :: pid() | undefined, State :: any()) -> any(). 16 | 17 | % API 18 | -export([start/0]). 19 | -export([join/2, get_node/2, get_manager/2]). 20 | -export([status/1, status/2]). 21 | 22 | % Application callbacks 23 | -export([start/2, stop/1]). 24 | 25 | 26 | start() -> 27 | application:start(?MODULE, permanent). 28 | 29 | 30 | start(_Type, _Args) -> 31 | minishard_sup:start_link(root). 32 | 33 | stop(_State) -> 34 | ok. 35 | 36 | 37 | %% Join the cluster 38 | join(ClusterName, CallbackMod) -> 39 | minishard_sup:join_cluster(ClusterName, CallbackMod). 40 | 41 | %% Resolve a shard number to the shard manager pid 42 | get_manager(ClusterName, ShardNum) -> 43 | minishard_allocator:get_manager(ClusterName, ShardNum). 44 | 45 | %% Resolve a shard number to the node currently hosting it 46 | get_node(ClusterName, ShardNum) -> 47 | minishard_allocator:get_node(ClusterName, ShardNum). 48 | 49 | %% Cluster status 50 | status(ClusterName) -> 51 | minishard_allocator:cluster_status(ClusterName). 52 | 53 | status(ClusterName, _CallbackMod) -> % old API compatibility 54 | {Status, _Counts, NodeMap} = status(ClusterName), 55 | {Status, NodeMap}. 56 | -------------------------------------------------------------------------------- /src/minishard_demo.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_demo). 2 | -behavior(minishard). 3 | 4 | -export([cluster_nodes/1, shard_count/1]). 5 | -export([allocated/2, score/1, prolonged/2, deallocated/2]). 6 | 7 | % Generate a fake node list by changing the number in the local node name 8 | cluster_nodes(_) -> 9 | BinNode = atom_to_binary(node(), latin1), 10 | [make_node(BinNode, N) || N <- lists:seq(1, 5)]. 11 | 12 | % Shard count, needed to monitor the cluster for degradation 13 | shard_count(_) -> 14 | 2.
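% For example (with a hypothetical node name): if the local node is 'minishard1@myhost',
% cluster_nodes/1 above returns ['minishard1@myhost', 'minishard2@myhost', 'minishard3@myhost',
% 'minishard4@myhost', 'minishard5@myhost'], rewriting the digits before the "@" via make_node/2 below.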
15 | 16 | % Helper for cluster node names generation 17 | make_node(BinPattern, N) -> 18 | IOLNode = re:replace(BinPattern, "[0-9]+@", [integer_to_list(N), "@"]), 19 | binary_to_atom(iolist_to_binary(IOLNode), latin1). 20 | 21 | 22 | -record(demo, {name, num, alloc_time}). 23 | 24 | allocated(Cluster, Num) -> 25 | error_logger:info_msg("Woo-hoo!!! Minishard demo cluster (name ~w) has allocated us as shard #~w", [Cluster, Num]), 26 | {ok, #demo{name = Cluster, num = Num, alloc_time = os:timestamp()}}. 27 | 28 | score(#demo{alloc_time = AllocTime}) -> 29 | % Let the score be number of seconds we are active 30 | timer:now_diff(os:timestamp(), AllocTime) div 10000000. 31 | 32 | prolonged(Loser, #demo{name = Cluster, num = Num} = State) -> 33 | error_logger:info_msg("Wheeeeeee!!! We still own minishard cluster ~w shard #~w, and ~w at ~w is loser!", 34 | [Cluster, Num, Loser, node(Loser)]), 35 | {ok, State}. 36 | 37 | deallocated(undefined, #demo{name = Cluster, num = Num}) -> 38 | error_logger:info_msg("Bad news: minishard cluster ~w has degraded, so we lose the shard #~w :(", [Cluster, Num]), 39 | ok; 40 | deallocated(Winner, #demo{name = Cluster, num = Num}) -> 41 | error_logger:info_msg("Bad news: ~w at ~w has won the competition for minishard cluster ~w shard #~w :(", 42 | [Winner, node(Winner), Cluster, Num]), 43 | ok. 44 | -------------------------------------------------------------------------------- /test/minishard_test.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_test). 2 | 3 | -export([set_config/1, set_config/2, start/0, start/1, map/0, map/1]). 4 | 5 | -behavior(minishard). 6 | -export([shard_count/1, cluster_nodes/1, score/1, prolonged/2, allocated/2, deallocated/2]). 7 | 8 | 9 | start() -> 10 | start(test). 11 | 12 | start(Name) -> 13 | application:ensure_all_started(lager), 14 | application:ensure_all_started(minishard), 15 | %error_logger:info_msg("Minishard config: ~120p~n", [application:get_all_env(minishard)]), 16 | {ok, _} = minishard:join(Name, ?MODULE), 17 | ok. 18 | 19 | 20 | map() -> 21 | map(test). 22 | map(Name) -> 23 | minishard_allocator:shard_map(Name). 24 | 25 | 26 | set_config(Config) -> 27 | set_config(test, Config). 28 | 29 | set_config(Name, Config) -> 30 | application:load(minishard), 31 | ClustersConf = application:get_env(minishard, clusters, []), 32 | NewClustersConf = lists:ukeymerge(1, [{Name, Config}], ClustersConf), 33 | application:set_env(minishard, clusters, NewClustersConf), 34 | ok. 35 | 36 | get_config(Name) -> 37 | ClustersConf = application:get_env(minishard, clusters, []), 38 | proplists:get_value(Name, ClustersConf, []). 39 | 40 | get_conf_value(Name, Key, Default) -> 41 | MyConf = get_config(Name), 42 | proplists:get_value(Key, MyConf, Default). 43 | 44 | shard_count(Name) -> 45 | get_conf_value(Name, shard_count, 3). 46 | cluster_nodes(Name) -> 47 | get_conf_value(Name, nodes, [node()]). 48 | 49 | -record(shaman, { 50 | name, 51 | shard, 52 | started_at 53 | }). 54 | 55 | allocated(Name, Shard) -> 56 | {ok, #shaman{name = Name, shard = Shard, started_at = os:timestamp()}}. 57 | 58 | score(#shaman{started_at = Started}) -> 59 | timer:now_diff(os:timestamp(), Started)/1000000. 60 | 61 | prolonged(_, State) -> 62 | {ok, State}. 63 | 64 | deallocated(_, State) -> 65 | {ok, State}. 
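%% Typical use from a test node's shell (a sketch; the node names and shard count below are
%% arbitrary examples, not part of this module): configure the cluster, join it, inspect the map.
%%   1> minishard_test:set_config(test, [{shard_count, 3}, {nodes, ['mst_1@host', 'mst_2@host', 'mst_3@host']}]).
%%   2> minishard_test:start(test).
%%   3> minishard_test:map().
%% map/0 returns the allocator's shard map, e.g. #{1 => 'mst_1@host', 2 => 'mst_2@host', 3 => 'mst_3@host'}
%% once every shard is allocated; shards not yet allocated map to 'undefined'.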
66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Minishard — lightweight sharding for distributed Erlang applications 2 | ======= 3 | 4 | Goal 5 | ----- 6 | Sometimes you need to store a large amount of temporary data that is available from any node of an Erlang cluster. 7 | Storing all the data on a single node may cause memory problems and makes that node a single point of failure. 8 | Replication is sometimes even worse: with an unreliable network (e.g. multiple datacenters) you get 9 | inconsistencies and merge conflicts. 10 | 11 | Minishard keeps a configured number of unique shards allocated on the nodes of your cluster, restores cluster connectivity, 12 | resolves possible conflicts after a netsplit, and notifies your application when the cluster degrades. 13 | 14 | How minishard is supposed to work 15 | ----------- 16 | A minishard node is started with two arguments: ```ClusterName :: atom()``` and ```CallbackMod :: module()```. 17 | 18 | Before joining the cluster, minishard first needs to know the list of nodes it consists of. This is done by calling ```CallbackMod:cluster_nodes(ClusterName) -> [node()]```. 19 | 20 | Minishard also gets the number of required shards by calling ```CallbackMod:shard_count(ClusterName) -> integer()```. 21 | 22 | Minishard uses a modified version of ```gen_leader``` for leader election. 23 | The leader is responsible for all shard allocations and deallocations. 24 | 25 | After a manager has been selected as a shard owner, ```CallbackMod:allocated(ClusterName, ShardId) -> {ok, State}``` is called. 26 | Any actions needed to initialize a shard should be performed in this function. 27 | ```State``` is any term you want; it will be passed to the other callbacks. 28 | 29 | When a conflict occurs, minishard decides which shard instance should be shut down. To do that, each instance is queried for its score by calling ```CallbackMod:score(State) -> number()```. 30 | The instance with the highest score is the winner and remains allocated; the loser is deallocated. The corresponding callbacks are called: 31 | * ```CallbackMod:prolonged(LoserPid, State) -> {ok, NewState}``` 32 | * ```CallbackMod:deallocated(WinnerPid, State) -> any()``` (final cleanup; the return value is ignored). If deallocation happens due to cluster degradation or shutdown, ```WinnerPid``` is ```undefined```. 33 | 34 | If you need a complex migration algorithm, implement it yourself, e.g. by calling ```gen_server:enter_loop``` on deallocation. A complete example callback module can be found in ```src/minishard_demo.erl```. 35 | 36 | -------------------------------------------------------------------------------- /src/minishard_sup.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/1]). 5 | -export([join_cluster/2, get_pid/2]). 6 | -export([init/1]). 7 | 8 | %% For embedding a minishard cluster in any supervision tree 9 | -export([cluster_child_spec/2, cluster_internal_specs/2]). 10 | 11 | sup_name(root) -> 12 | minishard; 13 | sup_name({cluster, ClusterName, _}) -> 14 | list_to_atom("minishard_" ++ atom_to_list(ClusterName) ++ "_sup").
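%% For example (hypothetical cluster name): sup_name({cluster, my_cluster, my_callback})
%% returns 'minishard_my_cluster_sup', while sup_name(root) returns 'minishard'.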
15 | 16 | % Helper: get pid of started infrastructure part 17 | get_pid(undefined, _) -> 18 | throw(undefined_cluster); 19 | get_pid(ClusterName, shard) when is_atom(ClusterName) -> 20 | strict_whereis(minishard_shard:name(ClusterName)); 21 | get_pid(ClusterName, PartName) when is_atom(ClusterName), is_atom(PartName) -> 22 | Sup = sup_name({cluster, ClusterName, undefined}), 23 | Children = supervisor:which_children(Sup), 24 | case lists:keyfind(PartName, 1, Children) of 25 | {PartName, Pid, _, _} -> Pid; 26 | false -> undefined 27 | end. 28 | 29 | strict_whereis(ProcessName) when is_atom(ProcessName) -> 30 | Pid = whereis(ProcessName), 31 | Pid == undefined andalso error(no_cluster), 32 | Pid. 33 | 34 | start_link(Arg) -> 35 | supervisor:start_link({local, sup_name(Arg)}, ?MODULE, Arg). 36 | 37 | cluster_child_spec(ClusterName, CallbackMod) when is_atom(ClusterName), is_atom(CallbackMod) -> 38 | {ClusterName, 39 | {?MODULE, start_link, [{cluster, ClusterName, CallbackMod}]}, 40 | permanent, 10000, supervisor, []}. 41 | 42 | allocator_spec(ClusterName, CallbackMod) -> 43 | {allocator, 44 | {minishard_allocator, start_link, [ClusterName, CallbackMod]}, 45 | permanent, 1000, worker, [minishard_allocator]}. 46 | 47 | shard_spec(ClusterName, CallbackMod) -> 48 | {shard, 49 | {minishard_shard, start_link, [ClusterName, CallbackMod]}, 50 | permanent, 1000, worker, [minishard_shard]}. 51 | 52 | cluster_internal_specs(ClusterName, CallbackMod) when is_atom(ClusterName), is_atom(CallbackMod) -> 53 | [allocator_spec(ClusterName, CallbackMod), shard_spec(ClusterName, CallbackMod)]. 54 | 55 | 56 | join_cluster(ClusterName, CallbackMod) -> 57 | supervisor:start_child(sup_name(root), cluster_child_spec(ClusterName, CallbackMod)). 58 | 59 | 60 | 61 | init(root) -> 62 | {ok, {{one_for_one, 1, 5}, []}}; 63 | 64 | init({cluster, ClusterName, CallbackMod}) -> 65 | {ok, {{one_for_all, 5, 10}, cluster_internal_specs(ClusterName, CallbackMod)}}. 
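%% Example of embedding a minishard cluster into an application's own supervision tree
%% (a sketch; my_cluster, my_callback and the restart intensity are hypothetical). The child
%% spec returned by cluster_child_spec/2 can be added to any supervisor's child list:
%%
%%   init([]) ->
%%       MinishardSpec = minishard_sup:cluster_child_spec(my_cluster, my_callback),
%%       {ok, {{one_for_one, 5, 10}, [MinishardSpec]}}.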
66 | -------------------------------------------------------------------------------- /elvis.config: -------------------------------------------------------------------------------- 1 | % -*- mode: erlang -*- 2 | [ 3 | { 4 | elvis, 5 | [ 6 | {config, 7 | [#{dirs => ["src"], 8 | filter => "*.erl", 9 | rules => [{elvis_style, line_length, #{limit => 120, 10 | skip_comments => false}}, 11 | {elvis_style, no_tabs}, 12 | {elvis_style, no_trailing_whitespace}, 13 | {elvis_style, macro_names}, 14 | {elvis_style, operator_spaces, #{ 15 | rules => [{right, ","}, {right, "++"}, {left, "++"}]}}, 16 | {elvis_style, nesting_level, #{level => 3, ignore => [minishard_gen_leader]}}, 17 | {elvis_style, god_modules, #{limit => 40}}, 18 | {elvis_style, no_if_expression}, 19 | {elvis_style, invalid_dynamic_call, #{ignore => [minishard_allocator, minishard_shard, elvis, minishard_gen_leader]}}, 20 | {elvis_style, used_ignored_variable}, 21 | {elvis_style, no_behavior_info}, 22 | { 23 | elvis_style, 24 | module_naming_convention, 25 | #{regex => "^([a-z][a-z0-9]*_?)*(_SUITE)?$", 26 | ignore => []} 27 | }, 28 | {elvis_style, no_spec_with_records}, 29 | {elvis_style, dont_repeat_yourself, #{min_complexity => 10, ignore => [minishard_gen_leader]}} 30 | ] 31 | }, 32 | #{dirs => ["."], 33 | filter => "Makefile", 34 | rules => [{elvis_project, no_deps_master_erlang_mk, #{ignore => []}}, 35 | {elvis_project, protocol_for_deps_erlang_mk, #{ignore => []}}] 36 | }, 37 | #{dirs => ["."], 38 | filter => "rebar.config", 39 | rules => [{elvis_project, no_deps_master_rebar, #{ignore => []}}, 40 | {elvis_project, git_for_deps_rebar, #{ignore => []}}] 41 | }, 42 | #{dirs => ["."], 43 | filter => "elvis.config", 44 | rules => [{elvis_project, old_configuration_format}] 45 | } 46 | ] 47 | } 48 | ] 49 | } 50 | ]. 51 | -------------------------------------------------------------------------------- /test/minishard_detest.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_detest). 2 | 3 | % mandatory detest functions 4 | -export([cfg/1,run/1,setup/1,cleanup/1]). 5 | 6 | cfg(_TestArgs) -> 7 | % Compile test callback module 8 | compile:file("test/minishard_test", [{outdir, "test"}]), 9 | [ 10 | % {global_cfg,[{"test/nodes.yaml",[{fixedvals,KV}]},"test/withoutparams.yaml"]}, 11 | {per_node_cfg, ["test/test.config"]}, 12 | {cmd , "-pa test -s minishard -config test/test.config"}, 13 | {wait_for_app, minishard}, 14 | {nodes , []}, 15 | {erlenv , [{"ERL_LIBS","deps"}]} 16 | ]. 17 | 18 | 19 | setup(_Param) -> 20 | ok. 21 | 22 | cleanup(_Param) -> 23 | ok. 24 | 25 | 26 | run(Param) -> 27 | random:seed(os:timestamp()), 28 | lager:info("Script params: ~p", [Param]), 29 | ClusterSize = proplists:get_value(cluster_size, Param, 10), 30 | 31 | 32 | IdsToStart = lists:seq(1, ClusterSize), 33 | % Warning: do not start new nodes with pmap because they will get a same address 34 | NodeMap = maps:from_list([{Id, detest:add_node(node_spec(Id))} || Id <- IdsToStart]), 35 | 36 | lager:info("Started nodes, map: ~120p", [NodeMap]), 37 | 38 | Nodes = maps:values(NodeMap), 39 | MST_Config = [{shard_count, 3}, {nodes, Nodes}], 40 | 41 | configure_and_start(test, Nodes, MST_Config), 42 | 43 | timer:sleep(1200 + 50*ClusterSize), % Let the leader allocate all shards 44 | 45 | lager:info("initial shard map: ~120p", [get_validate_map(Nodes, allocated)]), 46 | 47 | kill_standby_nodes_test(test, Nodes, MST_Config, 10), 48 | 49 | kill_leader_test(test, Nodes, MST_Config, 10), 50 | 51 | ok. 
52 | 53 | 54 | configure_and_start(Name, Nodes, MST_Config) -> 55 | ConfigResults = multicall(Nodes, minishard_test, set_config, [Name, MST_Config]), 56 | [{_, ok}] = lists:ukeysort(2, ConfigResults), 57 | StartResults = multicall(Nodes, minishard_test, start, [Name]), 58 | [{_, ok}] = lists:ukeysort(2, StartResults), 59 | ok. 60 | 61 | %% Helper: parallel map for faster cluster startup 62 | pmap(Function, List) -> 63 | S = self(), 64 | Pids = [spawn_link(fun() -> execute(S, Function, El) end) || El <- List], 65 | gather(Pids). 66 | 67 | execute(Recv, Function, Element) -> 68 | Recv ! {self(), Function(Element)}. 69 | 70 | gather([]) -> []; 71 | gather([H|T]) -> 72 | receive 73 | {H, Ret} -> [Ret|gather(T)] 74 | end. 75 | 76 | %% This multicall is not compatible with rpc:multicall: 77 | %% it takes only MFA and returns a tuplelist where results are tagged with node names 78 | multicall(Nodes, M, F, A) -> 79 | pmap(fun(Node) -> 80 | {Node, rpc:call(Node, M, F, A)} 81 | end, Nodes). 82 | 83 | 84 | get_validate_map(Nodes, ExpectedState) -> 85 | Map = get_same_map(Nodes), 86 | ok = validate_map(Map, ExpectedState), 87 | Map. 88 | 89 | get_same_map(Nodes) -> 90 | NodeMaps = multicall(Nodes, minishard_test, map, [test]), 91 | case lists:ukeysort(2, NodeMaps) of 92 | [{_, #{} = Map}] -> 93 | Map; 94 | _Other -> 95 | error({different_maps, NodeMaps}) 96 | end. 97 | 98 | validate_map(Map, allocated) -> 99 | case missing_shards(Map) of 100 | [] -> ok; 101 | [_|_] = Missing -> {error, {missing, Missing}} 102 | end. 103 | 104 | missing_shards(Map) -> 105 | maps:fold(fun collect_missing_shards/3, [], Map). 106 | 107 | collect_missing_shards(Shard, undefined, Acc) -> [Shard|Acc]; 108 | collect_missing_shards(_Shard, _, Acc) -> Acc. 109 | 110 | 111 | kill_standby_nodes_test(_Name, _Nodes, _Config, 0) -> 112 | ok; 113 | kill_standby_nodes_test(Name, Nodes, MST_Config, Iterations) -> 114 | ClusterSize = length(Nodes), 115 | Map0 = get_validate_map(Nodes, allocated), 116 | BusyNodes = maps:values(Map0), 117 | KillCandidates = Nodes -- BusyNodes, 118 | NodesToKill = lists:filter(fun(_) -> crypto:rand_uniform(0, 2) == 1 end, KillCandidates), 119 | RemainingNodes = Nodes -- NodesToKill, 120 | 121 | lager:info("kill_standby_nodes_test: (~w iters left) killing ~120p", [Iterations, NodesToKill]), 122 | pmap(fun(Node) -> detest:stop_node(Node) end, NodesToKill), 123 | 124 | timer:sleep(1200), 125 | Map0 = get_validate_map(RemainingNodes, allocated), 126 | 127 | lager:info("kill_standby_nodes_test: (~w iters left) starting back ~120p", [Iterations, NodesToKill]), 128 | pmap(fun(Node) -> detest:add_node(node_spec(Node)) end, NodesToKill), 129 | configure_and_start(Name, NodesToKill, MST_Config), 130 | 131 | timer:sleep(1200 + 50*ClusterSize), % Let the leader allocate all shards 132 | Map0 = get_validate_map(RemainingNodes, allocated), 133 | 134 | kill_standby_nodes_test(Name, Nodes, MST_Config, Iterations - 1). 
135 | 136 | 137 | kill_leader_test(_Name, _Nodes, _Config, 0) -> 138 | ok; 139 | kill_leader_test(Name, Nodes, MST_Config, Iterations) -> 140 | SeenLeaders = multicall(Nodes, minishard_allocator, leader, [Name]), 141 | [{_, Leader}] = lists:ukeysort(2, SeenLeaders), 142 | RemainingNodes = Nodes -- [Leader], 143 | 144 | lager:info("kill_leader_test: (~w iters left) killing ~120p", [Iterations, Leader]), 145 | detest:stop_node(Leader), 146 | 147 | timer:sleep(1200), 148 | Map0 = get_validate_map(RemainingNodes, allocated), 149 | NewSeenLeaders = multicall(RemainingNodes, minishard_allocator, leader, [Name]), 150 | [{_, _NewLeader}] = lists:ukeysort(2, NewSeenLeaders), 151 | 152 | lager:info("kill_leader_test: (~w iters left) starting back ~120p", [Iterations, Leader]), 153 | detest:add_node(node_spec(Leader)), 154 | configure_and_start(Name, [Leader], MST_Config), 155 | 156 | timer:sleep(1200), % Let the leader allocate all shards 157 | Map0 = get_validate_map(RemainingNodes, allocated), 158 | 159 | kill_leader_test(Name, Nodes, MST_Config, Iterations - 1). 160 | 161 | node_spec(Node) when is_atom(Node) -> 162 | {ok, [N], "@" ++ _} = io_lib:fread("mst_~d", atom_to_list(Node)), 163 | node_spec(N); 164 | node_spec(N) when is_integer(N) -> 165 | [{id, N}, {name, list_to_atom("mst_" ++ integer_to_list(N))}]. 166 | 167 | -------------------------------------------------------------------------------- /src/minishard_shard.erl: -------------------------------------------------------------------------------- 1 | -module(minishard_shard). 2 | -behavior(gen_server). 3 | 4 | -export([start_link/2, name/1, status/1, info/1]). 5 | -export([set_status/2]). 6 | 7 | %% gen_server callbacks 8 | -export([init/1, handle_info/2, handle_cast/2, handle_call/3, code_change/3, terminate/2]). 9 | 10 | %% Conflict resolution: get score 11 | -export([get_score_or_kill/1]). 12 | 13 | name(ClusterName) when is_atom(ClusterName) -> 14 | list_to_atom("minishard_" ++ atom_to_list(ClusterName) ++ "_shard"). 15 | 16 | 17 | start_link(ClusterName, CallbackMod) when is_atom(ClusterName), is_atom(CallbackMod) -> 18 | State = seed_state(ClusterName, CallbackMod), 19 | gen_server:start_link({local, name(ClusterName)}, ?MODULE, State, []). 20 | 21 | 22 | %% Set shard status (for use by allocator) 23 | set_status(ShardPid, Status) when is_pid(ShardPid) -> 24 | gen_server:call(ShardPid, {set_status, Status}). 25 | 26 | 27 | %% Get shard status 28 | status(ClusterOrShard) when ClusterOrShard /= undefined -> 29 | {dictionary, Dict} = process_info(local_pid(ClusterOrShard), dictionary), 30 | proplists:get_value(status, Dict, undefined). 31 | 32 | info(ClusterOrShard) when ClusterOrShard /= undefined -> 33 | {dictionary, Dict} = process_info(local_pid(ClusterOrShard), dictionary), 34 | case proplists:get_value(status, Dict, undefined) of 35 | active -> 36 | {active, #{ 37 | since => proplists:get_value(active_since, Dict), 38 | shard => proplists:get_value(shard, Dict) 39 | }}; 40 | Inactive -> 41 | {Inactive, #{}} 42 | end. 43 | 44 | 45 | local_pid(ManagerPid) when is_pid(ManagerPid) -> 46 | ManagerPid; 47 | local_pid(ClusterName) when is_atom(ClusterName), ClusterName /= undefined -> 48 | whereis(name(ClusterName)). 49 | 50 | 51 | -record(shard, { 52 | cluster_name, 53 | callback_mod, 54 | callback_state, 55 | max_number, 56 | my_number, 57 | monitors, 58 | recheck_timer, 59 | status 60 | }). 
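%% The status field is one of: starting (before the initial join), idle, standby or active;
%% the last three are driven by the allocator via minishard_allocator:bind/1 and set_status/2.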
61 | 62 | seed_state(ClusterName, CallbackMod) -> 63 | #shard{ 64 | cluster_name = ClusterName, 65 | callback_mod = CallbackMod, 66 | max_number = CallbackMod:shard_count(ClusterName), 67 | my_number = undefined, 68 | monitors = #{}, 69 | status = starting 70 | }. 71 | 72 | 73 | init(#shard{} = State) -> 74 | {ok, export_status(State), 0}. 75 | 76 | 77 | %% Initial status discovery. Later watcher will notify us about status changes 78 | handle_info(timeout, #shard{status = starting} = State0) -> 79 | {noreply, schedule_recheck(join_cluster(State0#shard{status = idle}))}; 80 | 81 | handle_info({timeout, Timer, recheck_ownership}, #shard{recheck_timer = Timer, cluster_name = ClusterName} = State) -> 82 | % Ensure our allocator feels OK and responds to calls (did not stall) 83 | _ = minishard_allocator:leader(ClusterName), 84 | % OK, we did not crash, so allocator is running. 85 | % Now let's see if we missed deallocation 86 | handle_ownership_recheck(State#shard{recheck_timer = undefined}); 87 | 88 | handle_info(Unexpected, #shard{cluster_name = ClusterName} = State) -> 89 | error_logger:warning_msg("Minishard shard ~w got unexpected message: ~9999p", [ClusterName, Unexpected]), 90 | {noreply, State}. 91 | 92 | 93 | handle_call(score, _From, #shard{status = active, 94 | callback_mod = CallbackMod, callback_state = CallbackState} = State) -> 95 | Score = CallbackMod:score(CallbackState), 96 | {reply, Score, State}; 97 | 98 | handle_call({set_status, {active, ShardNum}}, _From, #shard{status = active, my_number = ShardNum} = State) -> 99 | {reply, ok, State}; 100 | handle_call({set_status, {active, OtherShardNum}}, _From, #shard{status = active, my_number = ShardNum} = State) -> 101 | {stop, {wont_change_shard, ShardNum, OtherShardNum}, {error, shard_change}, State}; 102 | handle_call({set_status, {active, ShardNum}}, _From, #shard{} = State) -> 103 | {reply, ok, activate(ShardNum, State)}; 104 | handle_call({set_status, Inactive}, _From, #shard{status = active} = State) 105 | when Inactive == idle; Inactive == standby -> 106 | % This should not happen - allocator should send an allocation event 107 | NewState = callback_deallocate(undefined, State), 108 | {stop, {shutdown, suddenly_deallocated}, ok, idle(NewState)}; 109 | handle_call({set_status, idle}, _From, #shard{} = State) -> 110 | {reply, ok, idle(State)}; 111 | handle_call({set_status, standby}, _From, #shard{} = State) -> 112 | {reply, ok, standby(State)}; 113 | handle_call({set_status, Status}, _From, #shard{} = State) -> 114 | {reply, {error, {bad_status, Status}}, State}; 115 | 116 | handle_call(_, _From, #shard{} = State) -> 117 | {reply, {error, not_implemented}, State}. 118 | 119 | 120 | handle_cast({allocation, Action, Challenger}, #shard{} = State) -> 121 | handle_allocation(Action, Challenger, State); 122 | 123 | handle_cast(Unexpected, #shard{cluster_name = ClusterName} = State) -> 124 | error_logger:warning_msg("Minishard shard ~w got unexpected cast: ~9999p", [ClusterName, Unexpected]), 125 | {noreply, State}. 126 | 127 | 128 | code_change(_, #shard{} = State, _) -> 129 | {ok, State}. 130 | 131 | terminate(_, #shard{}) -> 132 | ok. 133 | 134 | 135 | 136 | %% Allocation notification on conflict 137 | handle_allocation(prolong, Loser, #shard{} = State) -> 138 | {noreply, callback_prolong(Loser, State)}; 139 | handle_allocation(cancel, Winner, #shard{} = State) -> 140 | NewState = callback_deallocate(Winner, State), 141 | % Gracefully shutdown for cleanup 142 | {stop, {shutdown, cluster_degraded}, idle(NewState)}. 
143 | 144 | 145 | %% Shard ownership recheck 146 | handle_ownership_recheck(#shard{status = active, cluster_name = ClusterName, my_number = MyNum} = State0) -> 147 | Owner = minishard:get_manager(ClusterName, MyNum), 148 | State = schedule_recheck(State0), 149 | case (Owner == self()) of 150 | true -> % OK, we still own the shard 151 | {noreply, State}; 152 | false -> %% Oops... 153 | error_logger:error_msg("Minishard: cluster ~w shard #~w ownership lost!", [ClusterName, MyNum]), 154 | handle_allocation(cancel, undefined, State) 155 | end; 156 | handle_ownership_recheck(#shard{} = State) -> 157 | {noreply, schedule_recheck(State)}. 158 | 159 | 160 | %%% 161 | %%% Internals 162 | %%% 163 | 164 | export_status(#shard{status = active = Status, my_number = MyNum} = State) -> 165 | put(shard, MyNum), 166 | put(active_since, os:timestamp()), 167 | put(status, Status), 168 | State; 169 | export_status(#shard{status = Status} = State) -> 170 | put(status, Status), 171 | erase(active_since), 172 | erase(shard), 173 | State. 174 | 175 | 176 | %% Try to join a cluster and take a free shard if possible 177 | join_cluster(#shard{status = standby} = State) -> 178 | % Already waiting for free shard number. This may happen after transition 179 | State; 180 | join_cluster(#shard{status = active} = State) -> 181 | % Already active, do nothing. This may happen after transition 182 | State; 183 | join_cluster(#shard{status = idle, cluster_name = ClusterName} = State) -> 184 | case minishard_allocator:bind(ClusterName) of 185 | {active, MyNumber} -> 186 | activate(MyNumber, State); 187 | standby -> 188 | export_status(State#shard{status = standby, my_number = undefined}) 189 | end. 190 | 191 | %% Perform all activation stuff when we capture a shard number 192 | activate(MyNumber, #shard{} = State) -> 193 | Allocated = callback_allocate(State#shard{status = active, my_number = MyNumber}), 194 | export_status(Allocated). 195 | 196 | 197 | %% Leave degraded cluster 198 | idle(#shard{status = idle} = State) -> 199 | % Nothing to do 200 | State; 201 | idle(#shard{status = standby} = State) -> 202 | export_status(State#shard{status = idle}); 203 | idle(#shard{status = active} = State) -> 204 | export_status(State#shard{status = idle, my_number = undefined}). 205 | 206 | standby(#shard{status = standby} = State) -> 207 | % Nothing to do 208 | State; 209 | standby(#shard{status = idle} = State) -> 210 | export_status(State#shard{status = standby}). 211 | 212 | 213 | %% Due to some troubles allocator may have after netsplit, we need to periodically ensure we still own the shard number 214 | schedule_recheck(#shard{cluster_name = minishard_demo} = State) -> 215 | State; 216 | schedule_recheck(#shard{} = State) -> 217 | Timer = erlang:start_timer(100, self(), recheck_ownership), 218 | State#shard{recheck_timer = Timer}. 219 | 220 | 221 | %% Callback management 222 | callback_allocate(#shard{cluster_name = ClusterName, callback_mod = CallbackMod, my_number = MyNumber} = State) -> 223 | {ok, CallbackState} = CallbackMod:allocated(ClusterName, MyNumber), 224 | State#shard{callback_state = CallbackState}. 225 | 226 | callback_prolong(Loser, #shard{callback_mod = CallbackMod, callback_state = CallbackState} = State) -> 227 | {ok, NewCallbackState} = CallbackMod:prolonged(Loser, CallbackState), 228 | State#shard{callback_state = NewCallbackState}. 
229 | 230 | callback_deallocate(Winner, #shard{callback_mod = CallbackMod, callback_state = CallbackState} = State) -> 231 | _ = CallbackMod:deallocated(Winner, CallbackState), 232 | State#shard{callback_state = undefined}. 233 | 234 | 235 | %% We perform score getting in separate process to ensure allocator does not get garbage messages 236 | get_score_or_kill(ShardPid) -> 237 | ScoreGetResult = rpc:call(node(), gen_server, call, [ShardPid, score, 1000]), 238 | handle_score_result(ShardPid, ScoreGetResult). 239 | 240 | handle_score_result(_Pid, Score) when is_number(Score) -> 241 | Score; 242 | handle_score_result(ShardPid, _) -> 243 | % We don't care what exactly goes wrong, we just kill it 244 | exit(ShardPid, kill), 245 | undefined. 246 | -------------------------------------------------------------------------------- /src/minishard_allocator.erl: -------------------------------------------------------------------------------- 1 | %%% Minishard allocator 2 | %%% 3 | %%% This module is a callback module for gen_leader (well, local version of it) 4 | %%% which tracks other members status and decides who runs which shard. 5 | %%% 6 | %%% Leader tracks the cluster status in a map. Each cluster node has a key in the map, 7 | %%% corresponding value is its status. Possible statuses are: 8 | %%% * down -- allocator on the node is down 9 | %%% * #transition{} -- allocator has recently went down, waiting for it to come back 10 | %%% * #request{} -- allocator is up, waiting for it to send its status 11 | %%% * #conflict{} -- allocator is up and hosts a conflicting shard. Waiting for it to send its score 12 | %%% * idle -- allocator is up, but shard manager has not been bound 13 | %%% * standby -- allocator is up with a bound shard manager without a shard 14 | %%% * #active{} -- allocator is up with a bound shard manager hosting shard N 15 | %%% 16 | -module(minishard_allocator). 17 | -define(GEN_LEADER, minishard_gen_leader). 18 | 19 | -behavior(?GEN_LEADER). 20 | 21 | 22 | %% API 23 | -export([name/1]). 24 | -export([start_link/2, cluster_status/1, shard_map/1]). 25 | -export([bind/1]). 26 | -export([get_manager/2, get_node/2]). 27 | 28 | %% Testing/debugging 29 | -export([seed_state/3, set_hacks/2, leader/1]). 30 | 31 | %% gen_leader callbacks 32 | -export([ 33 | init/1, 34 | handle_cast/3, 35 | handle_call/4, 36 | handle_info/3, 37 | handle_leader_call/4, 38 | handle_leader_cast/3, 39 | handle_DOWN/3, 40 | elected/3, 41 | surrendered/3, 42 | from_leader/3, 43 | code_change/4, 44 | terminate/2]). 45 | 46 | 47 | %% Candidate status request 48 | -record(request, { 49 | ref :: reference() % Request reference 50 | }). 51 | -record(status_update, { 52 | ref :: reference(), % Request reference 53 | node :: node(), % Reporting node 54 | status :: node_status(), % Reported status 55 | manager :: undefined | pid() % Current node's shard manager 56 | }). 57 | 58 | %% Conflict resolution status 59 | -record(conflict, { 60 | shard :: integer(), % Conflicting shard number 61 | ref :: reference(), % Resolution reference 62 | score :: undefined | minishard:score() % Score reported by a member 63 | }). 64 | %% Conflict score report 65 | -record(score_report, { 66 | ref :: reference(), % Resolution reference 67 | node :: node(), % Reporting node 68 | score :: minishard:score() % Reported score 69 | }). 70 | 71 | %% Temporary state for nodes going down. 
Without this shard is reallocated even on interconnect socket reset 72 | -record(transition, { 73 | shard :: integer(), % Shard number just before disconnect 74 | ref :: reference() % transition reference 75 | }). 76 | 77 | %% Active status 78 | -record(active, { 79 | shard :: integer() % Active shard number 80 | }). 81 | 82 | -type request() :: #request{}. 83 | -type conflict() :: #conflict{}. 84 | -type transition() :: #transition{}. 85 | -type active() :: #active{}. 86 | -type node_status() :: down | request() | conflict() | transition() | idle | standby | active(). 87 | -type allocation_map() :: #{node() => node_status()}. 88 | -type manager_map() :: #{node() => undefined | pid()}. 89 | 90 | %% gen_leader callback state 91 | -record(allocator, { 92 | name :: atom(), 93 | callback_mod :: module(), 94 | my_status :: node_status(), 95 | last_response :: reference(), 96 | shard_manager :: undefined | pid(), 97 | shard_count :: integer(), 98 | map :: allocation_map(), 99 | managers :: manager_map(), 100 | hacks :: #{atom() => any()} 101 | }). 102 | 103 | -type state() :: #allocator{}. 104 | 105 | 106 | %% ETS data model for shard information 107 | -define(ETS_SHARD_KEY(Shard), {shard, Shard}). 108 | -define(ETS_SHARD_NODE_POS, 2). 109 | -define(ETS_SHARD_MANAGER_POS, 3). 110 | -define(ETS_SHARD_RECORD(Shard, Node, Manager), {?ETS_SHARD_KEY(Shard), Node, Manager}). 111 | 112 | %% Generate a process/ets name for a cluster name 113 | name(ClusterName) -> 114 | list_to_atom("minishard_" ++ atom_to_list(ClusterName) ++ "_allocator"). 115 | 116 | %% API: Resolve a shard number to the shard manager pid 117 | get_manager(ClusterName, Shard) -> 118 | ets:lookup_element(name(ClusterName), ?ETS_SHARD_KEY(Shard), ?ETS_SHARD_MANAGER_POS). 119 | 120 | %% API: Resolve a shard number to the node currently hosting it 121 | get_node(ClusterName, Shard) -> 122 | ets:lookup_element(name(ClusterName), ?ETS_SHARD_KEY(Shard), ?ETS_SHARD_NODE_POS). 123 | 124 | %% API: start the allocator for given cluster 125 | start_link(ClusterName, CallbackMod) -> 126 | start_link(ClusterName, CallbackMod, #{}). 127 | 128 | start_link(ClusterName, CallbackMod, #{} = Hacks) when is_atom(ClusterName), is_atom(CallbackMod) -> 129 | Name = name(ClusterName), 130 | State0 = #allocator{map = Map} = seed_state(ClusterName, CallbackMod, Hacks), 131 | Nodes = maps:keys(Map), 132 | Options = leader_worker_options(Nodes) ++ [{heartbeat, 5}, {bcast_type, all}, {seed_node, none}], 133 | ?GEN_LEADER:start_link(Name, Nodes, Options, ?MODULE, State0, []). 134 | 135 | leader_worker_options(Nodes) -> 136 | case lists:member(node(), Nodes) of 137 | true -> []; 138 | false -> [{workers, [node()]}] 139 | end. 140 | 141 | %% Test/debug API: set hacks for a running allocator 142 | set_hacks(ClusterName, #{} = Hacks) when is_atom(ClusterName) -> 143 | ?GEN_LEADER:call(name(ClusterName), {set_hacks, Hacks}). 
144 | 145 | %% Seed state for a starting allocator 146 | seed_state(ClusterName, CallbackMod, Hacks) -> 147 | Nodes = CallbackMod:cluster_nodes(ClusterName), 148 | MyStatus = case lists:member(node(), Nodes) of 149 | true -> idle; 150 | false -> worker 151 | end, 152 | SeedMap = maps:from_list([{N, down} || N <- Nodes]), 153 | SeedManagers = maps:from_list([{N, undefined} || N <- Nodes]), 154 | #allocator{ 155 | name = name(ClusterName), 156 | callback_mod = CallbackMod, 157 | shard_manager = undefined, 158 | my_status = MyStatus, 159 | shard_count = CallbackMod:shard_count(ClusterName), 160 | map = SeedMap, 161 | managers = SeedManagers, 162 | hacks = Hacks }. 163 | 164 | 165 | %% Register a shard manager ready to host a shard 166 | bind(ClusterName) -> 167 | ?GEN_LEADER:call(name(ClusterName), {bind, self()}, 120000). 168 | 169 | %% Helper for possible asynchronous manager reply 170 | manager_reply(undefined, _) -> 171 | ok; 172 | manager_reply(From, Reply) -> 173 | ?GEN_LEADER:reply(From, Reply). 174 | 175 | 176 | %% Return cluster status in form {OverallStatusAtom, NodeStatusMap} 177 | cluster_status(ClusterName) when is_atom(ClusterName) -> 178 | ?GEN_LEADER:call(name(ClusterName), cluster_status). 179 | 180 | %% Return current leader node 181 | leader(ClusterName) when is_atom(ClusterName) -> 182 | ?GEN_LEADER:call(name(ClusterName), get_leader). 183 | 184 | %% Return shard allocation map 185 | -spec shard_map(ClusterName :: atom()) -> #{Shard :: integer() => node()}. 186 | shard_map(ClusterName) -> 187 | %ets:foldl(fun collect_shard_map/2, #{}, name(ClusterName)). 188 | maps:from_list([{Shard, Node} || ?ETS_SHARD_RECORD(Shard, Node, _) <- ets:tab2list(name(ClusterName))]). 189 | 190 | %% Init: nothing special, we start with an empty map 191 | init(#allocator{name = Name} = State) -> 192 | Name = ets:new(Name, [protected, named_table, set, {read_concurrency, true}]), 193 | ok = export_shard_map(State), 194 | {ok, State}. 195 | 196 | 197 | handle_cast(Msg, #allocator{name = Name} = State, _Election) -> 198 | error_logger:warning_msg("Minishard allocator ~w got unexpected cast ~9999p", [Name, Msg]), 199 | {noreply, State}. 200 | 201 | handle_info(Msg, #allocator{name = Name} = State, _Election) -> 202 | error_logger:warning_msg("Minishard allocator ~w got unexpected info ~9999p", [Name, Msg]), 203 | {noreply, State}. 204 | 205 | handle_call({bind, ShardManager}, _From, #allocator{name = Name} = State, _Election) -> 206 | NewState = State#allocator{shard_manager = ShardManager}, 207 | ok = ?GEN_LEADER:leader_cast(Name, {bind_manager, node(), ShardManager, undefined}), 208 | {reply, standby, NewState}; 209 | handle_call(cluster_status, _From, #allocator{} = State, Election) -> 210 | {reply, make_cluster_status(State, Election), State}; 211 | handle_call(get_leader, _From, #allocator{} = State, Election) -> 212 | {reply, ?GEN_LEADER:leader_node(Election), State}; 213 | handle_call({set_hacks, Hacks}, _From, #allocator{} = State, _Election) -> 214 | {reply, ok, State#allocator{hacks = Hacks}}; 215 | handle_call(_Request, _From, #allocator{} = State, _Election) -> 216 | {reply, {error, not_implemented}, State}. 217 | 218 | 219 | 220 | %% We are elected. 
Propagate our allocation map 221 | elected(#allocator{name = Name} = State, Election, Loser) -> 222 | error_logger:info_msg("Minishard allocator ~w elected, ~w surrendered", [Name, Loser]), 223 | StateRequestsRestarted = restart_requests(State), 224 | NewState = handle_new_election(Election, StateRequestsRestarted), 225 | {ok, NewState, NewState}. 226 | 227 | 228 | %% Node goes down. Deallocate its shard and remove from pool 229 | handle_DOWN(Node, #allocator{name = Name} = State, Election) -> 230 | error_logger:info_msg("Minishard allocator ~w has seen ~w's death", [Name, Node]), 231 | NewState = handle_new_election(Election, State), 232 | {ok, NewState, NewState}. 233 | 234 | 235 | 236 | get_allocation(Node, #{} = Map) -> 237 | case maps:find(Node, Map) of 238 | {ok, Status} -> Status; 239 | error -> undefined 240 | end. 241 | 242 | %% We have surrendered. Inherit a new allocation map 243 | surrendered(#allocator{name = Name} = State, #allocator{} = Synch, Election) -> 244 | error_logger:info_msg("Minishard allocator ~w surrendered, forwarding Synch to from_leader/3", [Name]), 245 | from_leader(Synch, State, Election); 246 | surrendered(#allocator{name = Name} = State, _Synch, _Election) -> 247 | error_logger:info_msg("Minishard allocator ~w surrendered", [Name]), 248 | {ok, State}. 249 | 250 | 251 | handle_leader_call(_Request, _From, State, _Election) -> 252 | {reply, {error, not_implemented}, State}. 253 | 254 | handle_leader_cast({bind_manager, Node, ShardManager, From}, #allocator{map = Map, name = Name} = State, _Election) -> 255 | case maps:is_key(Node, Map) of 256 | true -> 257 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** adds ~w as good node", [Name, Node]), 258 | StateWithManager = set_manager(Node, ShardManager, State), 259 | NewState = #allocator{map = NewMap} = set_realloc_install([Node], standby, StateWithManager), 260 | _ = manager_reply(From, get_allocation(Node, NewMap)), 261 | {ok, NewState, NewState}; 262 | false -> 263 | _ = manager_reply(From, not_my_cluster), 264 | {noreply, State} 265 | end; 266 | 267 | handle_leader_cast({request_timeout, RequestRef}, #allocator{name = Name} = State, _Election) -> 268 | case handle_request_timeout(RequestRef, State) of 269 | {updated, NewState} -> 270 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** status update ~w timeout", [Name, RequestRef]), 271 | {ok, NewState, NewState}; 272 | unchanged -> 273 | {noreply, State} 274 | end; 275 | 276 | handle_leader_cast(#status_update{ref = RequestRef, node = Node, status = Status, manager = Manager}, 277 | #allocator{name = Name, map = Map} = State, _Election) -> 278 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** got a status update from ~w (~w)", 279 | [Name, Node, Status]), 280 | case get_allocation(Node, Map) of 281 | #request{ref = RequestRef} -> 282 | StateWithManager = set_manager(Node, Manager, State), 283 | NewState = handle_possible_conflicts(Node, Status, StateWithManager), 284 | {ok, NewState, NewState}; 285 | _ -> 286 | {noreply, State} 287 | end; 288 | 289 | handle_leader_cast({conflict_timeout, ConflictRef}, #allocator{name = Name, map = Map} = State, _Election) -> 290 | {Shard, NodeScores} = conflict_shard_and_scores(ConflictRef, Map), 291 | Shard /= undefined andalso error_logger:info_msg( 292 | "Minishard allocator ~w *** LEADER *** conflict ~w (shard ~w) timeout", [Name, ConflictRef, Shard]), 293 | NewState = resolve_conflict(Shard, NodeScores, State), 294 | {ok, NewState, NewState}; 295 | 296 | 
handle_leader_cast(#score_report{ref = ReportRef, node = Node, score = Score}, 297 | #allocator{name = Name, map = Map} = State, _Election) -> 298 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** got a score report from ~w (~w)", [Name, Node, Score]), 299 | NewMap = case get_allocation(Node, Map) of 300 | #conflict{ref = ReportRef} = Conflict -> 301 | set_statuses([Node], Conflict#conflict{score = Score}, Map); 302 | _ -> 303 | Map 304 | end, 305 | NewState = install_new_map(NewMap, State), 306 | {Shard, NodeScores} = conflict_shard_and_scores(ReportRef, NewMap), 307 | case lists:keymember(undefined, 2, NodeScores) of 308 | true -> % Still have pending score requests, no action needed 309 | {noreply, NewState}; 310 | false -> % All nodes have reported their scores, ok to resolve conflict now 311 | ResolvedState = resolve_conflict(Shard, NodeScores, State), 312 | {ok, ResolvedState, ResolvedState} 313 | end; 314 | 315 | handle_leader_cast({transition_timeout, TransRef}, #allocator{name = Name} = State, _Election) -> 316 | case handle_transition_timeout(TransRef, State) of 317 | {updated, NewState} -> 318 | error_logger:info_msg("Minishard allocator ~w *** LEADER *** transition ~w finished", [Name, TransRef]), 319 | {ok, NewState, NewState}; 320 | unchanged -> 321 | {noreply, State} 322 | end; 323 | 324 | handle_leader_cast(Msg, #allocator{name = Name} = State, _Election) -> 325 | error_logger:warning_msg("Minishard allocator ~w got unexpected leader cast ~9999p", [Name, Msg]), 326 | {noreply, State}. 327 | 328 | 329 | from_leader(#allocator{map = NewMap, managers = ManagerMap}, #allocator{name = Name} = State, _Election) -> 330 | error_logger:info_msg("Minishard allocator ~w got update from the leader.", [Name]), 331 | {ok, install_new_map(NewMap, State#allocator{managers = ManagerMap})}; 332 | 333 | from_leader(Msg, #allocator{name = Name} = State, _Election) -> 334 | error_logger:info_msg("Minishard allocator ~w got a message from the leader: ~9999p", [Name, Msg]), 335 | {ok, State}. 336 | 337 | 338 | 339 | terminate(_Reason, _State) -> 340 | ok. 341 | 342 | code_change(_OldVsn, #allocator{} = State, _Election, _Extra) -> 343 | {ok, State}. 344 | 345 | 346 | 347 | 348 | %% When status request times out, we mark nodes which did not send their status update as idle 349 | handle_request_timeout(RequestRef, #allocator{map = Map} = State) -> 350 | StalledNodes = maps:fold(fun 351 | (Node, #request{ref = NodeRef}, Acc) when NodeRef == RequestRef -> 352 | [Node|Acc]; 353 | (_Node, _Status, Acc) -> 354 | Acc 355 | end, [], Map), 356 | case StalledNodes of 357 | [] -> 358 | unchanged; 359 | [_|_] -> 360 | NewState = set_realloc_install(StalledNodes, idle, State), 361 | {updated, NewState} 362 | end. 363 | 364 | %% Transition timeout: here we mark nodes as really down 365 | handle_transition_timeout(TransRef, #allocator{map = Map} = State) -> 366 | ReallyDownNodes = maps:fold(fun 367 | (Node, #transition{ref = NodeRef}, Acc) when NodeRef == TransRef -> 368 | [Node|Acc]; 369 | (_Node, _Status, Acc) -> 370 | Acc 371 | end, [], Map), 372 | case ReallyDownNodes of 373 | [] -> 374 | unchanged; 375 | [_|_] -> 376 | NewState = set_realloc_install(ReallyDownNodes, down, State), 377 | {updated, NewState} 378 | end. 379 | 380 | %% The leader has been elected. He has Election from a gen_leader and an outdated map. 
381 | %% Here we mark nodes going down as down and request a status from nodes going up 382 | handle_new_election(Election, #allocator{name = Name} = State) -> 383 | % Determine which nodes need a status request 384 | OldAlive = alive_nodes(State), 385 | Alive = ?GEN_LEADER:alive(Election), 386 | BecameAlive = Alive -- OldAlive, 387 | 388 | % Determine which nodes should be marked as down 389 | OldDown = down_nodes(State), 390 | Down = ?GEN_LEADER:down(Election), 391 | BecameDown = Down -- OldDown, 392 | 393 | % Apply status changes 394 | StateDownMarked = lists:foldl(fun handle_node_down/2, State, BecameDown), 395 | 396 | AliveStatus = case BecameAlive of 397 | [] -> % No status will be set, so here we may return any one 398 | idle; 399 | [_|_] -> 400 | % Request statuses 401 | {Request, _Timer} = make_status_request(Name), 402 | Request 403 | end, 404 | 405 | set_realloc_install(BecameAlive, AliveStatus, StateDownMarked). 406 | 407 | 408 | -spec make_status_request(Name :: atom()) -> {request(), timer:tref()}. 409 | make_status_request(Name) -> 410 | % Set timer to handle possible troubles during status request 411 | RequestRef = make_ref(), 412 | Request = #request{ref = RequestRef}, 413 | {ok, Timer} = timer:apply_after(2000, ?GEN_LEADER, leader_cast, [Name, {request_timeout, RequestRef}]), 414 | {Request, Timer}. 415 | 416 | -spec make_conflict_request(Name :: atom(), Shard :: integer()) -> {conflict(), timer:tref()}. 417 | make_conflict_request(Name, Shard) -> 418 | CRef = make_ref(), 419 | Conflict = #conflict{shard = Shard, ref = CRef, score = undefined}, 420 | {ok, Timer} = timer:apply_after(2000, ?GEN_LEADER, leader_cast, [Name, {conflict_timeout, CRef}]), 421 | {Conflict, Timer}. 422 | 423 | -spec make_transition(Name :: atom(), Shard :: integer()) -> {transition(), timer:tref()}. 424 | make_transition(Name, Shard) -> 425 | TransRef = make_ref(), 426 | Request = #transition{ref = TransRef, shard = Shard}, 427 | {ok, Timer} = timer:apply_after(5000, ?GEN_LEADER, leader_cast, [Name, {transition_timeout, TransRef}]), 428 | {Request, Timer}. 429 | 430 | 431 | %% Restart all running requests. When leader changes during request, response may be lost. 432 | %% So we search for all status and score requests, then generate new references for them, starting corresponding timers 433 | restart_requests(#allocator{name = Name, map = Map} = State) -> 434 | {Name, NewMap, _RefMigration} = maps:fold(fun restart_request/3, {Name, #{}, #{}}, Map), 435 | State#allocator{map = NewMap}. 
436 | 437 | restart_request(Node, #request{ref = OldRef} = OldRequest, {Name, Map, RefMigration}) -> 438 | NewRequest = #request{ref = NewRef} = case maps:get(OldRef, RefMigration, undefined) of 439 | undefined -> 440 | {NewRequest_, _} = make_status_request(Name), 441 | NewRequest_; 442 | ExistingRef -> 443 | OldRequest#request{ref = ExistingRef} 444 | end, 445 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration); 446 | restart_request(Node, #conflict{ref = OldRef, shard = Shard} = OldRequest, {Name, Map, RefMigration}) -> 447 | NewRequest = #conflict{ref = NewRef} = case maps:get(OldRef, RefMigration, undefined) of 448 | undefined -> 449 | {NewRequest_, _} = make_conflict_request(Name, Shard), 450 | NewRequest_; 451 | ExistingRef -> 452 | OldRequest#conflict{ref = ExistingRef, score = undefined} 453 | end, 454 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration); 455 | restart_request(Node, #transition{ref = OldRef, shard = Shard} = OldRequest, {Name, Map, RefMigration}) -> 456 | NewRequest = #transition{ref = NewRef} = case maps:get(OldRef, RefMigration, undefined) of 457 | undefined -> 458 | {NewRequest_, _} = make_transition(Name, Shard), 459 | NewRequest_; 460 | ExistingRef -> 461 | OldRequest#transition{ref = ExistingRef} 462 | end, 463 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration); 464 | restart_request(Node, NotRequest, {Name, Map, RefMigration}) -> 465 | NewMap = maps:put(Node, NotRequest, Map), 466 | {Name, NewMap, RefMigration}. 467 | 468 | restart_request_store(Name, Node, NewRequest, OldRef, NewRef, Map, RefMigration) -> 469 | NewMap = maps:put(Node, NewRequest, Map), 470 | NewRefMigration = maps:put(OldRef, NewRef, RefMigration), 471 | {Name, NewMap, NewRefMigration}. 472 | 473 | 474 | handle_node_down(Node, #allocator{name = Name, map = Map} = State) -> 475 | NewStatus = case get_allocation(Node, Map) of 476 | #active{shard = Shard} -> 477 | {Transition, _} = make_transition(Name, Shard), 478 | Transition; 479 | _ -> 480 | down 481 | end, 482 | 483 | NewMap = set_statuses([Node], NewStatus, Map), 484 | set_manager(Node, undefined, State#allocator{map = NewMap}). 485 | 486 | %% Remember node's bound manager 487 | -spec set_manager(Node :: node(), Manager :: undefined | pid(), state()) -> state(). 488 | set_manager(Node, ShardManager, #allocator{managers = ManMap} = State) -> 489 | true = maps:is_key(Node, ManMap), 490 | NewManMap = maps:update(Node, ShardManager, ManMap), 491 | State#allocator{managers = NewManMap}. 492 | 493 | 494 | %% Set given status for a given list of nodes, reallocate shards, install the updated map 495 | -spec set_realloc_install(Nodes :: [node()], Status :: node_status(), state()) -> state(). 496 | set_realloc_install(Nodes, Status, #allocator{map = Map, shard_count = ShardCount} = State) -> 497 | MapUpdated = set_statuses(Nodes, Status, Map), 498 | NewMap = reallocate(ShardCount, MapUpdated), 499 | install_new_map(NewMap, State). 500 | 501 | %% batch set status for given list of nodes 502 | -spec set_statuses(Nodes :: [node()], Status :: node_status(), Map :: allocation_map()) -> allocation_map(). 503 | set_statuses(Nodes, Status, Map) -> 504 | [] = Nodes -- maps:keys(Map), 505 | OverrideMap = maps:from_list([{N, Status} || N <- Nodes]), 506 | maps:merge(Map, OverrideMap). 507 | 508 | 509 | %% Perform shard allocation when possible 510 | -spec reallocate(ShardCount :: integer(), Map :: allocation_map()) -> allocation_map(). 
511 | reallocate(ShardCount, #{} = Map) when is_integer(ShardCount) -> 512 | HaveQuorum = (length(alive_nodes(Map)) >= ShardCount), 513 | HavePendingReq = lists:any(fun is_request/1, maps:values(Map)), 514 | case {HaveQuorum, HavePendingReq} of 515 | {true, false} -> % Require quorum and no status requests for reallocation 516 | do_reallocate(ShardCount, Map); 517 | {_, _} -> 518 | Map 519 | end. 520 | 521 | do_reallocate(ShardCount, #{} = Map) -> 522 | Shards = lists:seq(1, ShardCount), 523 | AllocatedShards = shards_in_use(Map), 524 | 525 | ShardsToAllocate = Shards -- AllocatedShards, 526 | StandbyNodes = maps:fold(fun collect_standby_nodes/3, [], Map), 527 | 528 | Allocations = safe_zip(StandbyNodes, ShardsToAllocate), 529 | 530 | MapOverride = maps:from_list([{Node, #active{shard = Shard}} || {Node, Shard} <- Allocations]), 531 | maps:merge(Map, MapOverride). 532 | 533 | 534 | %% Install a new allocation map and perform all needed actions 535 | -spec install_new_map(Map :: allocation_map(), state()) -> state(). 536 | install_new_map(NewMap, #allocator{name = Name} = State) -> 537 | error_logger:info_msg("Minishard allocator ~w: installing new map ~9999p", [Name, NewMap]), 538 | MyNewStatus = get_allocation(node(), NewMap), 539 | NewState = handle_my_new_status(MyNewStatus, State#allocator{map = NewMap}), 540 | ok = export_shard_map(NewState), 541 | NewState. 542 | 543 | 544 | %% Leader has possibly changed our status. Let's see what we should do 545 | -spec handle_my_new_status(node_status(), state()) -> state(). 546 | handle_my_new_status(undefined, #allocator{my_status = worker} = State) -> 547 | % I am worker, so my status is always worker, and I don't appear in a map 548 | State; 549 | handle_my_new_status(OldStatus, #allocator{my_status = OldStatus} = State) -> 550 | % Unchanged, pass 551 | State; 552 | handle_my_new_status(#request{ref = Ref}, #allocator{last_response = Ref} = State) -> 553 | % We have already sent a status update for this request 554 | State; 555 | handle_my_new_status(#request{ref = Ref}, #allocator{ 556 | name = Name, my_status = MyStatus, shard_manager = Manager, hacks = Hacks} = State) -> 557 | apply_hack(on_status_request, Hacks), 558 | % New status request. Send an update and wait 559 | Report = #status_update{ref = Ref, node = node(), status = MyStatus, manager = Manager}, 560 | ?GEN_LEADER:leader_cast(Name, Report), 561 | State#allocator{last_response = Ref}; 562 | handle_my_new_status(#conflict{ref = Ref}, #allocator{last_response = Ref} = State) -> 563 | % We have already sent a score for this conflict 564 | State; 565 | handle_my_new_status(#conflict{ref = Ref}, #allocator{name = Name, shard_manager = ShManager, hacks = Hacks} = State) -> 566 | apply_hack(on_conflict_request, Hacks), 567 | % New score request. 
Send an update and wait 568 | case minishard_shard:get_score_or_kill(ShManager) of 569 | undefined -> 570 | throw({stop, shard_score_timeout, State}); 571 | Score when is_integer(Score) -> 572 | Report = #score_report{ref = Ref, node = node(), score = Score}, 573 | ?GEN_LEADER:leader_cast(Name, Report), 574 | State#allocator{last_response = Ref} 575 | end; 576 | handle_my_new_status(down, #allocator{shard_manager = Manager} = State) -> 577 | ok = set_manager_status(Manager, idle), 578 | throw({stop, {shutdown, shut_down_by_leader}, State}); 579 | handle_my_new_status(#active{shard = NewShard}, #allocator{my_status = #active{shard = OldShard}} = State) 580 | when NewShard /= OldShard -> 581 | throw({stop, {shard_suddenly_changed, OldShard, NewShard}, State}); 582 | handle_my_new_status(#active{shard = OldShard} = Status, #allocator{my_status = #active{shard = OldShard}} = State) -> 583 | State#allocator{my_status = Status}; 584 | handle_my_new_status(Status, #allocator{shard_manager = Manager} = State) 585 | when Status == idle; Status == standby; is_record(Status, active) -> 586 | ok = set_manager_status(Manager, Status), 587 | State#allocator{my_status = Status}. 588 | 589 | 590 | 591 | 592 | %% Export a shard map to the ETS 593 | export_shard_map(#allocator{name = Name, managers = ManagerMap} = State) -> 594 | ShardNodeMap = shard_node_map(State), 595 | _ = maps:fold(fun export_shard_info/3, {Name, ManagerMap}, ShardNodeMap), 596 | ok. 597 | 598 | shard_node_map(#allocator{shard_count = ShardCount, map = NodeMap}) -> 599 | SeedShardMap = maps:from_list([{Shard, undefined} || Shard <- lists:seq(1, ShardCount)]), 600 | maps:fold(fun collect_active_shards/3, SeedShardMap, NodeMap). 601 | 602 | collect_active_shards(Node, #active{shard = Shard}, ShardNodeMap) -> 603 | maps:update(Shard, Node, ShardNodeMap); 604 | collect_active_shards(_Node, _Status, ShardNodeMap) -> 605 | ShardNodeMap. 606 | 607 | export_shard_info(Shard, Node, {Name, ManagerMap}) -> 608 | Manager = maps:get(Node, ManagerMap, undefined), 609 | true = ets:insert(Name, ?ETS_SHARD_RECORD(Shard, Node, Manager)), 610 | {Name, ManagerMap}. 611 | 612 | 613 | %% Set shard manager status when leader updates it 614 | -spec set_manager_status(Manager :: undefined | pid(), Status :: idle | standby | active()) -> ok. 615 | set_manager_status(undefined, _Status) -> % No manager, pass 616 | ok; 617 | set_manager_status(Manager, Status) -> 618 | minishard_shard:set_status(Manager, node_status_for_manager(Status)). 619 | 620 | node_status_for_manager(idle) -> idle; 621 | node_status_for_manager(standby) -> standby; 622 | node_status_for_manager(#active{shard = Shard}) -> {active, Shard}. 623 | 624 | 625 | %% A node has sent a valid status update. We should check the updated map for conflicts and maybe start resolution 626 | -spec handle_possible_conflicts(Node :: node(), Status :: node_status(), state()) -> state(). 
627 | handle_possible_conflicts(Node, #active{shard = Shard} = Status, 628 | #allocator{name = Name, map = Map, hacks = Hacks} = State) -> 629 | case shard_nodes(Shard, Map) of 630 | [] -> % no other candidates for this shard 631 | set_realloc_install([Node], Status, State); 632 | [_|_] = ConflictingNodes -> % Oops, we have a conflict 633 | apply_hack(on_conflict_detected, Hacks), 634 | {Conflict, _} = make_conflict_request(Name, Shard), 635 | set_realloc_install([Node|ConflictingNodes], Conflict, State) 636 | end; 637 | 638 | handle_possible_conflicts(Node, Status, #allocator{} = State) -> 639 | set_realloc_install([Node], Status, State). 640 | 641 | 642 | resolve_conflict(undefined, [], #allocator{} = State) -> 643 | % No conflict, pass 644 | State; 645 | resolve_conflict(Shard, [_|_] = NodeScores, #allocator{map = Map} = State) -> 646 | {Winner, _} = select_winner(NodeScores), 647 | MapWithWinner = case Winner of 648 | undefined -> Map; 649 | _RealWinner -> set_statuses([Winner], #active{shard = Shard}, Map) 650 | end, 651 | Losers = [Node || {Node, _} <- NodeScores, Node /= Winner], 652 | NewMap = set_statuses(Losers, down, MapWithWinner), 653 | install_new_map(NewMap, State). 654 | 655 | 656 | %% Helper: list nodes marked as alive in a map 657 | alive_nodes(#allocator{map = Map}) -> 658 | alive_nodes(Map); 659 | alive_nodes(#{} = Map) -> 660 | maps:fold(fun collect_alive/3, [], Map). 661 | 662 | collect_alive(_Node, down, Acc) -> Acc; 663 | collect_alive(_Node, #transition{}, Acc) -> Acc; 664 | collect_alive(Node, _, Acc) -> [Node|Acc]. 665 | 666 | %% Helper: list nodes marked as down in a map 667 | down_nodes(#allocator{map = Map}) -> 668 | maps:fold(fun collect_down/3, [], Map). 669 | 670 | collect_down(Node, down, Acc) -> [Node|Acc]; 671 | collect_down(Node, #transition{}, Acc) -> [Node|Acc]; 672 | collect_down(_Node, _, Acc) -> Acc. 673 | 674 | 675 | %% Helper: check if node status in map is 'status requested' 676 | is_request(#request{}) -> true; 677 | is_request(_) -> false. 678 | 679 | %% Helper: return a list of shards in use 680 | shards_in_use(Map) -> 681 | maps:fold(fun collect_shards_in_use/3, [], Map). 682 | 683 | %% Helper: add active and conflicting shards to the accumulator 684 | collect_shards_in_use(_Node, #active{shard = Shard}, Acc) -> [Shard|Acc]; 685 | collect_shards_in_use(_Node, #conflict{shard = Shard}, Acc) -> [Shard|Acc]; 686 | collect_shards_in_use(_Node, #transition{shard = Shard}, Acc) -> [Shard|Acc]; 687 | collect_shards_in_use(_Node, _, Acc) -> Acc. 688 | 689 | %% Get a list of nodes pretending to host the shard 690 | shard_nodes(Shard, Map) -> 691 | {Shard, Nodes} = maps:fold(fun collect_nodes_by_shard/3, {Shard, []}, Map), 692 | Nodes. 693 | 694 | %% Helper: when node wants to host the shard, add it to the accumulator 695 | collect_nodes_by_shard(Node, #active{shard = Shard}, {Shard, Acc}) -> {Shard, [Node|Acc]}; 696 | collect_nodes_by_shard(Node, #conflict{shard = Shard}, {Shard, Acc}) -> {Shard, [Node|Acc]}; 697 | collect_nodes_by_shard(_Node, _, {Shard, Acc}) -> {Shard, Acc}. 698 | 699 | %% Helper: add standby nodes to the accumulator 700 | collect_standby_nodes(Node, standby, Acc) -> [Node|Acc]; 701 | collect_standby_nodes(_Node, _, Acc) -> Acc. 702 | 703 | %% Helper: do the same as lists:zip/2, but stop when any list ends 704 | safe_zip([H1|L1], [H2|L2]) -> 705 | [{H1, H2}|safe_zip(L1, L2)]; 706 | safe_zip(_L1, _L2) -> 707 | []. 
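%% Example (illustrative): unlike lists:zip/2, which exits on lists of unequal
%% length, safe_zip/2 simply stops at the end of the shorter list:
%%   1> safe_zip([n1, n2, n3], [1, 2]).
%%   [{n1,1},{n2,2}]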
708 | 709 | %% Helper: get conflict shard and node scores by a conflict ref 710 | conflict_shard_and_scores(Ref, Map) -> 711 | {Ref, Shard, NodeScores} = maps:fold(fun collect_conflict_shard_scores/3, {Ref, undefined, []}, Map), 712 | {Shard, NodeScores}. 713 | 714 | collect_conflict_shard_scores(Node, #conflict{ref = Ref, shard = Shard, score = Score}, {Ref, _, NodeScores}) -> 715 | {Ref, Shard, [{Node, Score}|NodeScores]}; 716 | collect_conflict_shard_scores(_Node, _, {Ref, Shard, NodeScores}) -> 717 | {Ref, Shard, NodeScores}. 718 | 719 | %% Helper: select a winner from given {Node, Score} list when possible 720 | -spec select_winner([{node(), number()}]) -> {Winner :: undefined | node(), BestScore :: undefined | number()}. 721 | select_winner(NodeScores) -> 722 | lists:foldl(fun find_best_score/2, {undefined, undefined}, NodeScores). 723 | 724 | find_best_score({BestNode, BestScore}, {_Node, undefined}) -> 725 | {BestNode, BestScore}; 726 | find_best_score({_BestNode, undefined}, {Node, Score}) -> 727 | {Node, Score}; 728 | find_best_score({BestNode, BestScore}, {_Node, Score}) when BestScore >= Score -> 729 | {BestNode, BestScore}; 730 | find_best_score({_Node, _PrevBestScore}, {BetterNode, BetterScore}) -> 731 | {BetterNode, BetterScore}. 732 | 733 | 734 | %% Build cluster status report 735 | make_cluster_status(#allocator{shard_count = ShardCount, map = NodeMap} = State, _Election) -> 736 | TotalNodeCnt = length(maps:keys(NodeMap)), 737 | AliveNodesCnt = length(alive_nodes(State)), 738 | ExportNodeMap = maps:map(fun export_node_status/2, NodeMap), 739 | ShardNodeMap = shard_node_map(State), 740 | NodeStatusMap = maps:fold(fun allocation_to_node_status/3, ExportNodeMap, ShardNodeMap), 741 | AllocatedShardCnt = length(lists:filter(fun(N) -> N /= undefined end, maps:values(ShardNodeMap))), 742 | OverallStatus = overall_status(ShardCount, AllocatedShardCnt, AliveNodesCnt), 743 | 744 | Counts = #{shards => {ShardCount, AllocatedShardCnt}, nodes => {TotalNodeCnt, AliveNodesCnt}}, 745 | {OverallStatus, Counts, NodeStatusMap}. 746 | 747 | -spec overall_status(ShardCount :: integer(), AllocatedShardCount :: integer(), AliveNodesCount :: integer()) -> atom(). 748 | overall_status(ShardCount, ShardCount, _AliveNodesCnt) -> 749 | available; 750 | overall_status(ShardCount, _AllocCnt, AliveNodesCnt) when AliveNodesCnt < ShardCount -> 751 | degraded; 752 | overall_status(_ShardCount, AllocatedShardCnt, AliveNodesCnt) when AllocatedShardCnt < AliveNodesCnt -> 753 | allocation_pending; 754 | overall_status(_ShardCount, _AllocatedShardCnt, _AliveNodesCount) -> 755 | transition. 756 | 757 | allocation_to_node_status(ShardNum, undefined, NodeStatuses) -> 758 | maps:put({not_allocated, ShardNum}, undefined, NodeStatuses); 759 | allocation_to_node_status(ShardNum, ShardNode, NodeStatuses) -> 760 | maps:put(ShardNode, {active, ShardNum}, NodeStatuses). 761 | 762 | %% Translate node statuses to minishard external status format 763 | export_node_status(_Node, down) -> unavailable; 764 | export_node_status(_Node, idle) -> not_my_cluster; 765 | export_node_status(_Node, standby) -> available; 766 | export_node_status(_Node, #active{shard = Shard}) -> {active, Shard}; 767 | export_node_status(_Node, _) -> transition. 768 | 769 | 770 | %% Hacks: apply a hack 771 | apply_hack(HackName, Hacks) -> 772 | case maps:get(HackName, Hacks, undefined) of 773 | undefined -> ok; 774 | Fun when is_function(Fun, 0) -> Fun(); 775 | {M, F, A} when is_atom(M), is_atom(F), is_list(A) -> apply(M, F, A) 776 | end. 
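%% Example (illustrative): a hacks map binds hook names to either a zero-arity
%% fun or an {M, F, A} triple; unknown hook names are silently skipped:
%%   Hacks = #{on_conflict_detected => fun() -> ok end},
%%   ok = apply_hack(on_conflict_detected, Hacks),  % runs the fun
%%   ok = apply_hack(no_such_hook, Hacks).          % not registered -> ok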
777 | 778 | 779 | %collect_shard_map(?ETS_SHARD_RECORD(Shard, Node, _), Map) -> 780 | % maps:put(Shard, Node, Map); 781 | %collect_shard_map(_, Map) -> 782 | % Map. 783 | -------------------------------------------------------------------------------- /src/minishard_gen_leader.erl: -------------------------------------------------------------------------------- 1 | %%% ``The contents of this file are subject to the Erlang Public License, 2 | %%% Version 1.1, (the "License"); you may not use this file except in 3 | %%% compliance with the License. You should have received a copy of the 4 | %%% Erlang Public License along with this software. If not, it can be 5 | %%% retrieved via the world wide web at http://www.erlang.org/. 6 | %%% 7 | %%% Software distributed under the License is distributed on an "AS IS" 8 | %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 9 | %%% the License for the specific language governing rights and limitations 10 | %%% under the License. 11 | %%% 12 | %%% The Initial Developer of the Original Code is Ericsson Utvecklings AB. 13 | %%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings 14 | %%% AB. All Rights Reserved.'' 15 | %%% 16 | %%% 17 | %%% $Id: gen_leader.erl, v 1.4 2008/09/19 07:40:15 hanssv Exp $ 18 | %%% 19 | %%% @author Hans Svensson 20 | %%% @author Thomas Arts 21 | %%% @author Ulf Wiger 22 | %%% @author (contributor: Serge Aleynikov ) 23 | %%% @author (contributor: Danil Zagoskin ) 24 | %%% 25 | %%% @doc Leader election behavior. 26 | %%%

%%% This application implements a leader election behavior modeled after
%%% gen_server. This behavior intends to make it reasonably
%%% straightforward to implement a fully distributed server with
%%% master-slave semantics.
%%%
%%% The gen_leader behavior supports nearly everything that gen_server
%%% does (some functions, such as multicall() and the internal timeout,
%%% have been removed), and adds a few callbacks and API functions to
%%% support leader election etc.
%%%
%%% Also included is an example program, a global dictionary, based
%%% on the modules gen_leader and dict. The callback implementing the
%%% global dictionary is called 'test_cb', for no particularly logical
%%% reason.
%%%
%%% New version: The internal leader election algorithm was faulty
%%% and has been replaced with a new version based on a different leader
%%% election algorithm. As a consequence of this, the query functions
%%% alive and down can no longer be provided.
%%% The new algorithm also makes use of an incarnation parameter, by
%%% default written to disk in the function incarnation. This
%%% implies that only one gen_leader per node is permitted; if
%%% used in a diskless environment, incarnation must be adapted.
%%%
%%% Modifications contributed by Serge Aleynikov:
%%%   1. Added configurable startup options (see leader_options() type)
%%%   2. Implemented handle_DOWN/3 callback with propagation of the
%%%      leader's state via broadcast to all connected candidates.
%%%   3. Fixed population of the #election.down member so that down/1 query
%%%      can be used in the behavior's implementation
%%%   4. Rewrote implementation of the tau timer to prevent the leader
%%%      looping on the timer timeout event when all candidates are connected.
%%%
%%% Modifications done by Danil Zagoskin:
%%%   1. Renamed gen_leader to minishard_gen_leader to avoid name clashes
%%%   2. Timestamp is used as incarnation value to avoid disk access
%%%   3. Made initial discovery messages sending fast and unreliable.
%%%      This prevents blocking on start.
%%%   4. Added node pinger so that cluster recovers after network problems
%%%   5. Handled some corner cases which may appear during network problems
%%%
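%%%
%%% A minimal callback-module sketch (illustrative only; the module name
%%% my_leader_cb is hypothetical and the clauses do the least possible work):
%%%   -module(my_leader_cb).
%%%   -export([init/1, elected/3, surrendered/3, handle_leader_call/4,
%%%            handle_leader_cast/3, from_leader/3, handle_call/4,
%%%            handle_cast/3, handle_DOWN/3, handle_info/3,
%%%            terminate/2, code_change/4]).
%%%   init(Arg) -> {ok, Arg}.
%%%   elected(State, _Election, _Node) -> {ok, State, State}.
%%%   surrendered(State, _Synch, _Election) -> {ok, State}.
%%%   handle_leader_call(_Request, _From, State, _Election) -> {reply, ok, State}.
%%%   handle_leader_cast(_Msg, State, _Election) -> {noreply, State}.
%%%   from_leader(_Synch, State, _Election) -> {ok, State}.
%%%   handle_call(_Request, _From, State, _Election) -> {reply, ok, State}.
%%%   handle_cast(_Msg, State, _Election) -> {noreply, State}.
%%%   handle_DOWN(_Node, State, _Election) -> {ok, State}.
%%%   handle_info(_Msg, State, _Election) -> {noreply, State}.
%%%   terminate(_Reason, _State) -> ok.
%%%   code_change(_OldVsn, State, _Election, _Extra) -> {ok, State}.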

70 | %%% @end 71 | %%% 72 | %%% 73 | -module(minishard_gen_leader). 74 | 75 | %% Time between rounds of query from the leader 76 | -define(TAU, 5000). 77 | 78 | -export([start/6, 79 | start_link/6, 80 | leader_call/2, leader_call/3, leader_cast/2, 81 | call/2, call/3, cast/2, 82 | reply/2]). 83 | 84 | %% Query functions 85 | -export([alive/1, 86 | down/1, 87 | candidates/1, 88 | workers/1, 89 | broadcast/3, 90 | leader_node/1]). 91 | 92 | -export([system_continue/3, 93 | system_terminate/4, 94 | system_code_change/4, 95 | format_status/2, 96 | worker_announce/2 97 | ]). 98 | 99 | %% Internal exports 100 | -export([real_loop/4, real_safe_loop/4, real_mon_loop/2]). 101 | -export([init_ping_loop/2, ping_loop/3]). 102 | -export([init_it/6, 103 | print_event/3, 104 | send_checkleads/4 105 | ]). 106 | 107 | 108 | %% Notification control of candidate membership changes. `all' 109 | %% means that returns from the handle_DOWN/3 and elected/3 leader's events 110 | %% will be broadcast to all candidates. 111 | -type bcast_type() :: 'all' | 'sender'. 112 | 113 | -type option() :: {'workers', Workers::[node()]} 114 | | {'vardir', Dir::string()} 115 | | {'bcast_type', Type::bcast_type()} 116 | | {'heartbeat', Seconds::integer()} 117 | | {'seed_node', Seed::node()} 118 | . 119 | 120 | -type options() :: [option()]. 121 | 122 | -type status() :: 'elec1' | 'elec2' | 'wait' | 'joining' | 'worker' | 123 | 'waiting_worker' | 'norm'. 124 | 125 | %% A locally registered name 126 | -type name() :: atom(). 127 | 128 | %% A monitor ref 129 | -type mon_ref() :: reference(). 130 | 131 | -type server_ref() :: name() | {name(), node()} | pid(). 132 | 133 | %% Incarnation number 134 | -type incarn() :: non_neg_integer(). 135 | 136 | %% Logical clock 137 | -type lclock() :: non_neg_integer(). 138 | 139 | %% Node priority in the election 140 | -type priority() :: integer(). 141 | 142 | %% Election id 143 | -type elid() :: {priority(), incarn(), lclock()}. 144 | 145 | %% See gen_server. 146 | -type caller_ref() :: {pid(), reference()}. 147 | 148 | %% Opaque state of the gen_leader behaviour. 149 | -record(election, { 150 | leader = none :: 'none' | pid(), 151 | previous_leader = none :: 'none' | pid(), 152 | name :: name(), 153 | leadernode = none :: 'none' | node(), 154 | candidate_nodes = [] :: [node()], 155 | worker_nodes = [] :: [node()], 156 | down = [] :: [node()], 157 | monitored = [] :: [{mon_ref(), node()}], 158 | buffered = [] :: [{reference(), caller_ref()}], 159 | seed_node = none :: 'none' | node(), 160 | status :: status(), 161 | elid :: elid(), 162 | acks = [] :: [node()], 163 | work_down = [] :: [node()], 164 | cand_timer_int :: integer(), 165 | cand_timer :: term(), 166 | pendack :: node(), 167 | incarn :: incarn(), 168 | nextel :: integer(), 169 | %% all | one. When `all' each election event 170 | %% will be broadcast to all candidate nodes. 171 | bcast_type :: bcast_type() 172 | }). 173 | 174 | -opaque election() :: #election{}. 175 | 176 | -export_type([election/0]). 177 | 178 | -record(server, { 179 | parent, 180 | mod, 181 | state, 182 | pinger_proc = spawn_pinger_proc(), 183 | monitor_proc = spawn_monitor_proc(), 184 | debug :: [sys:dbg_opt()] 185 | }). 186 | 187 | %%% --------------------------------------------------- 188 | %%% Interface functions. 189 | %%% --------------------------------------------------- 190 | 191 | -callback init(any()) -> 192 | {ok, term()} 193 | | {stop, term()} 194 | | ignore 195 | | {'EXIT', term()} 196 | . 
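%% Illustrative note: a concrete options() value matching the option() type
%% above could be [{workers, ['w@host3']}, {vardir, "/var/lib/myapp"},
%% {bcast_type, all}, {heartbeat, 5}] (node, path and values are hypothetical).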
197 | -callback elected(term(), election(), pid() | undefined) -> 198 | {ok, term(), term()} 199 | | {reply, term(), term()} 200 | . 201 | -callback surrendered(term(), term(), election()) -> 202 | {ok, term()} . 203 | -callback handle_leader_call(term(), pid(), term(), election()) -> 204 | {reply, term(), term()} 205 | | {reply, term(), term(), term()} 206 | | {noreply, term()} 207 | | {stop, term(), term(), term()} 208 | . 209 | -callback handle_leader_cast(term(), term(), election()) -> 210 | {noreply, term()} 211 | | {ok, term(), term()} 212 | . 213 | -callback from_leader(term(), term(), election()) -> 214 | {noreply, term()} 215 | | {ok, term()} 216 | | {stop, term(), term()} 217 | | {'EXIT', term()} 218 | . 219 | -callback handle_call(term(), pid(), term(), election()) -> 220 | {noreply, term()} 221 | | {reply, term(), term()} 222 | | {ok, term()} 223 | | {stop, term(), term()} 224 | | {'EXIT', term()} 225 | . 226 | -callback handle_cast(term(), term(), election()) -> 227 | {noreply, term()} 228 | | {ok, term()} 229 | | {stop, term(), term()} 230 | | {'EXIT', term()} 231 | . 232 | -callback handle_DOWN(node(), term(), election()) -> 233 | {ok, term()} 234 | | {ok, term(), term()} 235 | . 236 | -callback handle_info(term(), term(), election()) -> 237 | {noreply, term()} 238 | | {ok, term()} 239 | | {stop, term(), term()} 240 | | {'EXIT', term()} 241 | . 242 | -callback terminate(term(), term()) -> 243 | any() . 244 | -callback code_change(term() | {down, term()}, term(), election(), term()) -> 245 | {ok, term()} 246 | | {error, term()} 247 | . 248 | 249 | -on_load(notify_new_code/0). 250 | notify_new_code() -> 251 | [P ! code_reloaded || P <- processes(), erlang:check_process_code(P, ?MODULE) == true], 252 | ok. 253 | 254 | -type start_ret() :: {'ok', pid()} | {'error', term()}. 255 | 256 | %% @doc Starts a gen_leader process without linking to the parent. 257 | %% @see start_link/6 258 | -spec start(Name::atom(), CandidateNodes::[node()], OptArgs::options(), 259 | Mod::module(), Arg::term(), Options::list()) -> start_ret(). 260 | start(Name, CandidateNodes, OptArgs, Mod, Arg, Options) 261 | when is_atom(Name), is_list(CandidateNodes), is_list(OptArgs) -> 262 | gen:start(?MODULE, nolink, {local, Name}, 263 | Mod, {CandidateNodes, OptArgs, Arg}, Options). 264 | 265 | %% @doc Starts a gen_leader process. 266 | %% 267 | %% 268 | %% 270 | %% 271 | %% 291 | %% 292 | %% 293 | %% 294 | %%
%%   Name           - The locally registered name of the process
%%   CandidateNodes - The names of nodes capable of assuming
%%                    a leadership role
%%   OptArgs        - Optional arguments given to `gen_leader':
%%       {workers, Workers}
%%           The names of nodes that will be part of the "cluster",
%%           but cannot ever assume a leadership role. Default: [].
%%       {vardir, Dir}
%%           Directory name used to store candidate's incarnation cookie.
%%           Default: "."
%%       {bcast_type, Type}
%%           When `Type' is `all' each election event (when a new
%%           candidate becomes visible to the leader) will be broadcast
%%           to all live candidate nodes. Each candidate will get
%%           a from_leader/3 callback. When `Type' is `sender', only
%%           the newly registered candidate will get the surrendered/3
%%           callback. Default: `sender'.
%%       {heartbeat, Seconds}
%%           Heartbeat timeout value used to send ping messages to inactive
%%           candidate nodes.
%%   Mod            - The name of the callback module
%%   Arg            - Argument passed on to Mod:init/1
%%   Options        - Same as gen_server's Options
%%
%% The list of candidates needs to be known from the start. Workers
%% could potentially be added at runtime, but no functionality to do
%% this is provided by this version.
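%%
%% Example call (illustrative; node names and the callback module my_leader_cb
%% are hypothetical):
%%   minishard_gen_leader:start_link(my_leader, ['a@host1', 'b@host2'],
%%                                   [{heartbeat, 5}, {bcast_type, sender}],
%%                                   my_leader_cb, [], []).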

299 | %% @end 300 | -spec start_link(Name::atom(), CandidateNodes::[node()], OptArgs::options(), 301 | Mod::module(), Arg::term(), Options::list()) -> start_ret(). 302 | start_link(Name, CandidateNodes, OptArgs, Mod, Arg, Options) 303 | when is_atom(Name), is_list(CandidateNodes), is_list(OptArgs) -> 304 | gen:start(?MODULE, link, {local, Name}, 305 | Mod, {CandidateNodes, OptArgs, Arg}, Options). 306 | 307 | %% Query functions to be used from the callback module 308 | 309 | %% @doc Returns list of alive nodes. 310 | -spec alive(election()) -> [node()]. 311 | alive(E) -> 312 | candidates(E) -- down(E). 313 | 314 | %% @doc Returns list of down nodes. 315 | -spec down(election()) -> [node()]. 316 | down(#election{down = Down}) -> 317 | Down. 318 | 319 | %% @doc Returns the current leader node. 320 | -spec leader_node(election()) -> node() | 'none'. 321 | leader_node(#election{leadernode=Leader}) -> 322 | Leader. 323 | 324 | %% @doc Returns a list of known candidates. 325 | -spec candidates(election()) -> [node()]. 326 | candidates(#election{candidate_nodes = Cands}) -> 327 | Cands. 328 | 329 | %% @doc Returns a list of known workers. 330 | -spec workers(election()) -> [node()]. 331 | workers(#election{worker_nodes = Workers}) -> 332 | Workers. 333 | 334 | %% Used by dynamically added workers. 335 | %% @hidden 336 | worker_announce(Name, Pid) -> 337 | Name ! {add_worker, Pid}, 338 | Name ! {heartbeat, Pid}. 339 | 340 | %% 341 | %% Make a call to a generic server. 342 | %% If the server is located at another node, that node will 343 | %% be monitored. 344 | %% If the client is trapping exits and is linked server termination 345 | %% is handled here (? Shall we do that here (or rely on timeouts) ?). 346 | %% 347 | %% @doc Equivalent to gen_server:call/2, but with a slightly 348 | %% different exit reason if something goes wrong. This function calls 349 | %% the gen_leader process exactly as if it were a gen_server 350 | %% (which, for practical purposes, it is.) 351 | %% @end 352 | -spec call(server_ref(), term()) -> term(). 353 | call(Name, Request) -> 354 | case catch gen:call(Name, '$gen_call', Request) of 355 | {ok, Res} -> 356 | Res; 357 | {'EXIT', Reason} -> 358 | exit({Reason, {?MODULE, local_call, [Name, Request]}}) 359 | end. 360 | 361 | %% @doc Equivalent to gen_server:call/3, but with a slightly 362 | %% different exit reason if something goes wrong. This function calls 363 | %% the gen_leader process exactly as if it were a gen_server 364 | %% (which, for practical purposes, it is.) 365 | %% @end 366 | -spec call(server_ref(), term(), integer()) -> term(). 367 | call(Name, Request, Timeout) -> 368 | case catch gen:call(Name, '$gen_call', Request, Timeout) of 369 | {ok, Res} -> 370 | Res; 371 | {'EXIT', Reason} -> 372 | exit({Reason, {?MODULE, local_call, [Name, Request, Timeout]}}) 373 | end. 374 | 375 | %% @doc Makes a call (similar to gen_server:call/2) to the 376 | %% leader. The call is forwarded via the local gen_leader instance, if 377 | %% that one isn't actually the leader. The client will exit if the 378 | %% leader dies while the request is outstanding. 379 | %%

%% This function uses gen:call/3, and is subject to the
%% same default timeout as e.g. gen_server:call/2.
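%%
%% Example call (illustrative; the request term is hypothetical and must be
%% handled by the callback module's handle_leader_call/4):
%%   Reply = minishard_gen_leader:leader_call(my_leader, {get, key}).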

381 | %% @end 382 | %% 383 | -spec leader_call(Name::server_ref(), Request::term()) -> term(). 384 | leader_call(Name, Request) -> 385 | case catch gen:call(Name, '$leader_call', Request) of 386 | {ok, {leader, reply, Res}} -> 387 | Res; 388 | {ok, {error, leader_died}} -> 389 | exit({leader_died, {?MODULE, leader_call, [Name, Request]}}); 390 | {'EXIT', Reason} -> 391 | exit({Reason, {?MODULE, leader_call, [Name, Request]}}) 392 | end. 393 | 394 | %% @doc Makes a call (similar to gen_server:call/3) to the 395 | %% leader. The call is forwarded via the local gen_leader instance, if 396 | %% that one isn't actually the leader. The client will exit if the 397 | %% leader dies while the request is outstanding. 398 | %% @end 399 | %% 400 | -spec leader_call(Name::server_ref(), Request::term(), 401 | Timeout::integer()) -> term(). 402 | leader_call(Name, Request, Timeout) -> 403 | case catch gen:call(Name, '$leader_call', Request, Timeout) of 404 | {ok, {leader, reply, Res}} -> 405 | Res; 406 | {'EXIT', Reason} -> 407 | exit({Reason, {?MODULE, leader_call, [Name, Request, Timeout]}}) 408 | end. 409 | 410 | 411 | %% @equiv gen_server:cast/2 412 | -spec cast(Name::server_ref(), Request::term()) -> 'ok'. 413 | cast(Name, Request) -> 414 | catch do_cast('$gen_cast', Name, Request), 415 | ok. 416 | 417 | %% @doc Similar to gen_server:cast/2 but will be forwarded to 418 | %% the leader via the local gen_leader instance. 419 | -spec leader_cast(Name::server_ref(), Request::term()) -> 'ok'. 420 | leader_cast(Name, Request) -> 421 | catch do_cast('$leader_cast', Name, Request), 422 | ok. 423 | 424 | 425 | do_cast(Tag, ServerRef, Request) -> 426 | ServerRef ! {Tag, Request}. 427 | 428 | 429 | %% @equiv gen_server:reply/2 430 | -spec reply(From::caller_ref(), Reply::term()) -> term(). 431 | reply({To, Tag}, Reply) -> 432 | catch To ! {Tag, Reply}. 433 | 434 | 435 | %%% --------------------------------------------------- 436 | %%% Initiate the new process. 437 | %%% Register the name using the Rfunc function 438 | %%% Calls the Mod:init/Args function. 439 | %%% Finally an acknowledge is sent to Parent and the main 440 | %%% loop is entered. 
441 | %%% --------------------------------------------------- 442 | %%% @hidden 443 | init_it(Starter, Parent, {local, Name}, Mod, {CandidateNodes, Workers, Arg}, Options) -> 444 | %% R13B passes {local, Name} instead of just Name 445 | init_it(Starter, Parent, Name, Mod, 446 | {CandidateNodes, Workers, Arg}, Options); 447 | init_it(Starter, self, Name, Mod, {CandidateNodes, OptArgs, Arg}, Options) -> 448 | init_it(Starter, self(), Name, Mod, 449 | {CandidateNodes, OptArgs, Arg}, Options); 450 | init_it(Starter, Parent, Name, Mod, {UnsortedCandidateNodes, OptArgs, Arg}, Options) -> 451 | Workers = proplists:get_value(workers, OptArgs, []), 452 | VarDir = proplists:get_value(vardir, OptArgs, "."), 453 | Interval = proplists:get_value(heartbeat, OptArgs, ?TAU div 1000) * 1000, 454 | BcastType = proplists:get_value(bcast_type, OptArgs, sender), 455 | Seed = proplists:get_value(seed, OptArgs, none), 456 | Debug = debug_options(Name, Options), 457 | CandidateNodes = lists:sort(UnsortedCandidateNodes), 458 | [spawn_link(net_adm, ping, [Node]) || Node <- CandidateNodes], timer:sleep(1000), 459 | AmCandidate = case lists:member(node(), CandidateNodes) of 460 | true -> true; 461 | false -> 462 | case lists:member(node(), Workers) of 463 | true -> false; 464 | false -> 465 | Seed =/= none 466 | end 467 | end, 468 | 469 | Election = #election{ 470 | candidate_nodes = CandidateNodes, 471 | worker_nodes = Workers, 472 | name = Name, 473 | nextel = 0, 474 | cand_timer_int = Interval, 475 | bcast_type = BcastType 476 | }, 477 | 478 | case {AmCandidate, lists:member(node(), Workers)} of 479 | {false, false} -> 480 | %% I am neither a candidate nor a worker - don't start this process 481 | error_logger:warning_msg("~w not started - node is not a candidate/worker\n", [Name]), 482 | proc_lib:init_ack(Starter, ignore), 483 | exit(normal); 484 | _ -> 485 | ok 486 | end, 487 | 488 | case {catch Mod:init(Arg), AmCandidate, Seed =/= none} of 489 | {{stop, Reason}, _, _} -> 490 | proc_lib:init_ack(Starter, {error, Reason}), 491 | exit(Reason); 492 | {ignore, _, _} -> 493 | proc_lib:init_ack(Starter, ignore), 494 | exit(normal); 495 | {{'EXIT', Reason}, _, _} -> 496 | proc_lib:init_ack(Starter, {error, Reason}), 497 | exit(Reason); 498 | {{ok, State}, true, false} -> 499 | Server = #server{parent = Parent, mod = Mod, 500 | state = State, debug = Debug}, 501 | Incarn = incarnation(VarDir, Name, node()), 502 | NewE = startStage1(Election#election{incarn = Incarn}, Server), 503 | proc_lib:init_ack(Starter, {ok, self()}), 504 | 505 | %% handle the case where there's only one candidate worker and we can't 506 | %% rely on DOWN messages to trigger the elected() call because we never get 507 | %% a DOWN for ourselves 508 | case CandidateNodes =:= [node()] of 509 | true -> 510 | %% there's only one candidate leader; us 511 | hasBecomeLeader(NewE, Server, {init}); 512 | false -> 513 | %% more than one candidate worker, continue as normal 514 | safe_loop(Server, candidate, NewE, {init}) 515 | end; 516 | {{ok, State}, true, true} -> 517 | Server = #server{parent = Parent, mod = Mod, 518 | state = State, debug = Debug}, 519 | Incarn = incarnation(VarDir, Name, node()), 520 | NewE1 = Election#election{incarn = Incarn, seed_node = Seed}, 521 | NewE = joinCluster(NewE1, Server), 522 | proc_lib:init_ack(Starter, {ok, self()}), 523 | safe_loop(Server, candidate_joining, NewE, {init}); 524 | {{ok, State}, false, HasSeed} -> 525 | proc_lib:init_ack(Starter, {ok, self()}), 526 | Candidates = case HasSeed of 527 | true -> 528 | {ok, C} 
= call({Name, Seed}, get_candidates), 529 | C; 530 | false -> CandidateNodes 531 | end, 532 | case lists:member(node(), Workers) of 533 | true -> 534 | rpc:multicall(Candidates, ?MODULE, 535 | worker_announce, [Name, node(self())]); 536 | false -> nop 537 | end, 538 | safe_loop(#server{parent = Parent, mod = Mod, 539 | state = State, debug = Debug}, 540 | waiting_worker, Election, {init}); 541 | {Else, _, _} -> 542 | Error = {bad_return_value, Else}, 543 | proc_lib:init_ack(Starter, {error, Error}), 544 | exit(Error) 545 | end. 546 | 547 | 548 | %%% --------------------------------------------------- 549 | %%% The MAIN loops. 550 | %%% --------------------------------------------------- 551 | 552 | % this is the election loop. Only specific messages related 553 | % to the election process are received. User messages, defined 554 | % in e.g. a callback module, are postponed until the (re)election\ 555 | % is complete. 556 | safe_loop(#server{} = Server, Role, #election{} = Election, PrevMsg) -> 557 | ?MODULE:real_safe_loop(Server, Role, Election, PrevMsg). 558 | 559 | real_safe_loop(#server{mod = Mod, state = State} = Server, Role, 560 | #election{name = Name} = E, _PrevMsg) -> 561 | receive 562 | code_reloaded = Msg -> 563 | safe_loop(Server, Role, E, Msg); 564 | {system, From, Req} -> 565 | #server{parent = Parent, debug = Debug} = Server, 566 | sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug, 567 | [safe, Server, Role, E]); 568 | {'EXIT', _, Reason} = Msg -> 569 | terminate(Reason, Msg, Server, Role, E); 570 | {update_candidates, _, _, _} = Msg -> 571 | safe_loop(Server, Role, E, Msg); 572 | {halt, T, From} = Msg -> 573 | NewE = halting(E, T, From, Server), 574 | From ! {ackLeader, T, self()}, 575 | safe_loop(Server, Role, NewE, Msg); 576 | {hasLeader, Ldr, T, _} = Msg when Role == candidate_joining -> 577 | NewE1 = mon_node(E, Ldr, Server), 578 | NewE = NewE1#election{elid = T, leadernode = node(Ldr)}, 579 | Ldr ! {isLeader, T, self()}, 580 | safe_loop(Server, Role, NewE, Msg); 581 | {hasLeader, Ldr, T, _} = Msg -> 582 | NewE1 = mon_node(E, Ldr, Server), 583 | case ( (E#election.status == elec2) and (E#election.acks /= []) ) of 584 | true -> 585 | lists:foreach( 586 | fun(Node) -> 587 | {Name, Node} ! {hasLeader, Ldr, T, self()} 588 | end, E#election.acks); 589 | false -> 590 | ok 591 | end, 592 | NewE = NewE1#election{elid = T, 593 | status = wait, 594 | leadernode = node(Ldr), 595 | down = E#election.down -- [node(Ldr)], 596 | acks = []}, 597 | Ldr ! {isLeader, T, self()}, 598 | safe_loop(Server, Role, NewE, Msg); 599 | {isLeader, T, From} = Msg -> 600 | From ! 
{notLeader, T, self()}, 601 | safe_loop(Server, Role, E, Msg); 602 | {notLeader, T, _} = Msg when Role == candidate_joining -> 603 | NewE = case E#election.elid == T of 604 | true -> 605 | joinCluster(E, Server); 606 | false -> 607 | E 608 | end, 609 | safe_loop(Server, Role, NewE, Msg); 610 | {notLeader, T, _} = Msg -> 611 | NewE = 612 | case ((E#election.status == wait) and (E#election.elid == T)) of 613 | true -> 614 | startStage1(E, Server); 615 | false -> 616 | E 617 | end, 618 | safe_loop(Server, Role, NewE, Msg); 619 | {ackLeader, T, From} = Msg -> 620 | NewE = 621 | case ( (E#election.status == elec2) and (E#election.elid == T) 622 | and (E#election.pendack == node(From)) ) of 623 | true -> 624 | continStage2( 625 | E#election{acks = [node(From)|E#election.acks]}, 626 | Server); 627 | false -> 628 | E 629 | end, 630 | hasBecomeLeader(NewE, Server, Msg); 631 | 632 | {ldr, Synch, T, _, _, From} = Msg when Role == waiting_worker -> 633 | case ( (T == E#election.elid) 634 | and (node(From) == E#election.leadernode)) of 635 | true -> 636 | NewE = E#election{ leader = From, status = worker }, 637 | {ok, NewState} = Mod:surrendered(State, Synch, NewE), 638 | loop(Server#server{state = NewState}, worker, NewE, Msg); 639 | false -> 640 | %% This should be a VERY special case... 641 | %% But doing nothing is the right thing! 642 | %% A DOWN message should arrive to solve this situation 643 | safe_loop(Server, Role, E, Msg) 644 | end; 645 | {ldr, Synch, T, Workers, Candidates, From} = Msg -> 646 | case ( ( (E#election.status == wait) or (E#election.status == joining) ) 647 | and (E#election.elid == T) ) of 648 | true -> 649 | timer:cancel(E#election.cand_timer), 650 | NewE1 = mon_node(E, From, Server), 651 | NewE2 = NewE1#election{leader = From, 652 | leadernode = node(From), 653 | previous_leader = E#election.leader, 654 | worker_nodes = Workers, 655 | candidate_nodes = Candidates, 656 | status = norm, 657 | cand_timer=undefined}, 658 | NewE = case Role == candidate_joining of 659 | true -> 660 | mon_nodes(NewE2, lesser(node(), candidates(NewE2)), Server); 661 | false -> NewE2 662 | end, 663 | {ok, NewState} = Mod:surrendered(State, Synch, NewE), 664 | loop(Server#server{state = NewState}, surrendered, NewE, Msg); 665 | false -> 666 | safe_loop(Server, Role, E, Msg) 667 | end; 668 | {normQ, T, From} = Msg -> 669 | NewE = 670 | case ( (E#election.status == elec1) 671 | or ( (E#election.status == wait) 672 | and (E#election.elid == T) ) ) of 673 | true -> 674 | NE = halting(E, T, From, Server), 675 | From ! {notNorm, T, self()}, 676 | NE; 677 | false -> 678 | E 679 | end, 680 | safe_loop(Server, Role, NewE, Msg); 681 | {notNorm, _, _} = Msg -> 682 | safe_loop(Server, Role, E, Msg); 683 | {workerAlive, T, From} = Msg -> 684 | NewE = 685 | case E#election.leadernode == none of 686 | true -> 687 | %% We should initiate activation, 688 | %% monitor the possible leader! 689 | NE = mon_node(E#election{leadernode = node(From), 690 | elid = T}, 691 | From, Server), 692 | From ! {workerIsAlive, T, self()}, 693 | NE; 694 | false -> 695 | %% We should acutally ignore this, the present activation 696 | %% will complete or abort first... 697 | E 698 | end, 699 | safe_loop(Server, Role, NewE, Msg); 700 | {workerIsAlive, _, _} = Msg -> 701 | %% If this happens, the activation process should abort 702 | %% This process is no longer the leader! 
703 | %% The sender will notice this via a DOWN message 704 | safe_loop(Server, Role, E, Msg); 705 | {election} = Msg -> 706 | %% We're already in an election, so this is likely an old message. 707 | safe_loop(Server, Role, E, Msg); 708 | {heartbeat, _Node} = Msg -> 709 | safe_loop(Server, Role, E, Msg); 710 | {candidate_timer} = Msg -> 711 | Down = E#election.down, 712 | Server#server.pinger_proc ! {set_ping_nodes, Down}, 713 | NewE = 714 | case Down of 715 | [] -> 716 | timer:cancel(E#election.cand_timer), 717 | E#election{cand_timer = undefined}; 718 | Down -> 719 | %% get rid of any queued up candidate_timers, since we just handled one 720 | flush_candidate_timers(), 721 | %% Some of potential master candidate nodes are down. 722 | %% Try to wake them up 723 | F = fun(N) -> 724 | erlang:send({E#election.name, N}, {heartbeat, node()}, [nosuspend, noconnect]) 725 | end, 726 | [F(N) || N <- Down, {ok, up} =/= net_kernel:node_info(N, state)], 727 | E 728 | end, 729 | safe_loop(Server, Role, halt_pendack(NewE), Msg); 730 | {checklead, Node} = Msg -> 731 | %% in the very exceptional case when a candidate comes up when the 732 | %% elected leader is *behind* it in the candidate list *and* all nodes 733 | %% before it in the candidate list are up, the candidate will be stuck in 734 | %% the safe_loop forever. This is because gen_leader relies on either 735 | %% one of the nodes being down, or the nodes responding to the heartbeat 736 | %% sent as part of stage1. However, nodes that are up but are NOT the 737 | %% leader do not respond to heartbeats. In this very exceptional case, 738 | %% we send a heartbeat to the leader in response to the checklead it 739 | %% sent us to bootstrap things and get out of this quagmire. 740 | case lists:member(Node, E#election.candidate_nodes) and 741 | (E#election.status == elec1) of 742 | true -> 743 | case ( pos(Node, E#election.candidate_nodes) > 744 | pos(node(), E#election.candidate_nodes) ) of 745 | true -> 746 | {Name, Node} ! 
{heartbeat, self()}; 747 | _ -> 748 | ok 749 | end; 750 | _ -> 751 | ok 752 | end, 753 | safe_loop(Server, Role, E, Msg); 754 | {ldr, 'DOWN', Node} = Msg when Role == waiting_worker -> 755 | NewE = 756 | case Node == E#election.leadernode of 757 | true -> 758 | E#election{leader = none, leadernode = none, 759 | previous_leader = E#election.leader, 760 | status = waiting_worker, 761 | monitored = []}; 762 | false -> 763 | E 764 | end, 765 | safe_loop(Server, Role, NewE, Msg); 766 | {ldr, 'DOWN', Node} = Msg when Role == candidate_joining -> 767 | Ldr = E#election.leadernode, 768 | Seed = E#election.seed_node, 769 | case Node of 770 | Seed -> 771 | case net_adm:ping(Ldr) of 772 | pong -> noop; 773 | pang -> 774 | terminate(seed_nodes_down, Msg, Server, Role, E) 775 | end; 776 | Ldr -> 777 | case net_adm:ping(Seed) of 778 | pong -> 779 | NewE = joinCluster(E, Server), 780 | safe_loop(Server, Role, NewE, Msg); 781 | pang -> 782 | terminate(seed_nodes_down, Msg, Server, Role, E) 783 | end 784 | end; 785 | {ldr, 'DOWN', Node} = Msg -> 786 | NewMon = lists:keydelete(Node, 2, E#election.monitored), 787 | NewE = 788 | case lists:member(Node, E#election.candidate_nodes) of 789 | true -> 790 | NewDown = [Node | E#election.down], 791 | E1 = E#election{down = NewDown, monitored = NewMon}, 792 | case ( pos(Node, E#election.candidate_nodes) < 793 | pos(node(), E#election.candidate_nodes) ) of 794 | true -> 795 | Lesser = lesser(node(), E#election.candidate_nodes), 796 | LesserIsSubset = (Lesser -- NewDown) == [], 797 | case ((E#election.status == wait) 798 | and (Node == E#election.leadernode)) of 799 | true -> 800 | startStage1(E1, Server); 801 | false -> 802 | case ((E#election.status == elec1) and 803 | LesserIsSubset) of 804 | true -> 805 | startStage2( 806 | E1#election{down = Lesser}, 807 | Server); 808 | false -> 809 | E1 810 | end 811 | end; 812 | false -> 813 | case ( (E#election.status == elec2) 814 | and (Node == E#election.pendack) ) of 815 | true -> 816 | continStage2(E1, Server); 817 | false -> 818 | case ( (E#election.status == wait) 819 | and (Node == E#election.leadernode)) of 820 | true -> 821 | startStage1(E1, Server); 822 | false -> 823 | E1 824 | end 825 | end 826 | end 827 | end, 828 | hasBecomeLeader(NewE, Server, Msg) 829 | end. 830 | 831 | 832 | % this is the regular operation loop. All messages are received, 833 | % unexpected ones are discarded. 834 | loop(#server{} = Server, Role, #election{} = Election, PrevMsg) -> 835 | ?MODULE:real_loop(Server, Role, Election, PrevMsg). 836 | 837 | real_loop(#server{parent = Parent, 838 | mod = Mod, 839 | state = State, 840 | debug = Debug} = Server, Role, 841 | #election{name = Name} = E, _PrevMsg) -> 842 | receive 843 | Msg -> 844 | case Msg of 845 | code_reloaded -> 846 | loop(Server, Role, E, Msg); 847 | {system, From, Req} -> 848 | sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug, 849 | [normal, Server, Role, E]); 850 | {'EXIT', Parent, Reason} -> 851 | terminate(Reason, Msg, Server, Role, E); 852 | 853 | {join, From} -> 854 | From ! 
{hasLeader, E#election.leader, E#election.elid, self()}, 855 | loop(Server, Role, E, Msg); 856 | {update_candidates, T, Candidates, _From} -> 857 | case E#election.elid == T of 858 | true -> 859 | NewE = E#election{candidate_nodes = Candidates}, 860 | loop(Server, Role, NewE, Msg); 861 | false -> 862 | loop(Server, Role, E, Msg) 863 | end; 864 | {halt, _, From} -> 865 | T = E#election.elid, 866 | case E#election.leader of 867 | From -> 868 | % The process we consider to be a leader seems to be in elec1 stage. 869 | % So we downgrade to it too 870 | NewE = startStage1(E, Server), 871 | safe_loop(Server, candidate, NewE, Msg); 872 | OtherLeader -> 873 | From ! {hasLeader, OtherLeader, T, self()}, 874 | loop(Server, Role, E, Msg) 875 | end; 876 | {hasLeader, _, _, _} -> 877 | loop(Server, Role, E, Msg); 878 | {isLeader, T, From} -> 879 | case (self() == E#election.leader) of 880 | true -> 881 | NewCandidates = 882 | case lists:member(node(From), candidates(E)) of 883 | true -> candidates(E); 884 | false -> 885 | NC = candidates(E) ++ [node(From)], 886 | lists:foreach( 887 | fun(Node) -> 888 | {Name, Node} ! 889 | {update_candidates, E#election.elid, 890 | NC, self()} 891 | end, candidates(E) -- lists:flatten([node()], down(E))), 892 | NC 893 | end, 894 | NewDown = E#election.down -- [node(From)], 895 | NewE1 = mon_node(E#election{down = NewDown}, 896 | From, Server), 897 | NewE = NewE1#election{candidate_nodes = NewCandidates}, 898 | NewState = call_elected(Mod, State, NewE, From), 899 | loop(Server#server{state = NewState}, Role, NewE, Msg); 900 | false -> 901 | From ! {notLeader, T, self()}, 902 | loop(Server, Role, E, Msg) 903 | end; 904 | {ackLeader, _, _} -> 905 | loop(Server, Role, E, Msg); 906 | {notLeader, _, _} -> 907 | loop(Server, Role, E, Msg); 908 | {ack, _, _} -> 909 | loop(Server, Role, E, Msg); 910 | {ldr, _, _, _, _} -> 911 | loop(Server, Role, E, Msg); 912 | {normQ, _, _} -> 913 | loop(Server, Role, E, Msg); 914 | {notNorm, T, From} -> 915 | case ( (E#election.leader == self()) 916 | and (E#election.elid == T) ) of 917 | true -> 918 | NewDown = E#election.down -- [node(From)], 919 | NewE = mon_node(E#election{down = NewDown}, 920 | From, Server), 921 | NewState = call_elected(Mod, State, NewE, From), 922 | loop(Server#server{state = NewState}, Role, NewE, Msg); 923 | false -> 924 | loop(Server, Role, E, Msg) 925 | end; 926 | {workerAlive, _, _} -> 927 | %% Do nothing if we get this from a new leader 928 | %% We will soon notice that the prev leader has died, and 929 | %%get the same message again when we are back in safe_loop! 930 | loop(Server, Role, E, Msg); 931 | {activateWorker, _, _, _} -> 932 | %% We ignore this, we are already active... 933 | %% It must be an old message! 934 | loop(Server, Role, E, Msg); 935 | {workerIsAlive, T, From} -> 936 | case ((T == E#election.elid) and (self() == E#election.leader)) of 937 | true -> 938 | NewDown = E#election.work_down -- [node(From)], 939 | NewE = mon_node(E#election{work_down = NewDown}, 940 | From, Server), 941 | NewState = call_elected(Mod, State, NewE, From), 942 | loop(Server#server{state = NewState}, Role, NewE, Msg); 943 | false -> 944 | loop(Server, Role, E, Msg) 945 | end; 946 | {election} -> 947 | %% Told to do an election because of a leader conflict. 
948 | E1 = startStage1(E, Server), 949 | safe_loop(Server, candidate, E1, Msg); 950 | {checklead, Node} -> 951 | case (E#election.leadernode == Node) of 952 | true -> 953 | %% Leaders match, nothing to do 954 | loop(Server, Role, E, Msg); 955 | false when E#election.leader == self() -> 956 | %% We're a leader and we disagree with the other 957 | %% leader. Tell everyone else to have an election. 958 | lists:foreach( 959 | fun(N) -> 960 | {Name, N} ! {election} 961 | end, E#election.candidate_nodes), 962 | %% Start participating in the election ourselves. 963 | E1 = startStage1(E, Server), 964 | safe_loop(Server, candidate, E1, Msg); 965 | false -> 966 | %% Not a leader, just wait to be told to do an 967 | %% election, if applicable. 968 | loop(Server, Role, E, Msg) 969 | end; 970 | {send_checklead} -> 971 | case (E#election.leader == self()) of 972 | true -> 973 | case E#election.down of 974 | [] -> 975 | loop(Server, Role, E, Msg); 976 | Down -> 977 | %% For any nodes which are down, send them 978 | %% a message comparing their leader to our 979 | %% own. This allows us to trigger an 980 | %% election after a netsplit is healed. 981 | spawn(?MODULE, send_checkleads, [Name, E#election.cand_timer_int, self(), Down]), 982 | loop(Server, Role, E, Msg) 983 | end; 984 | false -> 985 | loop(Server, Role, E, Msg) 986 | end; 987 | {heartbeat, _Node} -> 988 | case (E#election.leader == self()) of 989 | true -> 990 | Candidates = E#election.down -- [lists:nth(1, E#election.candidate_nodes)], 991 | lists:foreach( 992 | fun(N) -> 993 | Elid = E#election.elid, 994 | erlang:send({Name, N}, {normQ, Elid, self()}, [nosuspend, noconnect]) 995 | end, Candidates), 996 | lists:foreach( 997 | fun(N) -> 998 | Elid = E#election.elid, 999 | erlang:send({Name, N}, {workerAlive, Elid, self()}, [nosuspend, noconnect]) 1000 | end, E#election.work_down); 1001 | false -> 1002 | ok 1003 | end, 1004 | loop(Server, Role, E, Msg); 1005 | {candidate_timer} = Msg 1006 | when E#election.down =:= [] orelse (Role =/= elected andalso E#election.leadernode =/= none) -> 1007 | timer:cancel(E#election.cand_timer), 1008 | loop(Server, Role, E#election{cand_timer=undefined}, Msg); 1009 | {candidate_timer} = Msg -> 1010 | %% get rid of any queued up candidate_timers, 1011 | %% since we just handled one 1012 | flush_candidate_timers(), 1013 | %% This shouldn't happen in the leader - just ignore 1014 | loop(Server, Role, E, Msg); 1015 | {ldr, 'DOWN', Node} = Msg when Role == worker -> 1016 | case Node == E#election.leadernode of 1017 | true -> 1018 | NewE = E#election{ leader = none, leadernode = none, 1019 | status = waiting_worker, 1020 | monitored = []}, 1021 | safe_loop(Server, waiting_worker, NewE, Msg); 1022 | false -> 1023 | loop(Server, Role, E, Msg) 1024 | end; 1025 | {ldr, 'DOWN', Node} = Msg -> 1026 | NewMon = lists:keydelete(Node, 2, E#election.monitored), 1027 | case lists:member(Node, E#election.candidate_nodes) of 1028 | true -> 1029 | NewDown = [Node | E#election.down], 1030 | E1 = E#election{down = NewDown, monitored = NewMon}, 1031 | case (Node == E#election.leadernode) of 1032 | true -> 1033 | NewE = startStage1(E1, Server), 1034 | safe_loop(Server, candidate, NewE, Msg); 1035 | false when E#election.leadernode =:= node() -> 1036 | %% Serge: call handle_DOWN 1037 | {NewState, NewE} = 1038 | case (Server#server.mod):handle_DOWN(Node, Server#server.state, E1) of 1039 | {ok, NewState1} -> 1040 | {NewState1, E1}; 1041 | {ok, Synch, NewState1} -> 1042 | {NewState1, broadcast({from_leader, Synch}, E1)} 1043 | end, 1044 | %% 
We're the leader and one of our 1045 | %% candidates has gone down. Start sending 1046 | %% out checklead messages to the downed 1047 | %% candidates so we can quickly trigger an 1048 | %% election, if this was a netsplit when 1049 | %% its healed. 1050 | {Name, node()} ! {send_checklead}, 1051 | loop(Server#server{state=NewState}, Role, NewE, Msg); 1052 | false -> 1053 | loop(Server, Role, E1, Msg) 1054 | end; 1055 | false -> 1056 | %% I am the leader, 1057 | %% make sure the dead worker is in work_down. 1058 | E1 = E#election{ 1059 | monitored = NewMon, 1060 | work_down = [Node | 1061 | (E#election.work_down -- [Node])] 1062 | }, 1063 | loop(Server, Role, E1, Msg) 1064 | end; 1065 | {add_worker, WorkerNode} -> 1066 | case lists:member(WorkerNode, E#election.worker_nodes) of 1067 | false -> 1068 | {WNodes, DNodes} = {E#election.worker_nodes, E#election.work_down}, 1069 | 1070 | loop(Server, Role, E#election{worker_nodes=[WorkerNode|WNodes], 1071 | work_down=[WorkerNode|DNodes]}, 1072 | Msg); 1073 | true -> % Redundancy, meet the mirror 1074 | loop(Server, Role, E, Msg) 1075 | end; 1076 | _Msg when Debug == [] -> 1077 | handle_msg(Msg, Server, Role, E); 1078 | _Msg -> 1079 | Debug1 = sys:handle_debug(Debug, fun ?MODULE:print_event/3, 1080 | E#election.name, {in, Msg}), 1081 | handle_msg(Msg, Server#server{debug = Debug1}, Role, E) 1082 | end 1083 | end. 1084 | 1085 | %%----------------------------------------------------------------- 1086 | %% Callback functions for system messages handling. 1087 | %%----------------------------------------------------------------- 1088 | %% @hidden 1089 | system_continue(_Parent, _Debug, [safe, Server, Role, E]) -> 1090 | safe_loop(Server, Role, E, {}); 1091 | system_continue(_Parent, _Debug, [normal, Server, Role, E]) -> 1092 | loop(Server, Role, E, {}). 1093 | 1094 | %% @hidden 1095 | -spec system_terminate(any(), any(), any(), any()) -> no_return() . 1096 | system_terminate(Reason, _Parent, _Debug, [_Mode, Server, Role, E]) -> 1097 | terminate(Reason, [], Server, Role, E). 1098 | 1099 | %% @hidden 1100 | system_code_change([Mode, Server, Role, E], _Module, OldVsn, Extra) -> 1101 | #server{mod = Mod, state = State} = Server, 1102 | case catch Mod:code_change(OldVsn, State, E, Extra) of 1103 | {ok, NewState} -> 1104 | NewServer = Server#server{state = NewState}, 1105 | {ok, [Mode, NewServer, Role, E]}; 1106 | {ok, NewState, NewE} -> 1107 | NewServer = Server#server{state = NewState}, 1108 | {ok, [Mode, NewServer, Role, NewE]}; 1109 | Else -> Else 1110 | end. 1111 | 1112 | %%----------------------------------------------------------------- 1113 | %% Format debug messages. Print them as the call-back module sees 1114 | %% them, not as the real erlang messages. Use trace for that. 
1115 | %%----------------------------------------------------------------- 1116 | %% @hidden 1117 | print_event(Dev, {in, Msg}, Name) -> 1118 | case Msg of 1119 | {'$gen_call', {From, _Tag}, Call} -> 1120 | io:format(Dev, "*DBG* ~p got local call ~p from ~w~n", 1121 | [Name, Call, From]); 1122 | {'$leader_call', {From, _Tag}, Call} -> 1123 | io:format(Dev, "*DBG* ~p got global call ~p from ~w~n", 1124 | [Name, Call, From]); 1125 | {'$gen_cast', Cast} -> 1126 | io:format(Dev, "*DBG* ~p got local cast ~p~n", 1127 | [Name, Cast]); 1128 | {'$leader_cast', Cast} -> 1129 | io:format(Dev, "*DBG* ~p got global cast ~p~n", 1130 | [Name, Cast]); 1131 | _ -> 1132 | io:format(Dev, "*DBG* ~p got ~p~n", [Name, Msg]) 1133 | end; 1134 | print_event(Dev, {out, Msg, To, State}, Name) -> 1135 | io:format(Dev, "*DBG* ~p sent ~p to ~w, new state ~w~n", 1136 | [Name, Msg, To, State]); 1137 | print_event(Dev, {noreply, State}, Name) -> 1138 | io:format(Dev, "*DBG* ~p new state ~w~n", [Name, State]); 1139 | print_event(Dev, Event, Name) -> 1140 | io:format(Dev, "*DBG* ~p dbg ~p~n", [Name, Event]). 1141 | 1142 | 1143 | handle_msg({'$leader_call', From, Request} = Msg, 1144 | #server{mod = Mod, state = State} = Server, elected = Role, E) -> 1145 | case catch Mod:handle_leader_call(Request, From, State, E) of 1146 | {reply, Reply, NState} -> 1147 | NewServer = reply(From, {leader, reply, Reply}, 1148 | Server#server{state = NState}, Role, E), 1149 | loop(NewServer, Role, E, Msg); 1150 | {reply, Reply, Broadcast, NState} -> 1151 | NewE = broadcast({from_leader, Broadcast}, E), 1152 | NewServer = reply(From, {leader, reply, Reply}, 1153 | Server#server{state = NState}, Role, 1154 | NewE), 1155 | loop(NewServer, Role, NewE, Msg); 1156 | {noreply, NState} = Reply -> 1157 | NewServer = handle_debug(Server#server{state = NState}, 1158 | Role, E, Reply), 1159 | loop(NewServer, Role, E, Msg); 1160 | {stop, Reason, Reply, NState} -> 1161 | {'EXIT', R} = 1162 | (catch terminate(Reason, Msg, 1163 | Server#server{state = NState}, 1164 | Role, E)), 1165 | reply(From, Reply), 1166 | exit(R); 1167 | Other -> 1168 | handle_common_reply(Other, Msg, Server, Role, E) 1169 | end; 1170 | handle_msg({from_leader, Cmd} = Msg, 1171 | #server{mod = Mod, state = State} = Server, Role, E) -> 1172 | NewE = check_candidates(E), 1173 | handle_common_reply(catch Mod:from_leader(Cmd, State, NewE), 1174 | Msg, Server, Role, NewE); 1175 | handle_msg({'$leader_call', From, Request} = Msg, Server, Role, 1176 | #election{buffered = Buffered, leader = Leader} = E) -> 1177 | Ref = make_ref(), 1178 | Leader ! 
{'$leader_call', {self(), Ref}, Request}, 1179 | NewBuffered = [{Ref, From}|Buffered], 1180 | loop(Server, Role, E#election{buffered = NewBuffered}, Msg); 1181 | handle_msg({Ref, {leader, reply, Reply}} = Msg, Server, Role, 1182 | #election{buffered = Buffered} = E) -> 1183 | {value, {_, From}} = lists:keysearch(Ref, 1, Buffered), 1184 | El = E#election{buffered = lists:keydelete(Ref, 1, Buffered)}, 1185 | 1186 | NewServer = reply(From, {leader, reply, Reply}, Server, Role, El), 1187 | 1188 | loop(NewServer, Role, El, Msg); 1189 | handle_msg({'$gen_call', From, get_candidates} = Msg, Server, Role, E) -> 1190 | NewServer = reply(From, {ok, candidates(E)}, Server, Role, E), 1191 | loop(NewServer, Role, E, Msg); 1192 | handle_msg({'$gen_call', From, Request} = Msg, 1193 | #server{mod = Mod, state = State} = Server, Role, E) -> 1194 | case catch Mod:handle_call(Request, From, State, E) of 1195 | {reply, Reply, NState} -> 1196 | NewServer = reply(From, Reply, 1197 | Server#server{state = NState}, Role, E), 1198 | loop(NewServer, Role, E, Msg); 1199 | {noreply, NState} = Reply -> 1200 | NewServer = handle_debug(Server#server{state = NState}, 1201 | Role, E, Reply), 1202 | loop(NewServer, Role, E, Msg); 1203 | {stop, Reason, Reply, NState} -> 1204 | {'EXIT', R} = 1205 | (catch terminate(Reason, Msg, Server#server{state = NState}, 1206 | Role, E)), 1207 | reply(From, Reply), 1208 | exit(R); 1209 | Other -> 1210 | handle_common_reply(Other, Msg, Server, Role, E) 1211 | end; 1212 | handle_msg({'$gen_cast', Msg} = Cast, 1213 | #server{mod = Mod, state = State} = Server, Role, E) -> 1214 | handle_common_reply(catch Mod:handle_cast(Msg, State, E), 1215 | Cast, Server, Role, E); 1216 | handle_msg({'$leader_cast', Msg} = Cast, 1217 | #server{mod = Mod, state = State} = Server, elected = Role, E) -> 1218 | case catch Mod:handle_leader_cast(Msg, State, E) of 1219 | {noreply, NState} -> 1220 | NewServer = handle_debug(Server#server{state = NState}, 1221 | Role, E, Cast), 1222 | loop(NewServer, Role, E, Cast); 1223 | {ok, Broadcast, NState} -> 1224 | NewE = broadcast({from_leader, Broadcast}, E), 1225 | NewServer = handle_debug(Server#server{state = NState}, 1226 | Role, E, Cast), 1227 | loop(NewServer, Role, NewE, Cast); 1228 | Other -> 1229 | handle_common_reply(Other, Msg, Server, Role, E) 1230 | end; 1231 | handle_msg({'$leader_cast', Msg} = Cast, Server, Role, 1232 | #election{leader = Leader} = E) -> 1233 | Leader ! {'$leader_cast', Msg}, 1234 | loop(Server, Role, E, Cast); 1235 | 1236 | handle_msg(Msg, #server{mod = Mod, state = State} = Server, Role, E) -> 1237 | handle_common_reply(catch Mod:handle_info(Msg, State, E), 1238 | Msg, Server, Role, E). 1239 | 1240 | 1241 | handle_common_reply(Reply, Msg, Server, Role, E) -> 1242 | case Reply of 1243 | {noreply, NState} -> 1244 | NewServer = handle_debug(Server#server{state = NState}, 1245 | Role, E, Reply), 1246 | loop(NewServer, Role, E, Msg); 1247 | {ok, NState} -> 1248 | NewServer = handle_debug(Server#server{state = NState}, 1249 | Role, E, Reply), 1250 | loop(NewServer, Role, E, Msg); 1251 | {stop, Reason, NState} -> 1252 | terminate(Reason, Msg, Server#server{state = NState}, Role, E); 1253 | {'EXIT', Reason} -> 1254 | terminate(Reason, Msg, Server, Role, E); 1255 | _ -> 1256 | terminate({bad2_return_value, Reply}, Msg, Server, Role, E) 1257 | end. 1258 | 1259 | 1260 | reply({To, Tag}, Reply, #server{state = State} = Server, Role, E) -> 1261 | reply({To, Tag}, Reply), 1262 | handle_debug(Server, Role, E, {out, Reply, To, State}). 
1263 | 1264 | 1265 | handle_debug(#server{debug = []} = Server, _Role, _E, _Event) -> 1266 | Server; 1267 | handle_debug(#server{debug = Debug} = Server, _Role, E, Event) -> 1268 | Debug1 = sys:handle_debug(Debug, fun ?MODULE:print_event/3, 1269 | E#election.name, Event), 1270 | Server#server{debug = Debug1}. 1271 | 1272 | %%% --------------------------------------------------- 1273 | %%% Terminate the server. 1274 | %%% --------------------------------------------------- 1275 | 1276 | terminate(Reason, Msg, #server{mod = Mod, 1277 | state = State, 1278 | debug = Debug} = _Server, _Role, 1279 | #election{name = Name, cand_timer = Timer} = _E) -> 1280 | timer:cancel(Timer), 1281 | case catch Mod:terminate(Reason, State) of 1282 | {'EXIT', R} -> 1283 | error_info(R, Name, Msg, State, Debug), 1284 | exit(R); 1285 | _ -> 1286 | case Reason of 1287 | normal -> 1288 | exit(normal); 1289 | shutdown -> 1290 | exit(shutdown); 1291 | _ -> 1292 | error_info(Reason, Name, Msg, State, Debug), 1293 | exit(Reason) 1294 | end 1295 | end. 1296 | 1297 | %% Maybe we shouldn't do this? We have the crash report... 1298 | error_info(Reason, Name, Msg, State, Debug) -> 1299 | error_logger:format("** Generic leader ~p terminating \n" 1300 | "** Last message in was ~p~n" 1301 | "** When Server state == ~p~n" 1302 | "** Reason for termination == ~n** ~p~n", 1303 | [Name, Msg, State, Reason]), 1304 | sys:print_log(Debug), 1305 | ok. 1306 | 1307 | %%% --------------------------------------------------- 1308 | %%% Misc. functions. 1309 | %%% --------------------------------------------------- 1310 | 1311 | opt(Op, [{Op, Value}|_]) -> 1312 | {ok, Value}; 1313 | opt(Op, [_|Options]) -> 1314 | opt(Op, Options); 1315 | opt(_, []) -> 1316 | false. 1317 | 1318 | debug_options(Name, Opts) -> 1319 | case opt(debug, Opts) of 1320 | {ok, Options} -> dbg_options(Name, Options); 1321 | _ -> dbg_options(Name, []) 1322 | end. 1323 | 1324 | dbg_options(Name, []) -> 1325 | Opts = 1326 | case init:get_argument(generic_debug) of 1327 | error -> 1328 | []; 1329 | _ -> 1330 | [log, statistics] 1331 | end, 1332 | dbg_opts(Name, Opts); 1333 | dbg_options(Name, Opts) -> 1334 | dbg_opts(Name, Opts). 1335 | 1336 | dbg_opts(Name, Opts) -> 1337 | case catch sys:debug_options(Opts) of 1338 | {'EXIT', _} -> 1339 | error_logger:format("~p: ignoring erroneous debug options - ~p~n", 1340 | [Name, Opts]), 1341 | []; 1342 | Dbg -> 1343 | Dbg 1344 | end. 1345 | 1346 | %%----------------------------------------------------------------- 1347 | %% Status information 1348 | %%----------------------------------------------------------------- 1349 | %% @hidden 1350 | format_status(Opt, StatusData) -> 1351 | [PDict, SysState, Parent, Debug, [_Mode, Server, _Role, E]] = StatusData, 1352 | Header = lists:concat(["Status for generic server ", E#election.name]), 1353 | Log = sys:get_debug(log, Debug, []), 1354 | #server{mod = Mod, state = State} = Server, 1355 | Specific = 1356 | case erlang:function_exported(Mod, format_status, 2) of 1357 | true -> 1358 | case catch apply(Mod, format_status, [Opt, [PDict, State]]) of 1359 | {'EXIT', _} -> [{data, [{"State", State}]}]; 1360 | Else -> Else 1361 | end; 1362 | _ -> 1363 | [{data, [{"State", State}]}] 1364 | end, 1365 | [{header, Header}, 1366 | {data, [{"Status", SysState}, 1367 | {"Parent", Parent}, 1368 | {"Logged events", Log}]} | 1369 | Specific]. 
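%% The debug plumbing above follows the standard 'sys' conventions, so a running
%% instance can be inspected like any gen_* process once it was started with a
%% non-empty debug option list (or with the emulator flag handled by
%% dbg_options/2). An illustrative shell session; RegName stands for the locally
%% registered name of the leader process and is a placeholder:
%%
%%   1> sys:get_status(RegName).   %% rendered by format_status/2 above
%%   2> sys:trace(RegName, true).  %% events are printed via print_event/3
%%   3> sys:log(RegName, get).     %% fetch the log kept by handle_debug/4
%%
%% Starting the node with 'erl -generic_debug ...' makes dbg_options/2 enable
%% [log, statistics] for every instance even when no debug option was given.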
1370 | 1371 | 1372 | %%----------------------------------------------------------------- 1373 | %% Leader-election functions 1374 | %%----------------------------------------------------------------- 1375 | 1376 | %% Corresponds to startStage1 in Figure 1 in the Stoller-article 1377 | startStage1(E, Server) -> 1378 | NodePos = pos(node(), E#election.candidate_nodes), 1379 | Elid = {NodePos, E#election.incarn, E#election.nextel}, 1380 | NewE = E#election{ 1381 | elid = Elid, 1382 | nextel = E#election.nextel + 1, 1383 | down = [], 1384 | status = elec1}, 1385 | case NodePos of 1386 | 1 -> 1387 | startStage2(NewE, Server); 1388 | _ -> 1389 | mon_nodes(NewE, lesser(node(), E#election.candidate_nodes), Server) 1390 | end. 1391 | 1392 | %% Corresponds to startStage2 1393 | startStage2(E, Server) -> 1394 | continStage2(E#election{status = elec2, pendack = node(), acks = []}, 1395 | Server). 1396 | 1397 | continStage2(E, Server) -> 1398 | case (pos(E#election.pendack, E#election.candidate_nodes) 1399 | < length(E#election.candidate_nodes)) of 1400 | true -> 1401 | Pendack = next(E#election.pendack, E#election.candidate_nodes), 1402 | NewE = mon_nodes(E, [Pendack], Server), 1403 | halt_pendack(NewE#election{pendack = Pendack}); 1404 | false -> 1405 | %% I am the leader 1406 | E#election{leader = self(), 1407 | leadernode = node(), 1408 | previous_leader = E#election.leader, 1409 | status = norm} 1410 | end. 1411 | 1412 | halt_pendack(#election{pendack = undefined} = E) -> 1413 | E; 1414 | halt_pendack(#election{name = Name, elid = ElId, pendack = Pendack} = E) -> 1415 | erlang:send({Name, Pendack}, {halt, ElId, self()}, [nosuspend, noconnect]), 1416 | E. 1417 | 1418 | %% corresponds to Halting 1419 | halting(E, T, From, Server) -> 1420 | NewE = mon_node(E, From, Server), 1421 | NewE#election{elid = T, 1422 | status = wait, 1423 | leadernode = node(From), 1424 | down = E#election.down -- [node(From)] 1425 | }. 1426 | 1427 | 1428 | joinCluster(E, Server) -> 1429 | Pid = {E#election.name, E#election.seed_node}, 1430 | Pid ! {join, self()}, 1431 | NewE = mon_node(E, Pid, Server), 1432 | NewE#election{status = joining}. 1433 | 1434 | 1435 | %%% checks if the proc has become the leader, if so switch to loop 1436 | hasBecomeLeader(E, Server, Msg) -> 1437 | case ((E#election.status == norm) and (E#election.leader == self())) of 1438 | true -> 1439 | {ok, Synch, NewState} = 1440 | (Server#server.mod):elected(Server#server.state, E, undefined), 1441 | lists:foreach( 1442 | fun(Node) -> 1443 | {E#election.name, Node} ! 1444 | {ldr, Synch, E#election.elid, workers(E), candidates(E), self()} 1445 | end, E#election.acks), 1446 | 1447 | %% Make sure we will try to contact all workers! 1448 | NewE = E#election{work_down = E#election.worker_nodes}, 1449 | 1450 | %% io:format("==> I am the leader! (acks: ~200p)\n", [E#election.acks]), 1451 | %% Set the internal timeout (corresponds to Periodically) 1452 | timer:send_after(E#election.cand_timer_int, {heartbeat, node()}), 1453 | {E#election.name, node()} ! 
{send_checklead}, 1454 | 1455 | %% trigger handle_DOWN callback if previous leader is down 1456 | PrevLeader = E#election.previous_leader, 1457 | {NewState2, NewE2} = 1458 | case PrevLeader of 1459 | none -> {NewState, NewE}; 1460 | Pid when is_pid(Pid) -> 1461 | case lists:member(node(PrevLeader), down(E)) of 1462 | false -> {NewState, NewE}; 1463 | true -> 1464 | case (Server#server.mod):handle_DOWN(node(PrevLeader), NewState, NewE) of 1465 | {ok, NS} -> {NS, NewE}; 1466 | {ok, Synch2, NS} -> 1467 | {NS, broadcast({from_leader, Synch2}, NewE)} 1468 | end 1469 | end 1470 | end, 1471 | 1472 | %% (It's meaningful only when I am the leader!) 1473 | loop(Server#server{state = NewState2}, elected, NewE2, Msg); 1474 | false -> 1475 | safe_loop(Server, candidate, E, Msg) 1476 | end. 1477 | 1478 | 1479 | %%% 1480 | %%% No one checks incarnation type, we just check equality 1481 | %%% So it is OK to just use timestamp here 1482 | %%% 1483 | incarnation(_VarDir, _RegName, _Node) -> 1484 | os:timestamp(). 1485 | 1486 | 1487 | broadcast(Msg, #election{monitored = Monitored} = E) -> 1488 | %% This function is used for broadcasts, 1489 | %% and we make sure only to broadcast to already known nodes. 1490 | ToNodes = [N || {_, N} <- Monitored], 1491 | broadcast(Msg, ToNodes, E). 1492 | 1493 | broadcast({from_leader, Msg}, ToNodes, E) -> 1494 | lists:foreach( 1495 | fun(Node) -> 1496 | {E#election.name, Node} ! {from_leader, Msg} 1497 | end, ToNodes), 1498 | E. 1499 | 1500 | 1501 | lesser(_, []) -> 1502 | []; 1503 | lesser(N, [N|_]) -> 1504 | []; 1505 | lesser(N, [M|Ms]) -> 1506 | [M|lesser(N, Ms)]. 1507 | 1508 | next(_, []) -> 1509 | no_val; 1510 | next(N, [N|Ms]) -> 1511 | lists:nth(1, Ms); 1512 | next(N, [_|Ms]) -> 1513 | next(N, Ms). 1514 | 1515 | pos(_, []) -> 1516 | 100000; 1517 | pos(N1, [N1|_]) -> 1518 | 1; 1519 | pos(N1, [_|Ns]) -> 1520 | 1+pos(N1, Ns). 1521 | 1522 | check_candidates(#election{down = Down} = E) -> 1523 | NewDown = [N || N <- Down, {ok, up} =/= net_kernel:node_info(N, state)], 1524 | E#election{down = NewDown}. 1525 | 1526 | broadcast_candidates(E, Synch, IgnoreNodes) -> 1527 | case E#election.bcast_type of 1528 | all -> 1529 | Nodes = [N || {_, N} <- E#election.monitored] -- IgnoreNodes, 1530 | broadcast({from_leader, Synch}, Nodes, E); 1531 | _ -> 1532 | ok 1533 | end. 1534 | 1535 | call_elected(Mod, State, E, From) when is_pid(From) -> 1536 | case Mod:elected(State, E, node(From)) of 1537 | {ok, Synch, NewState} -> 1538 | From ! {ldr, Synch, E#election.elid, workers(E), candidates(E), self()}, 1539 | broadcast_candidates(E, Synch, [From]), 1540 | NewState; 1541 | {reply, Synch, NewState} -> 1542 | From ! {ldr, Synch, E#election.elid, workers(E), candidates(E), self()}, 1543 | NewState 1544 | end. 1545 | 1546 | 1547 | %% Start monitor a bunch of candidate nodes 1548 | mon_nodes(E, Nodes, Server) -> 1549 | Server#server.pinger_proc ! {set_ping_nodes, Nodes}, 1550 | E1 = 1551 | case E#election.cand_timer of 1552 | undefined -> 1553 | {ok, TRef} = timer:send_interval(E#election.cand_timer_int, {candidate_timer}), 1554 | E#election{cand_timer = TRef}; 1555 | _ -> 1556 | E 1557 | end, 1558 | FromNode = node(), 1559 | lists:foldl( 1560 | fun(ToNode, El) -> 1561 | Pid = {El#election.name, ToNode}, 1562 | erlang:send(Pid, {heartbeat, FromNode}, [nosuspend, noconnect]), 1563 | mon_node(El, Pid, Server) 1564 | end, E1, Nodes -- [node()]). 
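%% The election stages above are driven entirely by the ordering of
%% candidate_nodes: pos/2 gives a node's 1-based priority, lesser/2 lists the
%% higher-priority nodes it must monitor and defer to in stage 1, and next/2
%% yields the node to be acknowledged next in stage 2 (continStage2/2).
%% A worked example on an illustrative candidate list [a@h, b@h, c@h]:
%%
%%   pos(b@h,    [a@h, b@h, c@h])  %% -> 2
%%   lesser(b@h, [a@h, b@h, c@h])  %% -> [a@h]
%%   next(b@h,   [a@h, b@h, c@h])  %% -> c@h
%%
%% so only the first listed candidate enters stage 2 immediately; every other
%% node first monitors the candidates ahead of it, which is exactly the split
%% made on NodePos in startStage1/2.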
1565 | 1566 | %% Start monitoring one Process 1567 | mon_node(E, {_RegName, NodeName} = Proc, Server) -> 1568 | do_mon_node(E, Proc, NodeName, Server); 1569 | 1570 | mon_node(E, Proc, Server) when is_pid(Proc) -> 1571 | do_mon_node(E, Proc, node(Proc), Server). 1572 | 1573 | do_mon_node(E, Proc, NodeName, Server) -> 1574 | case lists:keymember(NodeName, 2, E#election.monitored) of 1575 | true -> E; 1576 | false -> 1577 | {Ref, Node} = do_monitor(Proc, Server), 1578 | E#election{monitored = [{Ref, Node} | E#election.monitored]} 1579 | end. 1580 | 1581 | spawn_monitor_proc() -> 1582 | Parent = self(), 1583 | proc_lib:spawn_link(?MODULE, real_mon_loop, [Parent, []]). 1584 | 1585 | 1586 | do_monitor(Proc, #server{monitor_proc = P}) -> 1587 | P ! {self(), {monitor, Proc}}, 1588 | receive 1589 | {mon_reply, Reply} -> 1590 | Reply 1591 | after 10000 -> % can take quite a while to receive mon_reply if the node is down 1592 | erlang:error(timeout) 1593 | end. 1594 | 1595 | mon_loop(Parent, Refs) -> 1596 | ?MODULE:real_mon_loop(Parent, Refs). 1597 | 1598 | real_mon_loop(Parent, Refs) -> 1599 | receive 1600 | code_reloaded -> 1601 | mon_loop(Parent, Refs); 1602 | {From, Req} -> 1603 | mon_loop(Parent, mon_handle_req(Req, From, Refs)); 1604 | {'DOWN', Ref, _, _, _} -> 1605 | mon_loop(Parent, mon_handle_down(Ref, Parent, Refs)); 1606 | Msg -> 1607 | io:fwrite("mon_loop with parent: ~p refs: ~p received: ~p~n", [Parent, Refs, Msg]), 1608 | mon_loop(Parent, Refs) 1609 | end. 1610 | 1611 | mon_handle_req({monitor, P}, From, Refs) -> 1612 | Node = case P of 1613 | {_Name, N} -> N; 1614 | Pid when is_pid(Pid) -> node(Pid) 1615 | end, 1616 | case lists:keyfind(Node, 2, Refs) of 1617 | {Ref, _} -> 1618 | mon_reply(From, {Ref, Node}), 1619 | Refs; 1620 | false -> 1621 | Ref = erlang:monitor(process, P), 1622 | mon_reply(From, {Ref, Node}), 1623 | [{Ref, Node}|Refs] 1624 | end. 1625 | 1626 | mon_handle_down(Ref, Parent, Refs) -> 1627 | case lists:keytake(Ref, 1, Refs) of 1628 | {value, {_, Node}, Refs1} -> 1629 | Parent ! {ldr, 'DOWN', Node}, 1630 | Refs1; 1631 | false -> 1632 | Refs 1633 | end. 1634 | 1635 | 1636 | mon_reply(From, Reply) -> 1637 | From ! {mon_reply, Reply}. 1638 | 1639 | 1640 | spawn_pinger_proc() -> 1641 | Parent = self(), 1642 | proc_lib:spawn_link(?MODULE, init_ping_loop, [Parent, []]). 1643 | 1644 | init_ping_loop(Parent, NodesToPing) -> 1645 | ping_loop(Parent, set_ping_timer(0), NodesToPing). 1646 | 1647 | set_ping_timer(Timeout) -> 1648 | erlang:start_timer(Timeout, self(), {do_ping}). 1649 | 1650 | %% To avoid leader blocking on message send, we ping nodes here, 1651 | %% and leader sends messages to down nodes with [nosuspend, noconnect] 1652 | ping_loop(Parent, TRef, NodesToPing) -> 1653 | receive 1654 | code_reloaded -> 1655 | ?MODULE:ping_loop(Parent, TRef, NodesToPing); 1656 | {set_ping_nodes, NewNodesToPing} -> 1657 | init_ping_loop(Parent, NewNodesToPing); 1658 | {timeout, TRef, _} -> 1659 | NewTRef = set_ping_timer(1000), 1660 | [net_adm:ping(Node) || Node <- NodesToPing], 1661 | ?MODULE:ping_loop(Parent, NewTRef, NodesToPing); 1662 | {timeout, _, _} -> 1663 | ?MODULE:ping_loop(Parent, TRef, NodesToPing); 1664 | Msg -> 1665 | io:fwrite("ping_loop with parent: ~p nodes: ~p received: ~p~n", [Parent, NodesToPing, Msg]), 1666 | ?MODULE:ping_loop(Parent, TRef, NodesToPing) 1667 | end. 
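%% Both helper processes above exist to keep the gen_leader loop responsive:
%% the monitor proc owns all erlang:monitor/2 references and the pinger proc
%% performs the potentially slow net_adm:ping/1 calls. Their message protocol,
%% sketched with illustrative placeholders (RegName, Node):
%%
%%   MonProc ! {self(), {monitor, {RegName, Node}}},
%%   receive {mon_reply, {Ref, Node}} -> ok end,      %% as in do_monitor/2
%%   %% a 'DOWN' for Ref is forwarded to gen_leader as {ldr, 'DOWN', Node}
%%
%%   PingerProc ! {set_ping_nodes, [Node]},           %% as in mon_nodes/3
%%   %% the pinger then re-pings that list every 1000 ms on its own timer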
1668 | 1669 | 1670 | 1671 | %% The heartbeat messages sent to the downed nodes when the candidate_timer 1672 | %% message is received can take a very long time in the case of a partitioned 1673 | %% network (7 seconds in my testing). Since the candidate_timer is generated 1674 | %% by a send_interval, this means many candidate_timer messages can accumulate 1675 | %% in the mailbox. This function is used to clear them out after handling one 1676 | %% of the candidate_timers, so gen_leader doesn't spend all its time sending 1677 | %% heartbeats. 1678 | flush_candidate_timers() -> 1679 | receive 1680 | {candidate_timer} -> 1681 | flush_candidate_timers() 1682 | after 1683 | 0 -> 1684 | ok 1685 | end. 1686 | 1687 | %% Sending messages to disconnected nodes can take a long time, so instead of 1688 | %% doing this in the gen_leader process, do it here in a new proc so that 1689 | %% gen_leader can remain responsive. 1690 | %% Reschedule the next round of checkleads after this round completes, 1691 | %% since sending the messages can take longer than the time between rounds. 1692 | send_checkleads(Name, Time, GlProc, Down) -> 1693 | Node = node(), 1694 | [{Name, N} ! {checklead, Node} || N <- Down], 1695 | erlang:send_after(Time, GlProc, {send_checklead}). 1696 | 1697 | 1698 | --------------------------------------------------------------------------------