├── test
│   ├── test.config
│   ├── minishard_test.erl
│   └── minishard_detest.erl
├── elvis
├── .gitignore
├── src
│   ├── minishard.app.src
│   ├── minishard.erl
│   ├── minishard_demo.erl
│   ├── minishard_sup.erl
│   ├── minishard_shard.erl
│   ├── minishard_allocator.erl
│   └── minishard_gen_leader.erl
├── Makefile
├── AUTHORS
├── LICENSE
├── README.md
└── elvis.config
/test/test.config:
--------------------------------------------------------------------------------
1 | % -*- mode: erlang -*-
2 | [
3 | ].
4 |
--------------------------------------------------------------------------------
/elvis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/minishard/HEAD/elvis
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | deps/*
2 | ebin
3 | .*.sw?
4 | .erlang.mk*
5 | *.d
6 | erl_crash.dump
7 | test/*.ebin
8 | .detest
9 | log
10 |
--------------------------------------------------------------------------------
/src/minishard.app.src:
--------------------------------------------------------------------------------
1 | {application, minishard, [
2 | {description, ""},
3 | {vsn, "0.1.0"},
4 | {id, "git"},
5 | {modules, []},
6 | {registered, []},
7 | {applications, [
8 | kernel,
9 | stdlib
10 | ]},
11 | {mod, {minishard, []}},
12 | {env, []}
13 | ]}.
14 |
--------------------------------------------------------------------------------
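The {mod, {minishard, []}} tuple makes minishard the application callback
module, so it must implement the application behaviour. A minimal sketch of
what that module has to provide (the real minishard.erl is not reproduced in
this dump; minishard_sup:start_link/0 is assumed from the supervisor module
listed in the tree):

-module(minishard).
-behaviour(application).

-export([start/0, start/2, stop/1]).

%% Convenience starter, e.g. for "erl -s minishard" as used in the Makefile.
start() ->
    {ok, _Started} = application:ensure_all_started(minishard),
    ok.

%% application callbacks
start(_StartType, _StartArgs) ->
    minishard_sup:start_link().

stop(_State) ->
    ok.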
/Makefile:
--------------------------------------------------------------------------------
1 | PROJECT = minishard
2 | COMPILE_FIRST = minishard_gen_leader
3 |
4 | NID := 1
5 | SHELL_OPTS = -sname minishard$(NID) -setcookie minishard_demo -s minishard -boot start_sasl -sasl errlog_type error
6 |
7 | BUILD_DEPS = elvis_mk
8 | DEP_PLUGINS = elvis_mk
9 | TEST_DEPS = detest
10 |
11 | dep_elvis_mk = git https://github.com/inaka/elvis.mk.git 784e41bcb91
12 |
13 | include erlang.mk
14 |
--------------------------------------------------------------------------------
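Since NID defaults to 1 and is interpolated into SHELL_OPTS, multiple demo
nodes can be started from separate terminals by overriding it on the command
line, e.g. "make shell NID=1" and "make shell NID=2" (the shell target is
provided by erlang.mk); each node then gets a distinct short name while
sharing the minishard_demo cookie.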
/AUTHORS:
--------------------------------------------------------------------------------
1 | The following authors have created the source code of "minishard"
2 | published and distributed by YANDEX LLC as the owner:
3 |
4 | Danil Zagoskin
--------------------------------------------------------------------------------
/src/minishard_gen_leader.erl:
--------------------------------------------------------------------------------
26 | %%% This application implements a leader election behavior modeled after
27 | %%% gen_server. This behavior intends to make it reasonably
28 | %%% straightforward to implement a fully distributed server with
29 | %%% master-slave semantics.
30 | %%% The gen_leader behavior supports nearly everything that gen_server
31 | %%% does (some functions, such as multicall() and the internal timeout,
32 | %%% have been removed), and adds a few callbacks and API functions to
33 | %%% support leader election etc.
34 | %%% Also included is an example program, a global dictionary, based
35 | %%% on the modules gen_leader and dict. The callback implementing the
36 | %%% global dictionary is called 'test_cb', for no particularly logical
37 | %%% reason.
38 | %%% New version: The internal leader election algorithm was faulty
39 | %%% and has been replaced with a new version based on a different leader
40 | %%% election algorithm. As a consequence of this, the query functions
41 | %%% alive and down can no longer be provided.
42 | %%% The new algorithm also makes use of an incarnation parameter, by
43 | %%% default written to disk in the function incarnation. This
44 | %%% implies that only one gen_leader per node is permitted; if
45 | %%% used in a diskless environment, incarnation must be adapted.
46 | %%%
48 | %%% Modifications contributed by Serge Aleynikov:
49 | %%%
50 | %%%
58 | %%%
60 | %%% Modifications done by Danil Zagoskin:
61 | %%%
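A callback module for this behavior mirrors gen_server's callbacks plus the
election-related ones used throughout this file. A minimal no-op sketch (the
module name is illustrative; the return shapes are the ones this module
pattern-matches on below):

-module(example_cb).

-export([init/1, elected/3, surrendered/3, handle_DOWN/3,
         handle_leader_call/4, handle_leader_cast/3, from_leader/3,
         handle_call/4, handle_cast/3, handle_info/3,
         terminate/2, code_change/4]).

init(Arg) -> {ok, Arg}.                                 %% -> {ok, State}
elected(State, _Election, _Node) -> {ok, State, State}. %% -> {ok, Synch, State}
surrendered(State, _Synch, _Election) -> {ok, State}.
handle_DOWN(_Node, State, _Election) -> {ok, State}.
handle_leader_call(_Req, _From, State, _E) -> {reply, ok, State}.
handle_leader_cast(_Msg, State, _E) -> {noreply, State}.
from_leader(_Synch, State, _E) -> {ok, State}.
handle_call(_Req, _From, State, _E) -> {reply, ok, State}.
handle_cast(_Msg, State, _E) -> {noreply, State}.
handle_info(_Msg, State, _E) -> {noreply, State}.
terminate(_Reason, _State) -> ok.
code_change(_OldVsn, State, E, _Extra) -> {ok, State, E}.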
265 | %% @doc Starts a gen_leader process.
266 | %%
267 | %% | Name           | The locally registered name of the process |
268 | %% | CandidateNodes | The names of nodes capable of assuming
269 | %% |                | a leadership role |
270 | %% | OptArgs        | Optional arguments given to `gen_leader'. |
290 | %% | Mod            | The name of the callback module |
291 | %% | Arg            | Argument passed on to Mod:init/1 |
292 | %% | Options        | Same as gen_server's Options |
293 | %%
296 | %% The list of candidates needs to be known from the start. Workers
297 | %% could potentially be added at runtime, but no functionality to do
298 | %% this is provided by this version.
299 | %% @end
300 | -spec start_link(Name::atom(), CandidateNodes::[node()], OptArgs::options(),
301 | Mod::module(), Arg::term(), Options::list()) -> start_ret().
302 | start_link(Name, CandidateNodes, OptArgs, Mod, Arg, Options)
303 | when is_atom(Name), is_list(CandidateNodes), is_list(OptArgs) ->
304 | gen:start(?MODULE, link, {local, Name},
305 | Mod, {CandidateNodes, OptArgs, Arg}, Options).
306 |
307 | %% Query functions to be used from the callback module
308 |
309 | %% @doc Returns list of alive nodes.
310 | -spec alive(election()) -> [node()].
311 | alive(E) ->
312 | candidates(E) -- down(E).
313 |
314 | %% @doc Returns list of down nodes.
315 | -spec down(election()) -> [node()].
316 | down(#election{down = Down}) ->
317 | Down.
318 |
319 | %% @doc Returns the current leader node.
320 | -spec leader_node(election()) -> node() | 'none'.
321 | leader_node(#election{leadernode=Leader}) ->
322 | Leader.
323 |
324 | %% @doc Returns a list of known candidates.
325 | -spec candidates(election()) -> [node()].
326 | candidates(#election{candidate_nodes = Cands}) ->
327 | Cands.
328 |
329 | %% @doc Returns a list of known workers.
330 | -spec workers(election()) -> [node()].
331 | workers(#election{worker_nodes = Workers}) ->
332 | Workers.
333 |
334 | %% Used by dynamically added workers.
335 | %% @hidden
336 | worker_announce(Name, Pid) ->
337 | Name ! {add_worker, Pid},
338 | Name ! {heartbeat, Pid}.
339 |
340 | %%
341 | %% Make a call to a generic server.
342 | %% If the server is located at another node, that node will
343 | %% be monitored.
344 | %% If the client is trapping exits and is linked, server termination
345 | %% is handled here (? Shall we do that here (or rely on timeouts) ?).
346 | %%
347 | %% @doc Equivalent to gen_server:call/2, but with a slightly
348 | %% different exit reason if something goes wrong. This function calls
349 | %% the gen_leader process exactly as if it were a gen_server
350 | %% (which, for practical purposes, it is.)
351 | %% @end
352 | -spec call(server_ref(), term()) -> term().
353 | call(Name, Request) ->
354 | case catch gen:call(Name, '$gen_call', Request) of
355 | {ok, Res} ->
356 | Res;
357 | {'EXIT', Reason} ->
358 | exit({Reason, {?MODULE, local_call, [Name, Request]}})
359 | end.
360 |
361 | %% @doc Equivalent to gen_server:call/3, but with a slightly
362 | %% different exit reason if something goes wrong. This function calls
363 | %% the gen_leader process exactly as if it were a gen_server
364 | %% (which, for practical purposes, it is.)
365 | %% @end
366 | -spec call(server_ref(), term(), integer()) -> term().
367 | call(Name, Request, Timeout) ->
368 | case catch gen:call(Name, '$gen_call', Request, Timeout) of
369 | {ok, Res} ->
370 | Res;
371 | {'EXIT', Reason} ->
372 | exit({Reason, {?MODULE, local_call, [Name, Request, Timeout]}})
373 | end.
374 |
375 | %% @doc Makes a call (similar to gen_server:call/2) to the
376 | %% leader. The call is forwarded via the local gen_leader instance, if
377 | %% that one isn't actually the leader. The client will exit if the
378 | %% leader dies while the request is outstanding.
379 | %% This function uses gen:call/3, and is subject to the
380 | %% same default timeout as e.g. gen_server:call/2.
394 | %% @doc Makes a call (similar to gen_server:call/3) to the
395 | %% leader. The call is forwarded via the local gen_leader instance, if
396 | %% that one isn't actually the leader. The client will exit if the
397 | %% leader dies while the request is outstanding.
398 | %% @end
399 | %%
400 | -spec leader_call(Name::server_ref(), Request::term(),
401 | Timeout::integer()) -> term().
402 | leader_call(Name, Request, Timeout) ->
403 | case catch gen:call(Name, '$leader_call', Request, Timeout) of
404 | {ok, {leader, reply, Res}} ->
405 | Res;
406 | {'EXIT', Reason} ->
407 | exit({Reason, {?MODULE, leader_call, [Name, Request, Timeout]}})
408 | end.
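Putting the API together, a client session might look like the following
sketch (node names, the my_leader registration, the example_cb skeleton shown
earlier and its {store, ...}/{fetch, ...} requests are all illustrative):

start_and_ask() ->
    Candidates = ['n1@localhost', 'n2@localhost'],
    {ok, _Pid} = start_link(my_leader, Candidates, [{heartbeat, 5}],
                            example_cb, [], []),
    %% casts and calls are forwarded to whichever node currently leads
    ok = leader_cast(my_leader, {store, key, 42}),
    leader_call(my_leader, {fetch, key}, 5000).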
409 |
410 |
411 | %% @equiv gen_server:cast/2
412 | -spec cast(Name::server_ref(), Request::term()) -> 'ok'.
413 | cast(Name, Request) ->
414 | catch do_cast('$gen_cast', Name, Request),
415 | ok.
416 |
417 | %% @doc Similar to gen_server:cast/2 but will be forwarded to
418 | %% the leader via the local gen_leader instance.
419 | -spec leader_cast(Name::server_ref(), Request::term()) -> 'ok'.
420 | leader_cast(Name, Request) ->
421 | catch do_cast('$leader_cast', Name, Request),
422 | ok.
423 |
424 |
425 | do_cast(Tag, ServerRef, Request) ->
426 | ServerRef ! {Tag, Request}.
427 |
428 |
429 | %% @equiv gen_server:reply/2
430 | -spec reply(From::caller_ref(), Reply::term()) -> term().
431 | reply({To, Tag}, Reply) ->
432 | catch To ! {Tag, Reply}.
433 |
434 |
435 | %%% ---------------------------------------------------
436 | %%% Initiate the new process.
437 | %%% Register the name using the Rfunc function
438 | %%% Calls the Mod:init/Args function.
439 | %%% Finally an acknowledge is sent to Parent and the main
440 | %%% loop is entered.
441 | %%% ---------------------------------------------------
442 | %%% @hidden
443 | init_it(Starter, Parent, {local, Name}, Mod, {CandidateNodes, Workers, Arg}, Options) ->
444 | %% R13B passes {local, Name} instead of just Name
445 | init_it(Starter, Parent, Name, Mod,
446 | {CandidateNodes, Workers, Arg}, Options);
447 | init_it(Starter, self, Name, Mod, {CandidateNodes, OptArgs, Arg}, Options) ->
448 | init_it(Starter, self(), Name, Mod,
449 | {CandidateNodes, OptArgs, Arg}, Options);
450 | init_it(Starter, Parent, Name, Mod, {UnsortedCandidateNodes, OptArgs, Arg}, Options) ->
451 | Workers = proplists:get_value(workers, OptArgs, []),
452 | VarDir = proplists:get_value(vardir, OptArgs, "."),
453 | Interval = proplists:get_value(heartbeat, OptArgs, ?TAU div 1000) * 1000,
454 | BcastType = proplists:get_value(bcast_type, OptArgs, sender),
455 | Seed = proplists:get_value(seed, OptArgs, none),
456 | Debug = debug_options(Name, Options),
457 | CandidateNodes = lists:sort(UnsortedCandidateNodes),
458 | [spawn_link(net_adm, ping, [Node]) || Node <- CandidateNodes], timer:sleep(1000),
459 | AmCandidate = case lists:member(node(), CandidateNodes) of
460 | true -> true;
461 | false ->
462 | case lists:member(node(), Workers) of
463 | true -> false;
464 | false ->
465 | Seed =/= none
466 | end
467 | end,
468 |
469 | Election = #election{
470 | candidate_nodes = CandidateNodes,
471 | worker_nodes = Workers,
472 | name = Name,
473 | nextel = 0,
474 | cand_timer_int = Interval,
475 | bcast_type = BcastType
476 | },
477 |
478 | case {AmCandidate, lists:member(node(), Workers)} of
479 | {false, false} ->
480 | %% I am neither a candidate nor a worker - don't start this process
481 | error_logger:warning_msg("~w not started - node is not a candidate/worker\n", [Name]),
482 | proc_lib:init_ack(Starter, ignore),
483 | exit(normal);
484 | _ ->
485 | ok
486 | end,
487 |
488 | case {catch Mod:init(Arg), AmCandidate, Seed =/= none} of
489 | {{stop, Reason}, _, _} ->
490 | proc_lib:init_ack(Starter, {error, Reason}),
491 | exit(Reason);
492 | {ignore, _, _} ->
493 | proc_lib:init_ack(Starter, ignore),
494 | exit(normal);
495 | {{'EXIT', Reason}, _, _} ->
496 | proc_lib:init_ack(Starter, {error, Reason}),
497 | exit(Reason);
498 | {{ok, State}, true, false} ->
499 | Server = #server{parent = Parent, mod = Mod,
500 | state = State, debug = Debug},
501 | Incarn = incarnation(VarDir, Name, node()),
502 | NewE = startStage1(Election#election{incarn = Incarn}, Server),
503 | proc_lib:init_ack(Starter, {ok, self()}),
504 |
505 | %% handle the case where there's only one candidate worker and we can't
506 | %% rely on DOWN messages to trigger the elected() call because we never get
507 | %% a DOWN for ourselves
508 | case CandidateNodes =:= [node()] of
509 | true ->
510 | %% there's only one candidate leader; us
511 | hasBecomeLeader(NewE, Server, {init});
512 | false ->
513 | %% more than one candidate worker, continue as normal
514 | safe_loop(Server, candidate, NewE, {init})
515 | end;
516 | {{ok, State}, true, true} ->
517 | Server = #server{parent = Parent, mod = Mod,
518 | state = State, debug = Debug},
519 | Incarn = incarnation(VarDir, Name, node()),
520 | NewE1 = Election#election{incarn = Incarn, seed_node = Seed},
521 | NewE = joinCluster(NewE1, Server),
522 | proc_lib:init_ack(Starter, {ok, self()}),
523 | safe_loop(Server, candidate_joining, NewE, {init});
524 | {{ok, State}, false, HasSeed} ->
525 | proc_lib:init_ack(Starter, {ok, self()}),
526 | Candidates = case HasSeed of
527 | true ->
528 | {ok, C} = call({Name, Seed}, get_candidates),
529 | C;
530 | false -> CandidateNodes
531 | end,
532 | case lists:member(node(), Workers) of
533 | true ->
534 | rpc:multicall(Candidates, ?MODULE,
535 | worker_announce, [Name, node(self())]);
536 | false -> nop
537 | end,
538 | safe_loop(#server{parent = Parent, mod = Mod,
539 | state = State, debug = Debug},
540 | waiting_worker, Election, {init});
541 | {Else, _, _} ->
542 | Error = {bad_return_value, Else},
543 | proc_lib:init_ack(Starter, {error, Error}),
544 | exit(Error)
545 | end.
546 |
547 |
548 | %%% ---------------------------------------------------
549 | %%% The MAIN loops.
550 | %%% ---------------------------------------------------
551 |
552 | % this is the election loop. Only specific messages related
553 | % to the election process are received. User messages, defined
554 | % in e.g. a callback module, are postponed until the (re)election
555 | % is complete. The ?MODULE:real_safe_loop indirection below makes each iteration a fully qualified call, so newly loaded code takes effect.
556 | safe_loop(#server{} = Server, Role, #election{} = Election, PrevMsg) ->
557 | ?MODULE:real_safe_loop(Server, Role, Election, PrevMsg).
558 |
559 | real_safe_loop(#server{mod = Mod, state = State} = Server, Role,
560 | #election{name = Name} = E, _PrevMsg) ->
561 | receive
562 | code_reloaded = Msg ->
563 | safe_loop(Server, Role, E, Msg);
564 | {system, From, Req} ->
565 | #server{parent = Parent, debug = Debug} = Server,
566 | sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug,
567 | [safe, Server, Role, E]);
568 | {'EXIT', _, Reason} = Msg ->
569 | terminate(Reason, Msg, Server, Role, E);
570 | {update_candidates, _, _, _} = Msg ->
571 | safe_loop(Server, Role, E, Msg);
572 | {halt, T, From} = Msg ->
573 | NewE = halting(E, T, From, Server),
574 | From ! {ackLeader, T, self()},
575 | safe_loop(Server, Role, NewE, Msg);
576 | {hasLeader, Ldr, T, _} = Msg when Role == candidate_joining ->
577 | NewE1 = mon_node(E, Ldr, Server),
578 | NewE = NewE1#election{elid = T, leadernode = node(Ldr)},
579 | Ldr ! {isLeader, T, self()},
580 | safe_loop(Server, Role, NewE, Msg);
581 | {hasLeader, Ldr, T, _} = Msg ->
582 | NewE1 = mon_node(E, Ldr, Server),
583 | case ( (E#election.status == elec2) and (E#election.acks /= []) ) of
584 | true ->
585 | lists:foreach(
586 | fun(Node) ->
587 | {Name, Node} ! {hasLeader, Ldr, T, self()}
588 | end, E#election.acks);
589 | false ->
590 | ok
591 | end,
592 | NewE = NewE1#election{elid = T,
593 | status = wait,
594 | leadernode = node(Ldr),
595 | down = E#election.down -- [node(Ldr)],
596 | acks = []},
597 | Ldr ! {isLeader, T, self()},
598 | safe_loop(Server, Role, NewE, Msg);
599 | {isLeader, T, From} = Msg ->
600 | From ! {notLeader, T, self()},
601 | safe_loop(Server, Role, E, Msg);
602 | {notLeader, T, _} = Msg when Role == candidate_joining ->
603 | NewE = case E#election.elid == T of
604 | true ->
605 | joinCluster(E, Server);
606 | false ->
607 | E
608 | end,
609 | safe_loop(Server, Role, NewE, Msg);
610 | {notLeader, T, _} = Msg ->
611 | NewE =
612 | case ((E#election.status == wait) and (E#election.elid == T)) of
613 | true ->
614 | startStage1(E, Server);
615 | false ->
616 | E
617 | end,
618 | safe_loop(Server, Role, NewE, Msg);
619 | {ackLeader, T, From} = Msg ->
620 | NewE =
621 | case ( (E#election.status == elec2) and (E#election.elid == T)
622 | and (E#election.pendack == node(From)) ) of
623 | true ->
624 | continStage2(
625 | E#election{acks = [node(From)|E#election.acks]},
626 | Server);
627 | false ->
628 | E
629 | end,
630 | hasBecomeLeader(NewE, Server, Msg);
631 |
632 | {ldr, Synch, T, _, _, From} = Msg when Role == waiting_worker ->
633 | case ( (T == E#election.elid)
634 | and (node(From) == E#election.leadernode)) of
635 | true ->
636 | NewE = E#election{ leader = From, status = worker },
637 | {ok, NewState} = Mod:surrendered(State, Synch, NewE),
638 | loop(Server#server{state = NewState}, worker, NewE, Msg);
639 | false ->
640 | %% This should be a VERY special case...
641 | %% But doing nothing is the right thing!
642 | %% A DOWN message should arrive to solve this situation
643 | safe_loop(Server, Role, E, Msg)
644 | end;
645 | {ldr, Synch, T, Workers, Candidates, From} = Msg ->
646 | case ( ( (E#election.status == wait) or (E#election.status == joining) )
647 | and (E#election.elid == T) ) of
648 | true ->
649 | timer:cancel(E#election.cand_timer),
650 | NewE1 = mon_node(E, From, Server),
651 | NewE2 = NewE1#election{leader = From,
652 | leadernode = node(From),
653 | previous_leader = E#election.leader,
654 | worker_nodes = Workers,
655 | candidate_nodes = Candidates,
656 | status = norm,
657 | cand_timer=undefined},
658 | NewE = case Role == candidate_joining of
659 | true ->
660 | mon_nodes(NewE2, lesser(node(), candidates(NewE2)), Server);
661 | false -> NewE2
662 | end,
663 | {ok, NewState} = Mod:surrendered(State, Synch, NewE),
664 | loop(Server#server{state = NewState}, surrendered, NewE, Msg);
665 | false ->
666 | safe_loop(Server, Role, E, Msg)
667 | end;
668 | {normQ, T, From} = Msg ->
669 | NewE =
670 | case ( (E#election.status == elec1)
671 | or ( (E#election.status == wait)
672 | and (E#election.elid == T) ) ) of
673 | true ->
674 | NE = halting(E, T, From, Server),
675 | From ! {notNorm, T, self()},
676 | NE;
677 | false ->
678 | E
679 | end,
680 | safe_loop(Server, Role, NewE, Msg);
681 | {notNorm, _, _} = Msg ->
682 | safe_loop(Server, Role, E, Msg);
683 | {workerAlive, T, From} = Msg ->
684 | NewE =
685 | case E#election.leadernode == none of
686 | true ->
687 | %% We should initiate activation,
688 | %% monitor the possible leader!
689 | NE = mon_node(E#election{leadernode = node(From),
690 | elid = T},
691 | From, Server),
692 | From ! {workerIsAlive, T, self()},
693 | NE;
694 | false ->
695 | %% We should actually ignore this; the present activation
696 | %% will complete or abort first...
697 | E
698 | end,
699 | safe_loop(Server, Role, NewE, Msg);
700 | {workerIsAlive, _, _} = Msg ->
701 | %% If this happens, the activation process should abort
702 | %% This process is no longer the leader!
703 | %% The sender will notice this via a DOWN message
704 | safe_loop(Server, Role, E, Msg);
705 | {election} = Msg ->
706 | %% We're already in an election, so this is likely an old message.
707 | safe_loop(Server, Role, E, Msg);
708 | {heartbeat, _Node} = Msg ->
709 | safe_loop(Server, Role, E, Msg);
710 | {candidate_timer} = Msg ->
711 | Down = E#election.down,
712 | Server#server.pinger_proc ! {set_ping_nodes, Down},
713 | NewE =
714 | case Down of
715 | [] ->
716 | timer:cancel(E#election.cand_timer),
717 | E#election{cand_timer = undefined};
718 | Down ->
719 | %% get rid of any queued up candidate_timers, since we just handled one
720 | flush_candidate_timers(),
721 | %% Some of the potential master candidate nodes are down.
722 | %% Try to wake them up
723 | F = fun(N) ->
724 | erlang:send({E#election.name, N}, {heartbeat, node()}, [nosuspend, noconnect])
725 | end,
726 | [F(N) || N <- Down, {ok, up} =/= net_kernel:node_info(N, state)],
727 | E
728 | end,
729 | safe_loop(Server, Role, halt_pendack(NewE), Msg);
730 | {checklead, Node} = Msg ->
731 | %% in the very exceptional case when a candidate comes up when the
732 | %% elected leader is *behind* it in the candidate list *and* all nodes
733 | %% before it in the candidate list are up, the candidate will be stuck in
734 | %% the safe_loop forever. This is because gen_leader relies on either
735 | %% one of the nodes being down, or the nodes responding to the heartbeat
736 | %% sent as part of stage1. However, nodes that are up but are NOT the
737 | %% leader do not respond to heartbeats. In this very exceptional case,
738 | %% we send a heartbeat to the leader in response to the checklead it
739 | %% sent us to bootstrap things and get out of this quagmire.
740 | case lists:member(Node, E#election.candidate_nodes) and
741 | (E#election.status == elec1) of
742 | true ->
743 | case ( pos(Node, E#election.candidate_nodes) >
744 | pos(node(), E#election.candidate_nodes) ) of
745 | true ->
746 | {Name, Node} ! {heartbeat, self()};
747 | _ ->
748 | ok
749 | end;
750 | _ ->
751 | ok
752 | end,
753 | safe_loop(Server, Role, E, Msg);
754 | {ldr, 'DOWN', Node} = Msg when Role == waiting_worker ->
755 | NewE =
756 | case Node == E#election.leadernode of
757 | true ->
758 | E#election{leader = none, leadernode = none,
759 | previous_leader = E#election.leader,
760 | status = waiting_worker,
761 | monitored = []};
762 | false ->
763 | E
764 | end,
765 | safe_loop(Server, Role, NewE, Msg);
766 | {ldr, 'DOWN', Node} = Msg when Role == candidate_joining ->
767 | Ldr = E#election.leadernode,
768 | Seed = E#election.seed_node,
769 | case Node of
770 | Seed ->
771 | case net_adm:ping(Ldr) of
772 | pong -> noop;
773 | pang ->
774 | terminate(seed_nodes_down, Msg, Server, Role, E)
775 | end;
776 | Ldr ->
777 | case net_adm:ping(Seed) of
778 | pong ->
779 | NewE = joinCluster(E, Server),
780 | safe_loop(Server, Role, NewE, Msg);
781 | pang ->
782 | terminate(seed_nodes_down, Msg, Server, Role, E)
783 | end
784 | end;
785 | {ldr, 'DOWN', Node} = Msg ->
786 | NewMon = lists:keydelete(Node, 2, E#election.monitored),
787 | NewE =
788 | case lists:member(Node, E#election.candidate_nodes) of
789 | true ->
790 | NewDown = [Node | E#election.down],
791 | E1 = E#election{down = NewDown, monitored = NewMon},
792 | case ( pos(Node, E#election.candidate_nodes) <
793 | pos(node(), E#election.candidate_nodes) ) of
794 | true ->
795 | Lesser = lesser(node(), E#election.candidate_nodes),
796 | LesserIsSubset = (Lesser -- NewDown) == [],
797 | case ((E#election.status == wait)
798 | and (Node == E#election.leadernode)) of
799 | true ->
800 | startStage1(E1, Server);
801 | false ->
802 | case ((E#election.status == elec1) and
803 | LesserIsSubset) of
804 | true ->
805 | startStage2(
806 | E1#election{down = Lesser},
807 | Server);
808 | false ->
809 | E1
810 | end
811 | end;
812 | false ->
813 | case ( (E#election.status == elec2)
814 | and (Node == E#election.pendack) ) of
815 | true ->
816 | continStage2(E1, Server);
817 | false ->
818 | case ( (E#election.status == wait)
819 | and (Node == E#election.leadernode)) of
820 | true ->
821 | startStage1(E1, Server);
822 | false ->
823 | E1
824 | end
825 | end
826 | end
827 | end,
828 | hasBecomeLeader(NewE, Server, Msg)
829 | end.
830 |
831 |
832 | % this is the regular operation loop. All messages are received;
833 | % unexpected ones are discarded.
834 | loop(#server{} = Server, Role, #election{} = Election, PrevMsg) ->
835 | ?MODULE:real_loop(Server, Role, Election, PrevMsg).
836 |
837 | real_loop(#server{parent = Parent,
838 | mod = Mod,
839 | state = State,
840 | debug = Debug} = Server, Role,
841 | #election{name = Name} = E, _PrevMsg) ->
842 | receive
843 | Msg ->
844 | case Msg of
845 | code_reloaded ->
846 | loop(Server, Role, E, Msg);
847 | {system, From, Req} ->
848 | sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug,
849 | [normal, Server, Role, E]);
850 | {'EXIT', Parent, Reason} ->
851 | terminate(Reason, Msg, Server, Role, E);
852 |
853 | {join, From} ->
854 | From ! {hasLeader, E#election.leader, E#election.elid, self()},
855 | loop(Server, Role, E, Msg);
856 | {update_candidates, T, Candidates, _From} ->
857 | case E#election.elid == T of
858 | true ->
859 | NewE = E#election{candidate_nodes = Candidates},
860 | loop(Server, Role, NewE, Msg);
861 | false ->
862 | loop(Server, Role, E, Msg)
863 | end;
864 | {halt, _, From} ->
865 | T = E#election.elid,
866 | case E#election.leader of
867 | From ->
868 | % The process we consider to be the leader appears to be back in the
869 | % elec1 stage, so we downgrade to it too
870 | NewE = startStage1(E, Server),
871 | safe_loop(Server, candidate, NewE, Msg);
872 | OtherLeader ->
873 | From ! {hasLeader, OtherLeader, T, self()},
874 | loop(Server, Role, E, Msg)
875 | end;
876 | {hasLeader, _, _, _} ->
877 | loop(Server, Role, E, Msg);
878 | {isLeader, T, From} ->
879 | case (self() == E#election.leader) of
880 | true ->
881 | NewCandidates =
882 | case lists:member(node(From), candidates(E)) of
883 | true -> candidates(E);
884 | false ->
885 | NC = candidates(E) ++ [node(From)],
886 | lists:foreach(
887 | fun(Node) ->
888 | {Name, Node} !
889 | {update_candidates, E#election.elid,
890 | NC, self()}
891 | end, candidates(E) -- lists:flatten([node()], down(E))),
892 | NC
893 | end,
894 | NewDown = E#election.down -- [node(From)],
895 | NewE1 = mon_node(E#election{down = NewDown},
896 | From, Server),
897 | NewE = NewE1#election{candidate_nodes = NewCandidates},
898 | NewState = call_elected(Mod, State, NewE, From),
899 | loop(Server#server{state = NewState}, Role, NewE, Msg);
900 | false ->
901 | From ! {notLeader, T, self()},
902 | loop(Server, Role, E, Msg)
903 | end;
904 | {ackLeader, _, _} ->
905 | loop(Server, Role, E, Msg);
906 | {notLeader, _, _} ->
907 | loop(Server, Role, E, Msg);
908 | {ack, _, _} ->
909 | loop(Server, Role, E, Msg);
910 | {ldr, _, _, _, _} ->
911 | loop(Server, Role, E, Msg);
912 | {normQ, _, _} ->
913 | loop(Server, Role, E, Msg);
914 | {notNorm, T, From} ->
915 | case ( (E#election.leader == self())
916 | and (E#election.elid == T) ) of
917 | true ->
918 | NewDown = E#election.down -- [node(From)],
919 | NewE = mon_node(E#election{down = NewDown},
920 | From, Server),
921 | NewState = call_elected(Mod, State, NewE, From),
922 | loop(Server#server{state = NewState}, Role, NewE, Msg);
923 | false ->
924 | loop(Server, Role, E, Msg)
925 | end;
926 | {workerAlive, _, _} ->
927 | %% Do nothing if we get this from a new leader
928 | %% We will soon notice that the prev leader has died, and
929 | %% get the same message again when we are back in safe_loop!
930 | loop(Server, Role, E, Msg);
931 | {activateWorker, _, _, _} ->
932 | %% We ignore this, we are already active...
933 | %% It must be an old message!
934 | loop(Server, Role, E, Msg);
935 | {workerIsAlive, T, From} ->
936 | case ((T == E#election.elid) and (self() == E#election.leader)) of
937 | true ->
938 | NewDown = E#election.work_down -- [node(From)],
939 | NewE = mon_node(E#election{work_down = NewDown},
940 | From, Server),
941 | NewState = call_elected(Mod, State, NewE, From),
942 | loop(Server#server{state = NewState}, Role, NewE, Msg);
943 | false ->
944 | loop(Server, Role, E, Msg)
945 | end;
946 | {election} ->
947 | %% Told to do an election because of a leader conflict.
948 | E1 = startStage1(E, Server),
949 | safe_loop(Server, candidate, E1, Msg);
950 | {checklead, Node} ->
951 | case (E#election.leadernode == Node) of
952 | true ->
953 | %% Leaders match, nothing to do
954 | loop(Server, Role, E, Msg);
955 | false when E#election.leader == self() ->
956 | %% We're a leader and we disagree with the other
957 | %% leader. Tell everyone else to have an election.
958 | lists:foreach(
959 | fun(N) ->
960 | {Name, N} ! {election}
961 | end, E#election.candidate_nodes),
962 | %% Start participating in the election ourselves.
963 | E1 = startStage1(E, Server),
964 | safe_loop(Server, candidate, E1, Msg);
965 | false ->
966 | %% Not a leader, just wait to be told to do an
967 | %% election, if applicable.
968 | loop(Server, Role, E, Msg)
969 | end;
970 | {send_checklead} ->
971 | case (E#election.leader == self()) of
972 | true ->
973 | case E#election.down of
974 | [] ->
975 | loop(Server, Role, E, Msg);
976 | Down ->
977 | %% For any nodes which are down, send them
978 | %% a message comparing their leader to our
979 | %% own. This allows us to trigger an
980 | %% election after a netsplit is healed.
981 | spawn(?MODULE, send_checkleads, [Name, E#election.cand_timer_int, self(), Down]),
982 | loop(Server, Role, E, Msg)
983 | end;
984 | false ->
985 | loop(Server, Role, E, Msg)
986 | end;
987 | {heartbeat, _Node} ->
988 | case (E#election.leader == self()) of
989 | true ->
990 | Candidates = E#election.down -- [lists:nth(1, E#election.candidate_nodes)],
991 | lists:foreach(
992 | fun(N) ->
993 | Elid = E#election.elid,
994 | erlang:send({Name, N}, {normQ, Elid, self()}, [nosuspend, noconnect])
995 | end, Candidates),
996 | lists:foreach(
997 | fun(N) ->
998 | Elid = E#election.elid,
999 | erlang:send({Name, N}, {workerAlive, Elid, self()}, [nosuspend, noconnect])
1000 | end, E#election.work_down);
1001 | false ->
1002 | ok
1003 | end,
1004 | loop(Server, Role, E, Msg);
1005 | {candidate_timer} = Msg
1006 | when E#election.down =:= [] orelse (Role =/= elected andalso E#election.leadernode =/= none) ->
1007 | timer:cancel(E#election.cand_timer),
1008 | loop(Server, Role, E#election{cand_timer=undefined}, Msg);
1009 | {candidate_timer} = Msg ->
1010 | %% get rid of any queued up candidate_timers,
1011 | %% since we just handled one
1012 | flush_candidate_timers(),
1013 | %% This shouldn't happen in the leader - just ignore
1014 | loop(Server, Role, E, Msg);
1015 | {ldr, 'DOWN', Node} = Msg when Role == worker ->
1016 | case Node == E#election.leadernode of
1017 | true ->
1018 | NewE = E#election{ leader = none, leadernode = none,
1019 | status = waiting_worker,
1020 | monitored = []},
1021 | safe_loop(Server, waiting_worker, NewE, Msg);
1022 | false ->
1023 | loop(Server, Role, E, Msg)
1024 | end;
1025 | {ldr, 'DOWN', Node} = Msg ->
1026 | NewMon = lists:keydelete(Node, 2, E#election.monitored),
1027 | case lists:member(Node, E#election.candidate_nodes) of
1028 | true ->
1029 | NewDown = [Node | E#election.down],
1030 | E1 = E#election{down = NewDown, monitored = NewMon},
1031 | case (Node == E#election.leadernode) of
1032 | true ->
1033 | NewE = startStage1(E1, Server),
1034 | safe_loop(Server, candidate, NewE, Msg);
1035 | false when E#election.leadernode =:= node() ->
1036 | %% Serge: call handle_DOWN
1037 | {NewState, NewE} =
1038 | case (Server#server.mod):handle_DOWN(Node, Server#server.state, E1) of
1039 | {ok, NewState1} ->
1040 | {NewState1, E1};
1041 | {ok, Synch, NewState1} ->
1042 | {NewState1, broadcast({from_leader, Synch}, E1)}
1043 | end,
1044 | %% We're the leader and one of our
1045 | %% candidates has gone down. Start sending
1046 | %% out checklead messages to the downed
1047 | %% candidates so we can quickly trigger an
1048 | %% election if this was a netsplit, once
1049 | %% it's healed.
1050 | {Name, node()} ! {send_checklead},
1051 | loop(Server#server{state=NewState}, Role, NewE, Msg);
1052 | false ->
1053 | loop(Server, Role, E1, Msg)
1054 | end;
1055 | false ->
1056 | %% I am the leader,
1057 | %% make sure the dead worker is in work_down.
1058 | E1 = E#election{
1059 | monitored = NewMon,
1060 | work_down = [Node |
1061 | (E#election.work_down -- [Node])]
1062 | },
1063 | loop(Server, Role, E1, Msg)
1064 | end;
1065 | {add_worker, WorkerNode} ->
1066 | case lists:member(WorkerNode, E#election.worker_nodes) of
1067 | false ->
1068 | {WNodes, DNodes} = {E#election.worker_nodes, E#election.work_down},
1069 |
1070 | loop(Server, Role, E#election{worker_nodes=[WorkerNode|WNodes],
1071 | work_down=[WorkerNode|DNodes]},
1072 | Msg);
1073 | true -> % Redundancy, meet the mirror
1074 | loop(Server, Role, E, Msg)
1075 | end;
1076 | _Msg when Debug == [] ->
1077 | handle_msg(Msg, Server, Role, E);
1078 | _Msg ->
1079 | Debug1 = sys:handle_debug(Debug, fun ?MODULE:print_event/3,
1080 | E#election.name, {in, Msg}),
1081 | handle_msg(Msg, Server#server{debug = Debug1}, Role, E)
1082 | end
1083 | end.
1084 |
1085 | %%-----------------------------------------------------------------
1086 | %% Callback functions for system messages handling.
1087 | %%-----------------------------------------------------------------
1088 | %% @hidden
1089 | system_continue(_Parent, _Debug, [safe, Server, Role, E]) ->
1090 | safe_loop(Server, Role, E, {});
1091 | system_continue(_Parent, _Debug, [normal, Server, Role, E]) ->
1092 | loop(Server, Role, E, {}).
1093 |
1094 | %% @hidden
1095 | -spec system_terminate(any(), any(), any(), any()) -> no_return() .
1096 | system_terminate(Reason, _Parent, _Debug, [_Mode, Server, Role, E]) ->
1097 | terminate(Reason, [], Server, Role, E).
1098 |
1099 | %% @hidden
1100 | system_code_change([Mode, Server, Role, E], _Module, OldVsn, Extra) ->
1101 | #server{mod = Mod, state = State} = Server,
1102 | case catch Mod:code_change(OldVsn, State, E, Extra) of
1103 | {ok, NewState} ->
1104 | NewServer = Server#server{state = NewState},
1105 | {ok, [Mode, NewServer, Role, E]};
1106 | {ok, NewState, NewE} ->
1107 | NewServer = Server#server{state = NewState},
1108 | {ok, [Mode, NewServer, Role, NewE]};
1109 | Else -> Else
1110 | end.
1111 |
1112 | %%-----------------------------------------------------------------
1113 | %% Format debug messages. Print them as the call-back module sees
1114 | %% them, not as the real erlang messages. Use trace for that.
1115 | %%-----------------------------------------------------------------
1116 | %% @hidden
1117 | print_event(Dev, {in, Msg}, Name) ->
1118 | case Msg of
1119 | {'$gen_call', {From, _Tag}, Call} ->
1120 | io:format(Dev, "*DBG* ~p got local call ~p from ~w~n",
1121 | [Name, Call, From]);
1122 | {'$leader_call', {From, _Tag}, Call} ->
1123 | io:format(Dev, "*DBG* ~p got global call ~p from ~w~n",
1124 | [Name, Call, From]);
1125 | {'$gen_cast', Cast} ->
1126 | io:format(Dev, "*DBG* ~p got local cast ~p~n",
1127 | [Name, Cast]);
1128 | {'$leader_cast', Cast} ->
1129 | io:format(Dev, "*DBG* ~p got global cast ~p~n",
1130 | [Name, Cast]);
1131 | _ ->
1132 | io:format(Dev, "*DBG* ~p got ~p~n", [Name, Msg])
1133 | end;
1134 | print_event(Dev, {out, Msg, To, State}, Name) ->
1135 | io:format(Dev, "*DBG* ~p sent ~p to ~w, new state ~w~n",
1136 | [Name, Msg, To, State]);
1137 | print_event(Dev, {noreply, State}, Name) ->
1138 | io:format(Dev, "*DBG* ~p new state ~w~n", [Name, State]);
1139 | print_event(Dev, Event, Name) ->
1140 | io:format(Dev, "*DBG* ~p dbg ~p~n", [Name, Event]).
1141 |
1142 |
1143 | handle_msg({'$leader_call', From, Request} = Msg,
1144 | #server{mod = Mod, state = State} = Server, elected = Role, E) ->
1145 | case catch Mod:handle_leader_call(Request, From, State, E) of
1146 | {reply, Reply, NState} ->
1147 | NewServer = reply(From, {leader, reply, Reply},
1148 | Server#server{state = NState}, Role, E),
1149 | loop(NewServer, Role, E, Msg);
1150 | {reply, Reply, Broadcast, NState} ->
1151 | NewE = broadcast({from_leader, Broadcast}, E),
1152 | NewServer = reply(From, {leader, reply, Reply},
1153 | Server#server{state = NState}, Role,
1154 | NewE),
1155 | loop(NewServer, Role, NewE, Msg);
1156 | {noreply, NState} = Reply ->
1157 | NewServer = handle_debug(Server#server{state = NState},
1158 | Role, E, Reply),
1159 | loop(NewServer, Role, E, Msg);
1160 | {stop, Reason, Reply, NState} ->
1161 | {'EXIT', R} =
1162 | (catch terminate(Reason, Msg,
1163 | Server#server{state = NState},
1164 | Role, E)),
1165 | reply(From, Reply),
1166 | exit(R);
1167 | Other ->
1168 | handle_common_reply(Other, Msg, Server, Role, E)
1169 | end;
1170 | handle_msg({from_leader, Cmd} = Msg,
1171 | #server{mod = Mod, state = State} = Server, Role, E) ->
1172 | NewE = check_candidates(E),
1173 | handle_common_reply(catch Mod:from_leader(Cmd, State, NewE),
1174 | Msg, Server, Role, NewE);
1175 | handle_msg({'$leader_call', From, Request} = Msg, Server, Role,
1176 | #election{buffered = Buffered, leader = Leader} = E) ->
1177 | Ref = make_ref(),
1178 | Leader ! {'$leader_call', {self(), Ref}, Request},
1179 | NewBuffered = [{Ref, From}|Buffered],
1180 | loop(Server, Role, E#election{buffered = NewBuffered}, Msg);
1181 | handle_msg({Ref, {leader, reply, Reply}} = Msg, Server, Role,
1182 | #election{buffered = Buffered} = E) ->
1183 | {value, {_, From}} = lists:keysearch(Ref, 1, Buffered),
1184 | El = E#election{buffered = lists:keydelete(Ref, 1, Buffered)},
1185 |
1186 | NewServer = reply(From, {leader, reply, Reply}, Server, Role, El),
1187 |
1188 | loop(NewServer, Role, El, Msg);
1189 | handle_msg({'$gen_call', From, get_candidates} = Msg, Server, Role, E) ->
1190 | NewServer = reply(From, {ok, candidates(E)}, Server, Role, E),
1191 | loop(NewServer, Role, E, Msg);
1192 | handle_msg({'$gen_call', From, Request} = Msg,
1193 | #server{mod = Mod, state = State} = Server, Role, E) ->
1194 | case catch Mod:handle_call(Request, From, State, E) of
1195 | {reply, Reply, NState} ->
1196 | NewServer = reply(From, Reply,
1197 | Server#server{state = NState}, Role, E),
1198 | loop(NewServer, Role, E, Msg);
1199 | {noreply, NState} = Reply ->
1200 | NewServer = handle_debug(Server#server{state = NState},
1201 | Role, E, Reply),
1202 | loop(NewServer, Role, E, Msg);
1203 | {stop, Reason, Reply, NState} ->
1204 | {'EXIT', R} =
1205 | (catch terminate(Reason, Msg, Server#server{state = NState},
1206 | Role, E)),
1207 | reply(From, Reply),
1208 | exit(R);
1209 | Other ->
1210 | handle_common_reply(Other, Msg, Server, Role, E)
1211 | end;
1212 | handle_msg({'$gen_cast', Msg} = Cast,
1213 | #server{mod = Mod, state = State} = Server, Role, E) ->
1214 | handle_common_reply(catch Mod:handle_cast(Msg, State, E),
1215 | Cast, Server, Role, E);
1216 | handle_msg({'$leader_cast', Msg} = Cast,
1217 | #server{mod = Mod, state = State} = Server, elected = Role, E) ->
1218 | case catch Mod:handle_leader_cast(Msg, State, E) of
1219 | {noreply, NState} ->
1220 | NewServer = handle_debug(Server#server{state = NState},
1221 | Role, E, Cast),
1222 | loop(NewServer, Role, E, Cast);
1223 | {ok, Broadcast, NState} ->
1224 | NewE = broadcast({from_leader, Broadcast}, E),
1225 | NewServer = handle_debug(Server#server{state = NState},
1226 | Role, E, Cast),
1227 | loop(NewServer, Role, NewE, Cast);
1228 | Other ->
1229 | handle_common_reply(Other, Msg, Server, Role, E)
1230 | end;
1231 | handle_msg({'$leader_cast', Msg} = Cast, Server, Role,
1232 | #election{leader = Leader} = E) ->
1233 | Leader ! {'$leader_cast', Msg},
1234 | loop(Server, Role, E, Cast);
1235 |
1236 | handle_msg(Msg, #server{mod = Mod, state = State} = Server, Role, E) ->
1237 | handle_common_reply(catch Mod:handle_info(Msg, State, E),
1238 | Msg, Server, Role, E).
1239 |
1240 |
1241 | handle_common_reply(Reply, Msg, Server, Role, E) ->
1242 | case Reply of
1243 | {noreply, NState} ->
1244 | NewServer = handle_debug(Server#server{state = NState},
1245 | Role, E, Reply),
1246 | loop(NewServer, Role, E, Msg);
1247 | {ok, NState} ->
1248 | NewServer = handle_debug(Server#server{state = NState},
1249 | Role, E, Reply),
1250 | loop(NewServer, Role, E, Msg);
1251 | {stop, Reason, NState} ->
1252 | terminate(Reason, Msg, Server#server{state = NState}, Role, E);
1253 | {'EXIT', Reason} ->
1254 | terminate(Reason, Msg, Server, Role, E);
1255 | _ ->
1256 | terminate({bad2_return_value, Reply}, Msg, Server, Role, E)
1257 | end.
1258 |
1259 |
1260 | reply({To, Tag}, Reply, #server{state = State} = Server, Role, E) ->
1261 | reply({To, Tag}, Reply),
1262 | handle_debug(Server, Role, E, {out, Reply, To, State}).
1263 |
1264 |
1265 | handle_debug(#server{debug = []} = Server, _Role, _E, _Event) ->
1266 | Server;
1267 | handle_debug(#server{debug = Debug} = Server, _Role, E, Event) ->
1268 | Debug1 = sys:handle_debug(Debug, fun ?MODULE:print_event/3,
1269 | E#election.name, Event),
1270 | Server#server{debug = Debug1}.
1271 |
1272 | %%% ---------------------------------------------------
1273 | %%% Terminate the server.
1274 | %%% ---------------------------------------------------
1275 |
1276 | terminate(Reason, Msg, #server{mod = Mod,
1277 | state = State,
1278 | debug = Debug} = _Server, _Role,
1279 | #election{name = Name, cand_timer = Timer} = _E) ->
1280 | timer:cancel(Timer),
1281 | case catch Mod:terminate(Reason, State) of
1282 | {'EXIT', R} ->
1283 | error_info(R, Name, Msg, State, Debug),
1284 | exit(R);
1285 | _ ->
1286 | case Reason of
1287 | normal ->
1288 | exit(normal);
1289 | shutdown ->
1290 | exit(shutdown);
1291 | _ ->
1292 | error_info(Reason, Name, Msg, State, Debug),
1293 | exit(Reason)
1294 | end
1295 | end.
1296 |
1297 | %% Maybe we shouldn't do this? We have the crash report...
1298 | error_info(Reason, Name, Msg, State, Debug) ->
1299 | error_logger:format("** Generic leader ~p terminating \n"
1300 | "** Last message in was ~p~n"
1301 | "** When Server state == ~p~n"
1302 | "** Reason for termination == ~n** ~p~n",
1303 | [Name, Msg, State, Reason]),
1304 | sys:print_log(Debug),
1305 | ok.
1306 |
1307 | %%% ---------------------------------------------------
1308 | %%% Misc. functions.
1309 | %%% ---------------------------------------------------
1310 |
1311 | opt(Op, [{Op, Value}|_]) ->
1312 | {ok, Value};
1313 | opt(Op, [_|Options]) ->
1314 | opt(Op, Options);
1315 | opt(_, []) ->
1316 | false.
1317 |
1318 | debug_options(Name, Opts) ->
1319 | case opt(debug, Opts) of
1320 | {ok, Options} -> dbg_options(Name, Options);
1321 | _ -> dbg_options(Name, [])
1322 | end.
1323 |
1324 | dbg_options(Name, []) ->
1325 | Opts =
1326 | case init:get_argument(generic_debug) of
1327 | error ->
1328 | [];
1329 | _ ->
1330 | [log, statistics]
1331 | end,
1332 | dbg_opts(Name, Opts);
1333 | dbg_options(Name, Opts) ->
1334 | dbg_opts(Name, Opts).
1335 |
1336 | dbg_opts(Name, Opts) ->
1337 | case catch sys:debug_options(Opts) of
1338 | {'EXIT', _} ->
1339 | error_logger:format("~p: ignoring erroneous debug options - ~p~n",
1340 | [Name, Opts]),
1341 | [];
1342 | Dbg ->
1343 | Dbg
1344 | end.
1345 |
1346 | %%-----------------------------------------------------------------
1347 | %% Status information
1348 | %%-----------------------------------------------------------------
1349 | %% @hidden
1350 | format_status(Opt, StatusData) ->
1351 | [PDict, SysState, Parent, Debug, [_Mode, Server, _Role, E]] = StatusData,
1352 | Header = lists:concat(["Status for generic server ", E#election.name]),
1353 | Log = sys:get_debug(log, Debug, []),
1354 | #server{mod = Mod, state = State} = Server,
1355 | Specific =
1356 | case erlang:function_exported(Mod, format_status, 2) of
1357 | true ->
1358 | case catch apply(Mod, format_status, [Opt, [PDict, State]]) of
1359 | {'EXIT', _} -> [{data, [{"State", State}]}];
1360 | Else -> Else
1361 | end;
1362 | _ ->
1363 | [{data, [{"State", State}]}]
1364 | end,
1365 | [{header, Header},
1366 | {data, [{"Status", SysState},
1367 | {"Parent", Parent},
1368 | {"Logged events", Log}]} |
1369 | Specific].
1370 |
1371 |
1372 | %%-----------------------------------------------------------------
1373 | %% Leader-election functions
1374 | %%-----------------------------------------------------------------
1375 |
1376 | %% Corresponds to startStage1 in Figure 1 in the Stoller-article
1377 | startStage1(E, Server) ->
1378 | NodePos = pos(node(), E#election.candidate_nodes),
1379 | Elid = {NodePos, E#election.incarn, E#election.nextel},
1380 | NewE = E#election{
1381 | elid = Elid,
1382 | nextel = E#election.nextel + 1,
1383 | down = [],
1384 | status = elec1},
1385 | case NodePos of
1386 | 1 ->
1387 | startStage2(NewE, Server);
1388 | _ ->
1389 | mon_nodes(NewE, lesser(node(), E#election.candidate_nodes), Server)
1390 | end.
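%% Example: with candidate_nodes [n1, n2, n3], n1 (position 1) proceeds
%% directly to stage 2 and starts collecting acks, while n2 and n3 first
%% monitor the candidates ahead of them ([n1] and [n1, n2] respectively)
%% and enter stage 2 only once every lesser candidate is known to be down
%% (see the {ldr, 'DOWN', Node} clause in safe_loop).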
1391 |
1392 | %% Corresponds to startStage2
1393 | startStage2(E, Server) ->
1394 | continStage2(E#election{status = elec2, pendack = node(), acks = []},
1395 | Server).
1396 |
1397 | continStage2(E, Server) ->
1398 | case (pos(E#election.pendack, E#election.candidate_nodes)
1399 | < length(E#election.candidate_nodes)) of
1400 | true ->
1401 | Pendack = next(E#election.pendack, E#election.candidate_nodes),
1402 | NewE = mon_nodes(E, [Pendack], Server),
1403 | halt_pendack(NewE#election{pendack = Pendack});
1404 | false ->
1405 | %% I am the leader
1406 | E#election{leader = self(),
1407 | leadernode = node(),
1408 | previous_leader = E#election.leader,
1409 | status = norm}
1410 | end.
1411 |
1412 | halt_pendack(#election{pendack = undefined} = E) ->
1413 | E;
1414 | halt_pendack(#election{name = Name, elid = ElId, pendack = Pendack} = E) ->
1415 | erlang:send({Name, Pendack}, {halt, ElId, self()}, [nosuspend, noconnect]),
1416 | E.
1417 |
1418 | %% corresponds to Halting
1419 | halting(E, T, From, Server) ->
1420 | NewE = mon_node(E, From, Server),
1421 | NewE#election{elid = T,
1422 | status = wait,
1423 | leadernode = node(From),
1424 | down = E#election.down -- [node(From)]
1425 | }.
1426 |
1427 |
1428 | joinCluster(E, Server) ->
1429 | Pid = {E#election.name, E#election.seed_node},
1430 | Pid ! {join, self()},
1431 | NewE = mon_node(E, Pid, Server),
1432 | NewE#election{status = joining}.
1433 |
1434 |
1435 | %%% checks if the proc has become the leader; if so, switch to loop
1436 | hasBecomeLeader(E, Server, Msg) ->
1437 | case ((E#election.status == norm) and (E#election.leader == self())) of
1438 | true ->
1439 | {ok, Synch, NewState} =
1440 | (Server#server.mod):elected(Server#server.state, E, undefined),
1441 | lists:foreach(
1442 | fun(Node) ->
1443 | {E#election.name, Node} !
1444 | {ldr, Synch, E#election.elid, workers(E), candidates(E), self()}
1445 | end, E#election.acks),
1446 |
1447 | %% Make sure we will try to contact all workers!
1448 | NewE = E#election{work_down = E#election.worker_nodes},
1449 |
1450 | %% io:format("==> I am the leader! (acks: ~200p)\n", [E#election.acks]),
1451 | %% Set the internal timeout (corresponds to Periodically)
1452 | timer:send_after(E#election.cand_timer_int, {heartbeat, node()}),
1453 | {E#election.name, node()} ! {send_checklead},
1454 |
1455 | %% trigger handle_DOWN callback if previous leader is down
1456 | PrevLeader = E#election.previous_leader,
1457 | {NewState2, NewE2} =
1458 | case PrevLeader of
1459 | none -> {NewState, NewE};
1460 | Pid when is_pid(Pid) ->
1461 | case lists:member(node(PrevLeader), down(E)) of
1462 | false -> {NewState, NewE};
1463 | true ->
1464 | case (Server#server.mod):handle_DOWN(node(PrevLeader), NewState, NewE) of
1465 | {ok, NS} -> {NS, NewE};
1466 | {ok, Synch2, NS} ->
1467 | {NS, broadcast({from_leader, Synch2}, NewE)}
1468 | end
1469 | end
1470 | end,
1471 |
1472 | %% (It's meaningful only when I am the leader!)
1473 | loop(Server#server{state = NewState2}, elected, NewE2, Msg);
1474 | false ->
1475 | safe_loop(Server, candidate, E, Msg)
1476 | end.
1477 |
1478 |
1479 | %%%
1480 | %%% No one checks incarnation type, we just check equality
1481 | %%% So it is OK to just use timestamp here
1482 | %%%
1483 | incarnation(_VarDir, _RegName, _Node) ->
1484 | os:timestamp().
1485 |
1486 |
1487 | broadcast(Msg, #election{monitored = Monitored} = E) ->
1488 | %% This function is used for broadcasts,
1489 | %% and we make sure only to broadcast to already known nodes.
1490 | ToNodes = [N || {_, N} <- Monitored],
1491 | broadcast(Msg, ToNodes, E).
1492 |
1493 | broadcast({from_leader, Msg}, ToNodes, E) ->
1494 | lists:foreach(
1495 | fun(Node) ->
1496 | {E#election.name, Node} ! {from_leader, Msg}
1497 | end, ToNodes),
1498 | E.
1499 |
1500 |
1501 | lesser(_, []) ->
1502 | [];
1503 | lesser(N, [N|_]) ->
1504 | [];
1505 | lesser(N, [M|Ms]) ->
1506 | [M|lesser(N, Ms)].
1507 |
1508 | next(_, []) ->
1509 | no_val;
1510 | next(N, [N|Ms]) ->
1511 | lists:nth(1, Ms);
1512 | next(N, [_|Ms]) ->
1513 | next(N, Ms).
1514 |
1515 | pos(_, []) ->
1516 | 100000;
1517 | pos(N1, [N1|_]) ->
1518 | 1;
1519 | pos(N1, [_|Ns]) ->
1520 | 1+pos(N1, Ns).
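%% Examples of the list helpers above, for candidates [n1, n2, n3]:
%%   lesser(n2, [n1, n2, n3]) =:= [n1]
%%   next(n1, [n1, n2, n3])   =:= n2
%%   pos(n2, [n1, n2, n3])    =:= 2   (an unknown node sorts last as 100000)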
1521 |
1522 | check_candidates(#election{down = Down} = E) ->
1523 | NewDown = [N || N <- Down, {ok, up} =/= net_kernel:node_info(N, state)],
1524 | E#election{down = NewDown}.
1525 |
1526 | broadcast_candidates(E, Synch, IgnoreNodes) ->
1527 | case E#election.bcast_type of
1528 | all ->
1529 | Nodes = [N || {_, N} <- E#election.monitored] -- IgnoreNodes,
1530 | broadcast({from_leader, Synch}, Nodes, E);
1531 | _ ->
1532 | ok
1533 | end.
1534 |
1535 | call_elected(Mod, State, E, From) when is_pid(From) ->
1536 | case Mod:elected(State, E, node(From)) of
1537 | {ok, Synch, NewState} ->
1538 | From ! {ldr, Synch, E#election.elid, workers(E), candidates(E), self()},
1539 | broadcast_candidates(E, Synch, [From]),
1540 | NewState;
1541 | {reply, Synch, NewState} ->
1542 | From ! {ldr, Synch, E#election.elid, workers(E), candidates(E), self()},
1543 | NewState
1544 | end.
1545 |
1546 |
1547 | %% Start monitoring a bunch of candidate nodes
1548 | mon_nodes(E, Nodes, Server) ->
1549 | Server#server.pinger_proc ! {set_ping_nodes, Nodes},
1550 | E1 =
1551 | case E#election.cand_timer of
1552 | undefined ->
1553 | {ok, TRef} = timer:send_interval(E#election.cand_timer_int, {candidate_timer}),
1554 | E#election{cand_timer = TRef};
1555 | _ ->
1556 | E
1557 | end,
1558 | FromNode = node(),
1559 | lists:foldl(
1560 | fun(ToNode, El) ->
1561 | Pid = {El#election.name, ToNode},
1562 | erlang:send(Pid, {heartbeat, FromNode}, [nosuspend, noconnect]),
1563 | mon_node(El, Pid, Server)
1564 | end, E1, Nodes -- [node()]).
1565 |
1566 | %% Start monitoring one Process
1567 | mon_node(E, {_RegName, NodeName} = Proc, Server) ->
1568 | do_mon_node(E, Proc, NodeName, Server);
1569 |
1570 | mon_node(E, Proc, Server) when is_pid(Proc) ->
1571 | do_mon_node(E, Proc, node(Proc), Server).
1572 |
1573 | do_mon_node(E, Proc, NodeName, Server) ->
1574 | case lists:keymember(NodeName, 2, E#election.monitored) of
1575 | true -> E;
1576 | false ->
1577 | {Ref, Node} = do_monitor(Proc, Server),
1578 | E#election{monitored = [{Ref, Node} | E#election.monitored]}
1579 | end.
1580 |
1581 | spawn_monitor_proc() ->
1582 | Parent = self(),
1583 | proc_lib:spawn_link(?MODULE, real_mon_loop, [Parent, []]).
1584 |
1585 |
1586 | do_monitor(Proc, #server{monitor_proc = P}) ->
1587 | P ! {self(), {monitor, Proc}},
1588 | receive
1589 | {mon_reply, Reply} ->
1590 | Reply
1591 | after 10000 -> % can take quite a while to receive mon_reply if the node is down
1592 | erlang:error(timeout)
1593 | end.
1594 |
1595 | mon_loop(Parent, Refs) ->
1596 | ?MODULE:real_mon_loop(Parent, Refs).
1597 |
1598 | real_mon_loop(Parent, Refs) ->
1599 | receive
1600 | code_reloaded ->
1601 | mon_loop(Parent, Refs);
1602 | {From, Req} ->
1603 | mon_loop(Parent, mon_handle_req(Req, From, Refs));
1604 | {'DOWN', Ref, _, _, _} ->
1605 | mon_loop(Parent, mon_handle_down(Ref, Parent, Refs));
1606 | Msg ->
1607 | io:fwrite("mon_loop with parent: ~p refs: ~p received: ~p~n", [Parent, Refs, Msg]),
1608 | mon_loop(Parent, Refs)
1609 | end.
1610 |
1611 | mon_handle_req({monitor, P}, From, Refs) ->
1612 | Node = case P of
1613 | {_Name, N} -> N;
1614 | Pid when is_pid(Pid) -> node(Pid)
1615 | end,
1616 | case lists:keyfind(Node, 2, Refs) of
1617 | {Ref, _} ->
1618 | mon_reply(From, {Ref, Node}),
1619 | Refs;
1620 | false ->
1621 | Ref = erlang:monitor(process, P),
1622 | mon_reply(From, {Ref, Node}),
1623 | [{Ref, Node}|Refs]
1624 | end.
1625 |
1626 | mon_handle_down(Ref, Parent, Refs) ->
1627 | case lists:keytake(Ref, 1, Refs) of
1628 | {value, {_, Node}, Refs1} ->
1629 | Parent ! {ldr, 'DOWN', Node},
1630 | Refs1;
1631 | false ->
1632 | Refs
1633 | end.
1634 |
1635 |
1636 | mon_reply(From, Reply) ->
1637 | From ! {mon_reply, Reply}.
1638 |
1639 |
1640 | spawn_pinger_proc() ->
1641 | Parent = self(),
1642 | proc_lib:spawn_link(?MODULE, init_ping_loop, [Parent, []]).
1643 |
1644 | init_ping_loop(Parent, NodesToPing) ->
1645 | ping_loop(Parent, set_ping_timer(0), NodesToPing).
1646 |
1647 | set_ping_timer(Timeout) ->
1648 | erlang:start_timer(Timeout, self(), {do_ping}).
1649 |
1650 | %% To avoid leader blocking on message send, we ping nodes here,
1651 | %% and leader sends messages to down nodes with [nosuspend, noconnect]
1652 | ping_loop(Parent, TRef, NodesToPing) ->
1653 | receive
1654 | code_reloaded ->
1655 | ?MODULE:ping_loop(Parent, TRef, NodesToPing);
1656 | {set_ping_nodes, NewNodesToPing} ->
1657 | init_ping_loop(Parent, NewNodesToPing);
1658 | {timeout, TRef, _} ->
1659 | NewTRef = set_ping_timer(1000),
1660 | [net_adm:ping(Node) || Node <- NodesToPing],
1661 | ?MODULE:ping_loop(Parent, NewTRef, NodesToPing);
1662 | {timeout, _, _} ->
1663 | ?MODULE:ping_loop(Parent, TRef, NodesToPing);
1664 | Msg ->
1665 | io:fwrite("ping_loop with parent: ~p nodes: ~p received: ~p~n", [Parent, NodesToPing, Msg]),
1666 | ?MODULE:ping_loop(Parent, TRef, NodesToPing)
1667 | end.
1668 |
1669 |
1670 |
1671 | %% the heartbeat messages sent to the downed nodes when the candidate_timer
1672 | %% message is received can take a very long time in the case of a partitioned
1673 | %% network (7 seconds in my testing). Since the candidate_timer is generated
1674 | %% by a send_interval, this means many candidate_timer messages can accumulate
1675 | %% in the mailbox. This function is used to clear them out after handling one
1676 | %% of the candidate_timers, so gen_leader doesn't spend all its time sending
1677 | %% heartbeats.
1678 | flush_candidate_timers() ->
1679 | receive
1680 | {candidate_timer} ->
1681 | flush_candidate_timers()
1682 | after
1683 | 0 ->
1684 | ok
1685 | end.
1686 |
1687 | %% sending messages to disconnected nodes can take a long time;
1688 | %% instead of doing this in the gen_leader process, do it here
1689 | %% in a new proc so that gen_leader can remain responsive
1690 | %% Reschedule the next round of checkleads after this round completes,
1691 | %% since sending the messages can take longer than the time between rounds
1692 | send_checkleads(Name, Time, GlProc, Down) ->
1693 | Node = node(),
1694 | [{Name, N} ! {checklead, Node} || N <- Down],
1695 | erlang:send_after(Time, GlProc, {send_checklead}).
1696 |
1697 |
1698 |
--------------------------------------------------------------------------------