├── rebar ├── .gitignore ├── src ├── locker.app.src └── locker.erl ├── rebar.config ├── priv └── basho_bench_locker.config ├── LICENSE ├── test ├── locker_benchmark.erl ├── basho_bench_driver_locker.erl ├── locker_proper.erl └── locker_SUITE.erl └── README.md /rebar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wooga/locker/HEAD/rebar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | .eunit 3 | ebin 4 | deps 5 | *.beam 6 | *~ 7 | .DS_Store 8 | .rebar 9 | 10 | -------------------------------------------------------------------------------- /src/locker.app.src: -------------------------------------------------------------------------------- 1 | {application, locker, 2 | [ 3 | {description, ""}, 4 | {vsn, "6"}, 5 | {registered, []}, 6 | {applications, [ 7 | kernel, 8 | stdlib 9 | ]}, 10 | {env, []} 11 | ]}. 12 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {lib_dirs, ["deps"]}. 2 | {erl_opts, [debug_info]}. 3 | {clean_files, ["ebin/*.beam"]}. 4 | 5 | {xref_checks, [exports_not_used, undefined_function_calls]}. 6 | {deps, [ 7 | {proper, "", {git,"git://github.com/manopapad/proper.git"}} 8 | ]}. 9 | 10 | {ct_use_short_names, true}. 11 | -------------------------------------------------------------------------------- /priv/basho_bench_locker.config: -------------------------------------------------------------------------------- 1 | {mode, max}. 2 | %{mode, {rate, 100}}. 3 | 4 | {duration, 15}. 5 | 6 | {concurrent, 8}. 7 | 8 | {driver, basho_bench_driver_locker}. 9 | 10 | {code_paths, ["../locker/ebin"]}. 11 | 12 | {operations, [{get, 9}, {set,1}]}. 13 | 14 | {key_generator, {partitioned_sequential_int, 100000000}}. 15 | 16 | {value_generator, {fixed_bin, 1}}. 17 | 18 | {masters, [{'localhost', 'a'}, {'localhost', 'b'}, {'localhost', 'c'}]}. 19 | {w, 2}. 20 | {replicas, [{'localhost', 'r1'}, {'localhost', 'r2'}, {'localhost', 'r3'}]}. 21 | {start_nodes, true}. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2011 wooga GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /test/locker_benchmark.erl: -------------------------------------------------------------------------------- 1 | -module(locker_benchmark). 2 | -compile([export_all]). 3 | 4 | 5 | 6 | start() -> 7 | StatsConfig = [{docroot, filename:absname( 8 | filename:join(code:priv_dir(statman), 9 | "docroot"))}], 10 | elli:start_link([{callback, statman_elli}, {callback_args, StatsConfig}]), 11 | 12 | locker:start_link(1), 13 | locker:set_nodes([node()], [node()], []), 14 | 15 | statman_server:start_link(1000), 16 | statman_merger:start_link(), 17 | 18 | statman_elli_server:start_link(), 19 | statman_server:add_subscriber(statman_merger), 20 | statman_merger:add_subscriber(statman_elli_server). 21 | 22 | 23 | run(Start, End, ExtendInterval, LeaseLength) -> 24 | [begin 25 | timer:sleep(100), 26 | spawn(?MODULE, session, [N, ExtendInterval, LeaseLength]) 27 | end || N <- lists:seq(Start, End)]. 28 | 29 | 30 | 31 | session(Id, ExtendInterval, LeaseLength) -> 32 | Start = erlang:monotonic_time(micro_seconds), 33 | case locker:lock(Id, self(), LeaseLength) of 34 | {ok, _, _, _} -> 35 | statman_histogram:record_value(get_lease, (erlang:monotonic_time(micro_seconds) - Start)), 36 | ?MODULE:session_loop(Id, ExtendInterval, LeaseLength); 37 | {error, _} -> 38 | error_logger:info_msg("~p: could not get lock~n", [Id]) 39 | end. 40 | 41 | session_loop(Id, ExtendInterval, LeaseLength) -> 42 | erlang:send_after(ExtendInterval, self(), extend), 43 | 44 | receive 45 | extend -> 46 | Start = erlang:monotonic_time(micro_seconds), 47 | case locker:extend_lease(Id, self(), LeaseLength) of 48 | ok -> 49 | statman_histogram:record_value(extend_lease, (erlang:monotonic_time(micro_seconds) - Start)), 50 | ?MODULE:session_loop(Id, ExtendInterval, LeaseLength); 51 | {error, _} -> 52 | error_logger:info_msg("~p: could not extend lease~n", [Id]) 53 | end 54 | end. 55 | -------------------------------------------------------------------------------- /test/basho_bench_driver_locker.erl: -------------------------------------------------------------------------------- 1 | -module(basho_bench_driver_locker). 2 | 3 | -export([new/1, 4 | run/4]). 
5 | 6 | new(_Id) -> 7 |     case mark_setup_completed() of 8 |         true -> 9 |             error_logger:info_msg("setting up cluster~n"), 10 |             net_kernel:start([master, shortnames]), 11 |             {ok, _LocalLocker} = locker:start_link(2), 12 |             MasterNames = basho_bench_config:get(masters), 13 |             ReplicaNames = basho_bench_config:get(replicas), 14 |             W = basho_bench_config:get(w), 15 | 16 |             case basho_bench_config:get(start_nodes) of 17 |                 true -> 18 |                     Masters = setup(MasterNames, W), 19 |                     Replicas = setup(ReplicaNames, W), 20 | 21 |                     ok = locker:set_nodes(Masters ++ Replicas, Masters, Replicas), 22 |                     error_logger:info_msg("~p~n", 23 |                                           [rpc:call(hd(Replicas), locker, get_meta, [])]), 24 |                     {ok, {Masters, Replicas}}; 25 |                 false -> 26 |                     {ok, []} 27 |             end; 28 |         false -> 29 |             %%timer:sleep(30000), 30 |             Masters = [list_to_atom(atom_to_list(N) ++ "@" ++ atom_to_list(H)) 31 |                        || {H, N} <- basho_bench_config:get(masters)], 32 | 33 |             Replicas = [list_to_atom(atom_to_list(N) ++ "@" ++ atom_to_list(H)) 34 |                         || {H, N} <- basho_bench_config:get(replicas)], 35 | 36 |             {ok, {Masters, Replicas}} 37 |     end. 38 | 39 | setup(NodeNames, W) -> 40 |     Nodes = [begin 41 |                  element(2, slave:start_link(Hostname, N)) 42 |              end 43 |              || {Hostname, N} <- NodeNames], 44 | 45 |     [rpc:call(N, code, add_path, ["/home/knutin/git/locker/ebin"]) || N <- Nodes], 46 |     [rpc:call(N, locker, start_link, [W]) || N <- Nodes], 47 | 48 |     Nodes. 49 | 50 | 51 | mark_setup_completed() -> 52 |     case whereis(locker_setup) of 53 |         undefined -> 54 |             true = register(locker_setup, self()), 55 |             true; 56 |         _ -> 57 |             false 58 |     end. 59 | 60 | 61 | 62 | run(set, KeyGen, _ValueGen, {[M | Masters], Replicas}) -> 63 |     NewMasters = lists:reverse([M | lists:reverse(Masters)]), 64 | 65 |     Key = KeyGen(), 66 |     case rpc:call(M, locker, lock, [Key, Key]) of 67 |         {ok, _, _, _} -> 68 |             {ok, {NewMasters, Replicas}}; 69 |         {error, Error} -> 70 |             error_logger:info_msg("Key: ~p, ~p~n", [Key, Error]), 71 |             {error, Error, {NewMasters, Replicas}} 72 |     end; 73 | 74 | run(get, KeyGen, _, {[M | Masters], Replicas}) -> 75 |     NewMasters = lists:reverse([M | lists:reverse(Masters)]), 76 | 77 |     Key = KeyGen(), 78 |     case locker:dirty_read(Key) of 79 |         {ok, Key} -> 80 |             {ok, {NewMasters, Replicas}}; 81 |         {ok, _OtherValue} -> 82 |             {error, wrong_value, {NewMasters, Replicas}}; 83 |         {error, not_found} -> 84 |             {ok, {NewMasters, Replicas}} 85 |     end. 86 | 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## locker - atomic distributed "check and set" for short-lived keys 2 | 3 | `locker` is a distributed, decentralized, consistent in-memory 4 | key-value store written in Erlang. An entry expires after a certain 5 | amount of time, unless the lease is extended. This makes it a good 6 | practical option for locks, mutexes and leader election in a 7 | distributed system. 8 | 9 | In terms of the CAP theorem, `locker` chooses consistency by requiring 10 | a quorum for every write. For reads, `locker` chooses availability and 11 | always does a local read which can be inconsistent. Extensions of the 12 | lease are used as an anti-entropy mechanism to eventually propagate all 13 | leases. 14 | 15 | It is designed to be used inside your application on the Erlang VM, 16 | using the Erlang distribution to communicate with masters and 17 | replicas.
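A minimal single-node sketch of the API (one master, no replicas, `w = 1`); the key, value and lease length below are arbitrary examples, and a real deployment would pass several master and replica nodes to `locker:set_nodes/3`:

```erlang
%% Start locker on the local node with a write quorum of 1 and make
%% this node the only master, with no replicas.
{ok, _Pid} = locker:start_link(1),
ok = locker:set_nodes([node()], [node()], []),

%% Lock a key for 5000 ms, read it back, extend the lease and release it.
{ok, _W, _Votes, _Commits} = locker:lock(my_key, self(), 5000),
{ok, Value} = locker:dirty_read(my_key),
ok = locker:extend_lease(my_key, Value, 5000),
{ok, _, _, _} = locker:release(my_key, Value).
```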
18 | 19 | Operations: 20 | 21 | * `locker:lock/2,3,4` 22 | * `locker:update/3,4` 23 | * `locker:extend_lease/3` 24 | * `locker:release/2,3` 25 | * `locker:wait_for/2` 26 | * `locker:wait_for_release/1,2` 27 | 28 | 29 | ### Writes 30 | 31 | To achieve "atomic" updates, the write is done in two phases, voting and 32 | committing. 33 | 34 | In the voting phase, the client asks every master node for a promise 35 | that the node can later set the key. The promise is only granted if 36 | the current value is what the client expects. The promise will block 37 | any other clients from also receiving a promise for that key. 38 | 39 | If the majority of the master nodes gives the client the promise 40 | (quorum), the client can go ahead and commit the lock. If a positive 41 | majority was not reached, the client will abort and delete any 42 | promises it received. 43 | 44 | ### Reads 45 | 46 | `locker` currently only offers dirty reads from the local node. If we 47 | need consistent reads, a read quorum can be used. 48 | 49 | ### Failure 50 | 51 | "So, this is all fine and good, but what happens when something 52 | fails?". To make the implementation simple, there is a timeout on 53 | every promise and every lock. If a promise is not converted into a 54 | lock in time, it is simply deleted. 55 | 56 | If the user process fails to extend the lease of its lock, the lock 57 | expires without consulting any other node. If a node is partitioned 58 | away from the rest of the cluster, the lock might expire too soon 59 | resulting in reads returning the empty value. However, a new lock 60 | cannot be created as a quorum cannot be reached. 61 | 62 | Calling `locker:wait_for_release/2` will block until a lock expires, 63 | either by manual release or from an expired lease. 64 | 65 | ### Lease expiration 66 | 67 | Synchronized clocks are not required for correct expiration of a 68 | lease. It is only required that the clocks progress at roughly the 69 | same speed. When a lock is created or extended, the node will set the 70 | expiration to `now() + lease_length`, which means that the user needs 71 | to account for the skew when extending the lease. With leases in the 72 | order of minutes, the skew should be very small. 73 | 74 | When a lease is extended, it is replicated to the other nodes in the 75 | cluster, which will update their local copy if they don't already have 76 | the key. This is used to bring new nodes in sync. 77 | 78 | ### Replication 79 | 80 | A `locker` cluster consists of masters and replicas. The masters 81 | participate in the quorum and accept writes from the clients. The 82 | masters implement strong consistency. Periodically the masters send 83 | off their transaction log to the replicas where it is replayed to 84 | create the same state. Replication is thus asynchronous and reads on 85 | the replicas might be inconsistent. Replication is done in batches to 86 | improve performance by reducing the number of messages each replica 87 | needs to handle. Calling `locker:wait_for/2` after a successful write 88 | will block until the key is replicated to the local node. If the local 89 | node is a master, it will return immediately. 90 | 91 | ### Adding new nodes 92 | 93 | New nodes may first be added as replicas to sync up before being 94 | promoted to master. Every operation happening after the replica 95 | joined will also be propagated to the replica. The time to catch up 96 | is then determined by how long it takes for all leases to be extended.
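As a sketch, assuming an existing cluster with masters `A` and `B` and a freshly started node `C` that should eventually become a master (the node variables are placeholders for node names):

```erlang
%% Let C catch up as a replica first; A and B keep forming the quorum.
ok = locker:set_nodes([A, B, C], [A, B], [C]),

%% Once C has seen all leases extended at least once, promote it to master.
ok = locker:set_nodes([A, B, C], [A, B, C], []).
```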
97 | 98 | New nodes might also be set directly as masters, in which case the new 99 | node might give negative votes in the quorum. As long as a quorum can 100 | be reached, the out-of-sync master will still accept writes and catch 101 | up as fast as a replica. 102 | 103 | Using `locker:set_nodes/3` masters and replicas can be set across the 104 | entire cluster in a "send-and-pray" operation. If something happens 105 | during this operation, the locker cluster might be in an inconsistent 106 | state. 107 | -------------------------------------------------------------------------------- /test/locker_proper.erl: -------------------------------------------------------------------------------- 1 | -module(locker_proper). 2 | -compile([export_all]). 3 | 4 | -include_lib("proper/include/proper.hrl"). 5 | 6 | -record(state, {master_leases, replicated_leases}). 7 | 8 | -define(MASTERS, [host_name("a")]). 9 | -define(REPLICAS, [host_name("b")]). 10 | 11 | test() -> 12 | proper:quickcheck(prop_lock_release()). 13 | 14 | prop_lock_release() -> 15 | ?FORALL(Commands, parallel_commands(?MODULE), 16 | ?TRAPEXIT( 17 | begin 18 | [A, B] = Cluster = setup([a, b]), 19 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A], [B]]), 20 | {Seq, P, Result} = run_parallel_commands(?MODULE, Commands), 21 | teardown(Cluster), 22 | ?WHENFAIL( 23 | io:format("Sequential: ~p\nParallel: ~p\nRes: ~p\n", 24 | [Seq, P, Result]), 25 | Result =:= ok) 26 | end)). 27 | 28 | key() -> 29 | elements([1]). 30 | 31 | value() -> 32 | elements([foo, bar]). 33 | 34 | get_master() -> 35 | elements(?MASTERS). 36 | 37 | get_replica() -> 38 | elements(?REPLICAS). 39 | 40 | get_node() -> 41 | elements(?MASTERS ++ ?REPLICAS). 42 | 43 | is_master(N) -> 44 | lists:member(N, ?MASTERS). 45 | 46 | is_replica(N) -> 47 | lists:member(N, ?REPLICAS). 48 | 49 | command(S) -> 50 | Leases = S#state.master_leases =/= [], 51 | oneof([{call, ?MODULE, lock, [get_node(), key(), value()]}] ++ 52 | [{call, ?MODULE, read, [get_node(), key()]}] ++ 53 | [?LET({Key, Value}, elements(S#state.master_leases), 54 | {call, ?MODULE, release, 55 | [get_node(), Key, Value]}) || Leases] ++ 56 | [{call, ?MODULE, update, [get_node(), key(), value(), value()]} 57 | || Leases] ++ 58 | [{call, ?MODULE, replicate, []}] 59 | ). 60 | 61 | lock(Node, Key, Value) -> 62 | rpc:call(Node, locker, lock, [Key, Value]). 63 | 64 | release(Node, Key, Value) -> 65 | rpc:call(Node, locker, release, [Key, Value]). 66 | 67 | update(Node, Key, Value, NewValue) -> 68 | rpc:call(Node, locker, update, [Key, Value, NewValue]). 69 | 70 | replicate() -> 71 | rpc:sbcast(?MASTERS, locker, push_trans_log). 72 | 73 | read(Node, Key) -> 74 | rpc:call(Node, locker, dirty_read, [Key]). 75 | 76 | 77 | initial_state() -> 78 | #state{master_leases = [], replicated_leases = []}. 79 | 80 | precondition(S, {call, _, release, [_, Key, _Value]}) -> 81 | lists:keymember(Key, 1, S#state.master_leases); 82 | 83 | precondition(_, _) -> 84 | true. 
85 | 86 | next_state(S, _V, {call, _, lock, [_, Key, Value]}) -> 87 | case lists:keymember(Key, 1, S#state.master_leases) of 88 | true -> 89 | S; 90 | false -> 91 | S#state{master_leases = [{Key, Value} | S#state.master_leases]} 92 | end; 93 | 94 | next_state(S, _V, {call, _, release, [_, Key, Value]}) -> 95 | case lists:member({Key, Value}, S#state.master_leases) of 96 | true -> 97 | S#state{master_leases = lists:delete({Key, Value}, 98 | S#state.master_leases), 99 | replicated_leases = 100 | lists:delete({Key, Value}, S#state.replicated_leases)}; 101 | false -> 102 | S 103 | end; 104 | 105 | next_state(S, _V, {call, _, update, [_, Key, Value, NewValue]}) -> 106 | case lists:member({Key, Value}, S#state.master_leases) of 107 | true -> 108 | S#state{master_leases = [{Key, NewValue} | 109 | lists:delete({Key, Value}, 110 | S#state.master_leases)]}; 111 | false -> 112 | S 113 | end; 114 | 115 | next_state(S, _V, {call, _, replicate, []}) -> 116 | S#state{replicated_leases = S#state.master_leases}; 117 | 118 | next_state(S, _V, {call, _, read, _}) -> 119 | S. 120 | 121 | postcondition(S, {call, _, lock, [_, Key, _Value]}, Result) -> 122 | case Result of 123 | {ok, _, _, _} -> 124 | not lists:keymember(Key, 1, S#state.master_leases); 125 | {error, no_quorum} -> 126 | lists:keymember(Key, 1, S#state.master_leases) 127 | end; 128 | 129 | postcondition(S, {call, _, release, [_, Key, Value]}, {ok, _, _, _}) -> 130 | lists:member({Key, Value}, S#state.master_leases); 131 | 132 | postcondition(S, {call, _, release, [_, Key, _Value]}, {error, no_quorum}) -> 133 | lists:keymember(Key, 1, S#state.master_leases); 134 | 135 | postcondition(S, {call, _, update, [_, Key, Value, _NewValue]}, 136 | {ok, _, _, _}) -> 137 | lists:member({Key, Value}, S#state.master_leases); 138 | 139 | postcondition(S, {call, _, update, [_, Key, Value, _NewValue]}, 140 | {error, no_quorum}) -> 141 | Val = lists:keymember(Key, 1, S#state.master_leases), 142 | Val orelse (Val =/= Value); 143 | 144 | postcondition(_S, {call, _, replicate, []}, _) -> 145 | true; 146 | 147 | postcondition(S, {call, _, read, [Node, Key]}, Result) -> 148 | case is_master(Node) of 149 | true -> 150 | case Result of 151 | {ok, Value} -> 152 | lists:member({Key, Value}, S#state.master_leases); 153 | {error, not_found} -> 154 | not lists:keymember(Key, 1, S#state.master_leases) 155 | end; 156 | false -> 157 | case Result of 158 | {ok, Value} -> 159 | lists:member({Key, Value}, S#state.replicated_leases); 160 | {error, not_found} -> 161 | not lists:keymember(Key, 1, S#state.replicated_leases) 162 | end 163 | end. 164 | 165 | %% 166 | %% SETUP 167 | %% 168 | 169 | setup(Name) when is_atom(Name) -> 170 | {ok, Node} = slave:start_link(list_to_atom(net_adm:localhost()), Name), 171 | true = rpc:call(Node, code, add_path, ["ebin"]), 172 | {ok, _} = rpc:call(Node, locker, start_link, [1]), 173 | 174 | {ok, _, _, R1, R2, R3} = rpc:call(Node, locker, get_debug_state, []), 175 | {ok, cancel} = rpc:call(Node, timer, cancel, [R1]), 176 | {ok, cancel} = rpc:call(Node, timer, cancel, [R2]), 177 | {ok, cancel} = rpc:call(Node, timer, cancel, [R3]), 178 | Node; 179 | 180 | setup(NodeNames) -> 181 | lists:map(fun setup/1, NodeNames). 182 | 183 | teardown(Nodes) -> 184 | lists:map(fun slave:stop/1, Nodes). 185 | 186 | %% @doc Return fully qualified name for local host node. 187 | host_name(Name) -> 188 | list_to_atom(Name ++ "@" ++ net_adm:localhost()). 
189 | -------------------------------------------------------------------------------- /test/locker_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(locker_SUITE). 2 | -compile([export_all]). 3 | -include_lib("test_server/include/test_server.hrl"). 4 | 5 | -define (EBIN_DIR, lists:flatten( 6 | filename:dirname(filename:dirname(filename:absname(""))) ++ 7 | ["/ebin"])). 8 | 9 | all() -> 10 | [ 11 | api, 12 | quorum, 13 | no_quorum_possible, 14 | release, 15 | lease_extend, 16 | expire_leases, 17 | one_node_down, 18 | extend_propagates, 19 | add_remove_node, 20 | replica, 21 | promote, 22 | wait_for, 23 | wait_for_release, 24 | update 25 | ]. 26 | 27 | api(_) -> 28 | [A, B, C] = Cluster = setup([a, b, c]), 29 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 30 | 31 | {Cluster, [], 2} = rpc:call(A, locker, get_meta, []), 32 | 33 | ok = rpc:call(A, locker, set_w, [[A], 3]), 34 | {Cluster, [], 3} = rpc:call(A, locker, get_meta, []), 35 | ok = rpc:call(A, locker, set_w, [[A], 2]), 36 | 37 | {ok, 2, 3, 3} = rpc:call(A, locker, lock, [123, self()]), 38 | %% slave:stop(C), 39 | Pid = rpc:call(C, erlang, whereis, [locker]), 40 | true = rpc:call(C, erlang, exit, [Pid, kill]), 41 | false = rpc:call(C, erlang, is_process_alive, [Pid]), 42 | {ok, 2, 2, 2} = rpc:call(A, locker, release, [123, self()]), 43 | {ok, 2, 2, 2} = rpc:call(B, locker, lock, [123, self()]), 44 | {error, no_quorum} = rpc:call(A, locker, update, [123, wrong_value, 45 | new_value]), 46 | 47 | teardown([A, B, C]). 48 | 49 | quorum(_) -> 50 | [A, B, C] = Cluster = setup([a, b, c]), 51 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 52 | 53 | Parent = self(), 54 | spawn(fun() -> 55 | Parent ! {1, catch rpc:call(A, locker, lock, [123, Parent])} 56 | end), 57 | spawn(fun() -> 58 | Parent ! {2, catch rpc:call(B, locker, lock, [123, Parent])} 59 | end), 60 | receive {1, P1} -> P1 after 1000 -> throw(timeout) end, 61 | receive {2, P2} -> P2 after 1000 -> throw(timeout) end, 62 | 63 | ?line {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 64 | ?line {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 65 | rpc:sbcast([A, B, C], locker, push_trans_log), 66 | ?line {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 67 | 68 | {ok, [], [{123, Pid, _}], _, _, _} = state(A), 69 | {ok, [], [{123, Pid, _}], _, _, _} = state(B), 70 | {ok, [], [{123, Pid, _}], _, _, _} = state(C), 71 | 72 | teardown([A, B, C]). 73 | 74 | no_quorum_possible(_) -> 75 | [A, B, C] = setup([a, b, c]), 76 | ok = rpc:call(A, locker, set_nodes, [[A, B], [A, B], []]), 77 | 78 | Parent = self(), 79 | spawn(fun() -> 80 | Parent ! {1, catch rpc:call(A, locker, lock, [123, Parent])} 81 | end), 82 | spawn(fun() -> 83 | Parent ! {2, catch rpc:call(B, locker, lock, [123, Parent])} 84 | end), 85 | 86 | {error, no_quorum} = receive {1, P1} -> P1 after 1000 -> throw(timeout) end, 87 | {error, no_quorum} = receive {2, P2} -> P2 after 1000 -> throw(timeout) end, 88 | 89 | {error, not_found} = rpc:call(A, locker, dirty_read, [123]), 90 | {error, not_found} = rpc:call(B, locker, dirty_read, [123]), 91 | rpc:sbcast([A, B, C], locker, push_trans_log), 92 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 93 | 94 | {ok, [], [], _, _, _} = state(A), 95 | {ok, [], [], _, _, _} = state(B), 96 | {ok, [], [], _, _, _} = state(C), 97 | 98 | teardown([A, B, C]). 
99 | 100 | release(_) -> 101 | [A, B, C] = Cluster = setup([a, b, c]), 102 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 103 | 104 | Value = self(), 105 | {ok, 2, 3, 3} = rpc:call(A, locker, lock, [123, Value]), 106 | 107 | {ok, Value} = rpc:call(A, locker, dirty_read, [123]), 108 | {ok, Value} = rpc:call(B, locker, dirty_read, [123]), 109 | rpc:sbcast([A, B, C], locker, push_trans_log), 110 | {ok, Value} = rpc:call(C, locker, dirty_read, [123]), 111 | slave:stop(A), 112 | slave:stop(B), 113 | 114 | {error, no_quorum} = rpc:call(C, locker, release, [123, Value]), 115 | rpc:sbcast([A, B, C], locker, push_trans_log), 116 | {ok, Value} = rpc:call(C, locker, dirty_read, [123]), 117 | 118 | teardown([A, B, C]). 119 | 120 | one_node_down(_) -> 121 | [A, B, C] = Cluster = setup([a, b, c]), 122 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 123 | slave:stop(C), 124 | 125 | Pid = self(), 126 | spawn(fun() -> 127 | Pid ! {1, catch rpc:call(A, locker, lock, [123, Pid])} 128 | end), 129 | receive {1, P1} -> P1 after 1000 -> throw(timeout) end, 130 | 131 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 132 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 133 | 134 | {ok, [], [{123, Pid, _}], _, _, _} = state(A), 135 | {ok, [], [{123, Pid, _}], _, _, _} = state(B), 136 | 137 | teardown([A, B, C]). 138 | 139 | extend_propagates(_) -> 140 | [A, B, C] = setup([a, b, c]), 141 | ok = rpc:call(A, locker, set_nodes, [[A, B], [A, B], []]), 142 | 143 | Pid = self(), 144 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 145 | 146 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 147 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 148 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 149 | 150 | {ok, [], [{123, Pid, _}], _, _, _} = state(A), 151 | {ok, [], [{123, Pid, _}], _, _, _} = state(B), 152 | rpc:sbcast([A, B, C], locker, push_trans_log), 153 | {ok, [], [], _, _, _} = state(C), 154 | 155 | ok = rpc:call(A, locker, set_nodes, [[A, B, C], [A, B], [C]]), 156 | 157 | ok = rpc:call(A, locker, extend_lease, [123, Pid, 2000]), 158 | 159 | 160 | {ok, [], [{123, Pid, _ExA}], _, _, _} = state(A), 161 | {ok, [], [{123, Pid, _ExB}], _, _, _} = state(B), 162 | rpc:sbcast([A, B, C], locker, push_trans_log), 163 | {ok, [], [{123, Pid, _ExC}], _, _, _} = state(C), 164 | 165 | %% abs((ExA - ExB)) < 3 orelse throw(too_much_drift), 166 | %% abs((ExB - ExC)) < 3 orelse throw(too_much_drift), 167 | %% abs((ExA - ExC)) < 3 orelse throw(too_much_drift), 168 | 169 | teardown([A, B, C]). 
170 | 171 | 172 | lease_extend(_) -> 173 | [A, B, C] = Cluster = setup([a, b, c]), 174 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 175 | 176 | Pid = self(), 177 | {ok, _, _, _} = rpc:call(A, locker, lock, [123, Pid]), 178 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 179 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 180 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 181 | 182 | timer:sleep(2000), 183 | rpc:sbcast([A, B, C], locker, expire_leases), 184 | 185 | {error, not_found} = rpc:call(A, locker, dirty_read, [123]), 186 | {error, not_found} = rpc:call(B, locker, dirty_read, [123]), 187 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 188 | 189 | {ok, _, _, _} = rpc:call(A, locker, lock, [123, Pid]), 190 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 191 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 192 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 193 | 194 | 195 | ok = rpc:call(B, locker, extend_lease, [123, Pid, 2000]), 196 | rpc:sbcast([A, B, C], locker, expire_leases), 197 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 198 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 199 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 200 | 201 | ok. 202 | 203 | expire_leases(_) -> 204 | [A, B, C] = Cluster = setup([a, b, c]), 205 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 206 | 207 | Pid = self(), 208 | {ok, _, _, _} = rpc:call(A, locker, lock, [123, Pid]), 209 | 210 | timer:sleep(1000), 211 | {ok, _, _, _} = rpc:call(A, locker, lock, [abc, Pid]), 212 | 213 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 214 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 215 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 216 | {ok, Pid} = rpc:call(A, locker, dirty_read, [abc]), 217 | {ok, Pid} = rpc:call(B, locker, dirty_read, [abc]), 218 | {ok, Pid} = rpc:call(C, locker, dirty_read, [abc]), 219 | 220 | timer:sleep(2000), 221 | rpc:sbcast([A, B, C], locker, expire_leases), 222 | 223 | {error, not_found} = rpc:call(A, locker, dirty_read, [123]), 224 | {error, not_found} = rpc:call(B, locker, dirty_read, [123]), 225 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 226 | {error, not_found} = rpc:call(A, locker, dirty_read, [abc]), 227 | {error, not_found} = rpc:call(B, locker, dirty_read, [abc]), 228 | {error, not_found} = rpc:call(C, locker, dirty_read, [abc]), 229 | 230 | teardown([A, B, C]). 231 | 232 | add_remove_node(_) -> 233 | [A, B, C] = Cluster = setup([a, b, c]), 234 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 235 | 236 | {ok, 2, 3, 3} = rpc:call(A, locker, lock, [123, self()]), 237 | {ok, 2, 3, 3} = rpc:call(B, locker, release, [123, self()]), 238 | 239 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], []]), 240 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, self()]), 241 | 242 | teardown([A, B, C]). 
243 | 244 | replica(_) -> 245 | [A, B, C] = Cluster = setup([a, b, c]), 246 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 247 | 248 | {[A, B], [C], 2} = rpc:call(A, locker, get_meta, []), 249 | {[A, B], [C], 2} = rpc:call(B, locker, get_meta, []), 250 | {[A, B], [C], 2} = rpc:call(C, locker, get_meta, []), 251 | 252 | Pid = self(), 253 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 254 | 255 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 256 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 257 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 258 | rpc:sbcast([A, B, C], locker, push_trans_log), 259 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 260 | 261 | slave:stop(B), 262 | 263 | {error, no_quorum} = rpc:call(A, locker, release, [123, Pid]), 264 | 265 | teardown([A, B, C]). 266 | 267 | promote(_) -> 268 | [A, B, C] = Cluster = setup([a, b, c]), 269 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 270 | 271 | Pid = self(), 272 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 273 | timer:sleep(200), 274 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 275 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 276 | rpc:sbcast([A, B, C], locker, push_trans_log), 277 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 278 | 279 | 280 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B, C], []]), 281 | {ok, 2, 3, 3} = rpc:call(A, locker, release, [123, Pid]), 282 | 283 | teardown([A, B, C]). 284 | 285 | 286 | wait_for(_) -> 287 | [A, B, C] = Cluster = setup([a, b, c]), 288 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 289 | 290 | Pid = self(), 291 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 292 | 293 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 294 | {badrpc, {'EXIT', {timeout, _}}} = rpc:call(C, locker, wait_for, [123, 100]), 295 | 296 | rpc:sbcast([A, B, C], locker, push_trans_log), 297 | {ok, Pid} = rpc:call(C, locker, wait_for, [123, 5000]), 298 | 299 | teardown([A, B, C]). 300 | 301 | wait_for_release(_) -> 302 | [A, B, C] = Cluster = setup([a, b, c]), 303 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 304 | 305 | LeaseLength = 500, 306 | Pid = Parent = self(), 307 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid, LeaseLength, 1000]), 308 | 309 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 310 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 311 | {error, key_not_locked} = 312 | rpc:call(C, locker, wait_for_release, [123, 100]), 313 | 314 | rpc:sbcast([A, B, C], locker, push_trans_log), 315 | timer:sleep(100), 316 | 317 | P1 = spawn(fun() -> 318 | Parent ! {self(), (catch rpc:call(B, locker, wait_for_release, [123, 1000]))} 319 | end), 320 | P2 = spawn(fun() -> 321 | Parent ! {self(), (catch rpc:call(C, locker, wait_for_release, [123, 1000]))} 322 | end), 323 | timer:sleep(LeaseLength), 324 | rpc:sbcast([A, B, C], locker, expire_leases), 325 | 326 | {ok, released} = receive {P1, M1} -> M1 end, 327 | {ok, released} = receive {P2, M2} -> M2 end, 328 | 329 | teardown([A, B, C]). 
330 | 331 | update(_) -> 332 | [A, B, C] = Cluster = setup([a, b, c]), 333 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 334 | 335 | Key = 123, 336 | Value0 = 41, 337 | Value1 = 42, 338 | LeaseLength = 50, 339 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [Key, Value0, LeaseLength]), 340 | {ok, 2, 2, 2} = rpc:call(B, locker, update, [Key, Value0, Value1]), 341 | 342 | rpc:sbcast([A, B, C], locker, push_trans_log), 343 | {ok, Value1} = rpc:call(C, locker, dirty_read, [Key]), 344 | {ok, Value1} = rpc:call(B, locker, dirty_read, [Key]), 345 | 346 | {error, no_quorum} = rpc:call(A, locker, update, [Key, Value0, 347 | random_value]), 348 | 349 | timer:sleep(LeaseLength), 350 | rpc:sbcast([A, B, C], locker, expire_leases), 351 | 352 | Res = lists:duplicate(3, {error, not_found}), 353 | {Res, []} = rpc:multicall(Cluster, locker, dirty_read, [Key]), 354 | 355 | teardown([A, B, C]). 356 | 357 | %% 358 | %% HELPERS 359 | %% 360 | 361 | 362 | setup(Name) when is_atom(Name) -> 363 | {ok, Node} = slave:start_link(list_to_atom(net_adm:localhost()), Name), 364 | 365 | true = rpc:call(Node, code, add_path, [?EBIN_DIR]), 366 | {ok, _} = rpc:call(Node, locker, start_link, [2]), 367 | 368 | {ok, _, _, R1, R2, R3} = rpc:call(Node, locker, get_debug_state, []), 369 | {ok, cancel} = rpc:call(Node, timer, cancel, [R1]), 370 | {ok, cancel} = rpc:call(Node, timer, cancel, [R2]), 371 | {ok, cancel} = rpc:call(Node, timer, cancel, [R3]), 372 | Node; 373 | 374 | setup(NodeNames) -> 375 | lists:map(fun setup/1, NodeNames). 376 | 377 | 378 | teardown(Nodes) -> 379 | lists:map(fun slave:stop/1, Nodes). 380 | 381 | state(N) -> 382 | rpc:call(N, locker, get_debug_state, []). 383 | -------------------------------------------------------------------------------- /src/locker.erl: -------------------------------------------------------------------------------- 1 | %% @doc Distributed consistent key-value store 2 | %% 3 | %% Reads use the local copy, all data is replicated to all nodes. 4 | %% 5 | %% Writing is done in two phases, in the first phase the key is 6 | %% locked, if a quorum can be made, the value is written. 7 | 8 | -module(locker). 9 | -behaviour(gen_server). 10 | -author('Knut Nesheim '). 11 | 12 | %% API 13 | -export([start_link/1, start_link/4]). 14 | -export([set_w/2, set_nodes/3]). 15 | 16 | -export([lock/2, lock/3, lock/4, update/3, update/4, 17 | extend_lease/3,release/2, release/3]). 18 | -export([wait_for/2, wait_for_release/1, wait_for_release/2]). 19 | -export([dirty_read/1, master_dirty_read/1]). 20 | -export([lag/0, summary/0]). 21 | 22 | 23 | -export([get_write_lock/4, do_write/6, release_write_lock/3]). 24 | -export([get_meta/0, get_meta_ets/1, get_debug_state/0]). 25 | 26 | -export([now_to_seconds/0]). 27 | 28 | %% gen_server callbacks 29 | -export([init/1, handle_call/3, handle_cast/2, handle_info/2, 30 | terminate/2, code_change/3]). 31 | 32 | -record(state, { 33 | %% The masters queue writes in the trans_log for batching to 34 | %% the replicas, triggered every N milliseconds by the 35 | %% push_replica timer 36 | trans_log = [], 37 | 38 | %% Clients can wait for a key to become locked 39 | waiters = [], 40 | 41 | %% Clients can wait for a lock to be released 42 | release_waiters = [], 43 | 44 | %% Previous point of expiration, no keys older than this 45 | %% point should exist 46 | prev_expire_point, 47 | 48 | %% Timer references 49 | lease_expire_ref, 50 | write_locks_expire_ref, 51 | push_trans_log_ref 52 | }). 53 | 54 | -define(LEASE_LENGTH, 2000). 
55 | -define(DB, locker_db). 56 | -define(LOCK_DB, locker_lock_db). 57 | -define(META_DB, locker_meta_db). 58 | -define(EXPIRE_DB, locker_expire_db). 59 | 60 | %%%=================================================================== 61 | %%% API 62 | %%%=================================================================== 63 | 64 | start_link(W) -> 65 | start_link(W, 1000, 1000, 100). 66 | 67 | start_link(W, LeaseExpireInterval, LockExpireInterval, PushTransInterval) -> 68 | Args = [W, LeaseExpireInterval, LockExpireInterval, PushTransInterval], 69 | gen_server:start_link({local, ?MODULE}, ?MODULE, Args, []). 70 | 71 | lock(Key, Value) -> 72 | lock(Key, Value, ?LEASE_LENGTH). 73 | 74 | lock(Key, Value, LeaseLength) -> 75 | lock(Key, Value, LeaseLength, 5000). 76 | 77 | %% @doc: Tries to acquire the lock. In case of unreachable nodes, the 78 | %% timeout is 1 second per node which might need tuning. Returns {ok, 79 | %% W, V, C} where W is the number of agreeing nodes required for a 80 | %% quorum, V is the number of nodes that voted in favor of this lock 81 | %% in the case of contention and C is the number of nodes who 82 | %% acknowledged commit of the lock successfully. 83 | lock(Key, Value, LeaseLength, Timeout) -> 84 | Nodes = get_meta_ets(nodes), 85 | W = get_meta_ets(w), 86 | 87 | %% Try getting the write lock on all nodes 88 | {Tag, RequestReplies, _BadNodes} = get_write_lock(Nodes, Key, not_found, Timeout), 89 | 90 | case ok_responses(RequestReplies) of 91 | {OkNodes, _} when length(OkNodes) >= W -> 92 | %% Majority of nodes gave us the lock, go ahead and do the 93 | %% write on all masters. The write also releases the 94 | %% lock. Replicas are synced asynchronously by the 95 | %% masters. 96 | {WriteReplies, _} = do_write(Nodes, 97 | Tag, Key, Value, 98 | LeaseLength, Timeout), 99 | {OkWrites, _} = ok_responses(WriteReplies), 100 | {ok, W, length(OkNodes), length(OkWrites)}; 101 | _ -> 102 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 103 | {error, no_quorum} 104 | end. 105 | 106 | update(Key, Value, NewValue) -> 107 | update(Key, Value, NewValue, 5000). 108 | 109 | %% @doc: Tries to update the lock. The update only happens if an existing 110 | %% value of the lock corresponds to the given Value within the W number of 111 | %% master nodes. 112 | %% Returns the same tuple as in lock/4 case. 113 | update(Key, Value, NewValue, Timeout) -> 114 | Nodes = get_meta_ets(nodes), 115 | W = get_meta_ets(w), 116 | 117 | %% Try getting the write lock on all nodes 118 | {Tag, RequestReplies, _BadNodes} = get_write_lock(Nodes, Key, Value, 119 | Timeout), 120 | 121 | case ok_responses(RequestReplies) of 122 | {OkNodes, _} when length(OkNodes) >= W -> 123 | {UpdateReplies, _} = do_update(Nodes, Tag, Key, NewValue, Timeout), 124 | {OkUpdates, _} = ok_responses(UpdateReplies), 125 | {ok, W, length(OkNodes), length(OkUpdates)}; 126 | _ -> 127 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 128 | {error, no_quorum} 129 | end. 130 | 131 | %% @doc: Waits for the key to become available on the local node. If a 132 | %% value is already available, returns immediately, otherwise it will 133 | %% return within the timeout. In case of timeout, the caller might get 134 | %% a reply anyway if it sent at the same time as the timeout. 135 | wait_for(Key, Timeout) -> 136 | case dirty_read(Key) of 137 | {ok, Value} -> 138 | {ok, Value}; 139 | {error, not_found} -> 140 | gen_server:call(locker, {wait_for, Key, Timeout}, Timeout) 141 | end. 
142 | 143 | wait_for_release(Key) -> 144 | wait_for_release(Key, 5000). 145 | 146 | wait_for_release(Key, Timeout) -> 147 | case dirty_read(Key) of 148 | {ok, _Value} -> 149 | gen_server:call(locker, {wait_for_release, Key, Timeout}, Timeout); 150 | {error, not_found} -> 151 | {error, key_not_locked} 152 | end. 153 | 154 | release(Key, Value) -> 155 | release(Key, Value, 5000). 156 | 157 | release(Key, Value, Timeout) -> 158 | Nodes = get_meta_ets(nodes), 159 | Replicas = get_meta_ets(replicas), 160 | W = get_meta_ets(w), 161 | 162 | %% Try getting the write lock on all nodes 163 | {Tag, WriteLockReplies, _} = get_write_lock(Nodes, Key, Value, Timeout), 164 | 165 | case ok_responses(WriteLockReplies) of 166 | {OkNodes, _} when length(OkNodes) >= W -> 167 | Request = {release, Key, Value, Tag}, 168 | {ReleaseReplies, _BadNodes} = 169 | gen_server:multi_call(Nodes ++ Replicas, locker, Request, Timeout), 170 | 171 | {OkWrites, _} = ok_responses(ReleaseReplies), 172 | 173 | {ok, W, length(OkNodes), length(OkWrites)}; 174 | _ -> 175 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 176 | {error, no_quorum} 177 | end. 178 | 179 | 180 | extend_lease(Key, Value, LeaseLength) -> 181 | extend_lease(Key, Value, LeaseLength, 5000). 182 | 183 | %% @doc: Extends the lease for the lock on all nodes that are up. What 184 | %% really happens is that the expiration is scheduled for (now + lease 185 | %% time), to allow for nodes that just joined to set the correct 186 | %% expiration time without knowing the start time of the lease. 187 | extend_lease(Key, Value, LeaseLength, Timeout) -> 188 | Nodes = get_meta_ets(nodes), 189 | W = get_meta_ets(w), 190 | 191 | {Tag, WriteLockReplies, _} = get_write_lock(Nodes, Key, Value, Timeout), 192 | 193 | case ok_responses(WriteLockReplies) of 194 | {N, _E} when length(N) >= W -> 195 | 196 | Request = {extend_lease, Tag, Key, Value, LeaseLength}, 197 | {Replies, _} = gen_server:multi_call(Nodes, locker, Request, Timeout), 198 | {_, FailedExtended} = ok_responses(Replies), 199 | release_write_lock(FailedExtended, Tag, Timeout), 200 | ok; 201 | _ -> 202 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 203 | {error, no_quorum} 204 | end. 205 | 206 | %% @doc: A dirty read does not create a read-quorum so consistency is 207 | %% not guaranteed. The value is read directly from a local ETS-table, 208 | %% so the performance should be very high. 209 | dirty_read(Key) -> 210 | case ets:lookup(?DB, Key) of 211 | [{Key, Value, _Lease}] -> 212 | {ok, Value}; 213 | [] -> 214 | {error, not_found} 215 | end. 216 | 217 | %% @doc: Execute a dirty read on the master. Same caveats as for 218 | %% dirty_read/1 219 | master_dirty_read(Key) -> 220 | Masters = get_meta_ets(nodes), 221 | case lists:member(node(), Masters) of 222 | true -> 223 | dirty_read(Key); 224 | false -> 225 | Master = lists:nth(random:uniform(length(Masters)), Masters), 226 | rpc:call(Master, locker, dirty_read, [Key]) 227 | end. 228 | 229 | %% 230 | %% Helpers for operators 231 | %% 232 | 233 | lag() -> 234 | Key = {'__lock_lag_probe', os:timestamp()}, 235 | {Time, Result} = timer:tc(fun() -> 236 | lock(Key, foo, 2000) 237 | end), 238 | release(Key, foo), 239 | {Time / 1000, Result}. 240 | 241 | summary() -> 242 | [{write_locks, ets:info(?LOCK_DB, size)}, 243 | {leases, ets:info(?DB, size)}]. 244 | 245 | get_meta() -> 246 | {get_meta_ets(nodes), get_meta_ets(replicas), get_meta_ets(w)}. 
247 | 248 | %% 249 | %% Helpers 250 | %% 251 | 252 | get_write_lock(Nodes, Key, Value, Timeout) -> 253 | Tag = make_ref(), 254 | Request = {get_write_lock, Key, Value, Tag}, 255 | {Replies, Down} = gen_server:multi_call(Nodes, locker, Request, Timeout), 256 | {Tag, Replies, Down}. 257 | 258 | do_write(Nodes, Tag, Key, Value, LeaseLength, Timeout) -> 259 | gen_server:multi_call(Nodes, locker, 260 | {write, Tag, Key, Value, LeaseLength}, 261 | Timeout). 262 | 263 | do_update(Nodes, Tag, Key, Value, Timeout) -> 264 | gen_server:multi_call(Nodes, locker, 265 | {update, Tag, Key, Value}, 266 | Timeout). 267 | 268 | release_write_lock(Nodes, Tag, Timeout) -> 269 | gen_server:multi_call(Nodes, locker, {release_write_lock, Tag}, Timeout). 270 | 271 | get_meta_ets(Key) -> 272 | case ets:lookup(?META_DB, Key) of 273 | [] -> 274 | throw({locker, no_such_meta_key}); 275 | [{Key, Value}] -> 276 | Value 277 | end. 278 | 279 | %% @doc: Replaces the primary and replica node list on all nodes in 280 | %% the cluster. Assumes no failures. 281 | set_nodes(Cluster, Primaries, Replicas) -> 282 | {_Replies, []} = gen_server:multi_call(Cluster, locker, 283 | {set_nodes, Primaries, Replicas}), 284 | ok. 285 | 286 | set_w(Cluster, W) when is_integer(W) -> 287 | {_Replies, []} = gen_server:multi_call(Cluster, locker, {set_w, W}), 288 | ok. 289 | 290 | get_debug_state() -> 291 | gen_server:call(?MODULE, get_debug_state). 292 | 293 | %%%=================================================================== 294 | %%% gen_server callbacks 295 | %%%=================================================================== 296 | init([W, LeaseExpireInterval, LockExpireInterval, PushTransInterval]) -> 297 | ?DB = ets:new(?DB, [named_table, protected, set, 298 | {read_concurrency, true}, 299 | {write_concurrency, true}]), 300 | 301 | ?LOCK_DB = ets:new(?LOCK_DB, [named_table, protected, set]), 302 | ?EXPIRE_DB = ets:new(?EXPIRE_DB, [named_table, protected, bag]), 303 | 304 | 305 | ?META_DB = ets:new(?META_DB, [named_table, protected, set, 306 | {read_concurrency, true}]), 307 | ets:insert(?META_DB, {w, W}), 308 | ets:insert(?META_DB, {nodes, []}), 309 | ets:insert(?META_DB, {replicas, []}), 310 | 311 | 312 | {ok, LeaseExpireRef} = timer:send_interval(LeaseExpireInterval, expire_leases), 313 | {ok, WriteLocksExpireRef} = timer:send_interval(LockExpireInterval, expire_locks), 314 | {ok, PushTransLog} = timer:send_interval(PushTransInterval, push_trans_log), 315 | {ok, #state{lease_expire_ref = LeaseExpireRef, 316 | write_locks_expire_ref = WriteLocksExpireRef, 317 | push_trans_log_ref = PushTransLog, 318 | prev_expire_point = now_to_seconds()}}. 319 | 320 | %% 321 | %% WRITE-LOCKS 322 | %% 323 | 324 | handle_call({get_write_lock, Key, Value, Tag}, _From, State) -> 325 | %% Phase 1: Grant a write lock on the key if the value in the 326 | %% database is what the coordinator expects. If the atom 327 | %% 'not_found' is given as the expected value, the lock is granted 328 | %% if the key does not exist. 329 | %% 330 | %% Only one lock per key is allowed. Timeouts are triggered when 331 | %% expiring leases. 
332 | 333 | case is_locked(Key) of 334 | true -> 335 | %% Key already has a write lock 336 | {reply, {error, already_locked}, State}; 337 | false -> 338 | case ets:lookup(?DB, Key) of 339 | [{Key, DbValue, _Expire}] when DbValue =:= Value -> 340 | set_lock(Tag, Key), 341 | {reply, ok, State}; 342 | [] when Value =:= not_found-> 343 | set_lock(Tag, Key), 344 | {reply, ok, State}; 345 | _Other -> 346 | {reply, {error, not_expected_value}, State} 347 | end 348 | end; 349 | 350 | handle_call({release_write_lock, Tag}, _From, State) -> 351 | del_lock(Tag), 352 | {reply, ok, State}; 353 | 354 | %% 355 | %% DATABASE OPERATIONS 356 | %% 357 | 358 | handle_call({write, LockTag, Key, Value, LeaseLength}, _From, 359 | #state{trans_log = TransLog} = State) -> 360 | %% Database write. LockTag might be a valid write-lock, in which 361 | %% case it is deleted to avoid the extra round-trip of explicit 362 | %% delete. If it is not valid, we assume the coordinator had a 363 | %% quorum before writing. 364 | del_lock(LockTag), 365 | ExpireAt = expire_at(LeaseLength), 366 | ets:insert(?DB, {Key, Value, ExpireAt}), 367 | schedule_expire(ExpireAt, Key), 368 | 369 | NewTransLog = [{write, Key, Value, LeaseLength} | TransLog], 370 | {reply, ok, State#state{trans_log = NewTransLog}}; 371 | 372 | handle_call({update, LockTag, Key, Value}, _From, 373 | #state{trans_log = TransLog} = State) -> 374 | del_lock(LockTag), 375 | 376 | case ets:lookup(?DB, Key) of 377 | [{Key, _Value, ExpireAt}] -> 378 | %% Update the lock 379 | ets:insert(?DB, {Key, Value, ExpireAt}); 380 | [] -> 381 | %% Lock not found (most likely it has expired after acquiring write 382 | %% lock) 383 | ok 384 | end, 385 | 386 | NewTransLog = [{update, Key, Value} | TransLog], 387 | {reply, ok, State#state{trans_log = NewTransLog}}; 388 | 389 | %% 390 | %% LEASES 391 | %% 392 | 393 | handle_call({extend_lease, LockTag, Key, Value, ExtendLength}, _From, 394 | #state{trans_log = TransLog} = State) -> 395 | %% Extending a lease sets a new expire time. As the coordinator 396 | %% holds a write lock on the key, it validation has already been 397 | %% done 398 | 399 | del_lock(LockTag), 400 | delete_expire(expires(Key), Key), 401 | 402 | ExpireAt = expire_at(ExtendLength), 403 | ets:insert(?DB, {Key, Value, ExpireAt}), 404 | schedule_expire(ExpireAt, Key), 405 | 406 | NewTransLog = [{extend_lease, Key, Value, ExtendLength} | TransLog], 407 | {reply, ok, State#state{trans_log = NewTransLog}}; 408 | 409 | 410 | handle_call({release, Key, Value, LockTag}, _From, 411 | #state{trans_log = TransLog} = State) -> 412 | {Reply, NewState} = 413 | case ets:lookup(?DB, Key) of 414 | [{Key, Value, ExpireAt}] -> 415 | del_lock(LockTag), 416 | ets:delete(?DB, Key), 417 | delete_expire(ExpireAt, Key), 418 | 419 | NewTransLog = [{release, Key} | TransLog], 420 | {ok, State#state{trans_log = NewTransLog}}; 421 | 422 | [{Key, _OtherValue, _}] -> 423 | {{error, not_owner}, State}; 424 | [] -> 425 | {{error, not_found}, State} 426 | end, 427 | NewWaiters = notify_release_waiter(Key, released, NewState#state.release_waiters), 428 | {reply, Reply, NewState#state{release_waiters = NewWaiters}}; 429 | 430 | %% 431 | %% WAIT-FOR 432 | %% 433 | 434 | handle_call({wait_for, Key, Timeout}, From, #state{waiters = Waiters} = State) -> 435 | %% 'From' waits for the given key to become available, using 436 | %% gen_server:call/3. We will reply when replaying the transaction 437 | %% log. If we do not have a response within the given timeout, the 438 | %% reply is discarded. 
439 | 440 | %% Possible race: wait_for/2 reads from ETS, finds nothing, sends 441 | %% this message. Before this message is processed, we have 442 | %% processed the transaction log, the waiter will time out. Fix: 443 | %% read again here? 444 | {noreply, State#state{waiters = [{Key, From, now_to_ms() + Timeout} | Waiters]}}; 445 | 446 | handle_call({wait_for_release, Key, Timeout}, From, 447 | #state{release_waiters = Waiters} = State) -> 448 | %% 'From' waits for the given key lock to become released, using 449 | %% gen_server:call/3. We will reply when replaying the transaction 450 | %% log. If we do not have a response within the given timeout, the 451 | %% reply is discarded. 452 | {noreply, State#state{release_waiters = [{Key, From, now_to_ms() + Timeout} | Waiters]}}; 453 | 454 | %% 455 | %% ADMINISTRATION 456 | %% 457 | 458 | handle_call({set_w, W}, _From, State) -> 459 | ets:insert(?META_DB, {w, W}), 460 | {reply, ok, State}; 461 | 462 | handle_call({set_nodes, Primaries, Replicas}, _From, State) -> 463 | ets:insert(?META_DB, {nodes, ordsets:to_list( 464 | ordsets:from_list(Primaries))}), 465 | ets:insert(?META_DB, {replicas, ordsets:to_list( 466 | ordsets:from_list(Replicas))}), 467 | {reply, ok, State}; 468 | 469 | handle_call(get_debug_state, _From, State) -> 470 | {reply, {ok, ets:tab2list(?LOCK_DB), 471 | ets:tab2list(?DB), 472 | State#state.lease_expire_ref, 473 | State#state.write_locks_expire_ref, 474 | State#state.push_trans_log_ref}, State}. 475 | 476 | %% 477 | %% REPLICATION 478 | %% 479 | 480 | handle_cast({trans_log, _FromNode, TransLog}, State0) -> 481 | %% Replay transaction log. 482 | 483 | %% In the future, we might want to offset the lease length in the 484 | %% master before writing it to the log to ensure the lease length 485 | %% is at least reasonably similar for all replicas. 486 | Now = now_to_ms(), 487 | ReplayF = 488 | fun ({write, Key, Value, LeaseLength}, State) -> 489 | %% With multiple masters, we will get multiple writes 490 | %% for the same key. The last write will win for the 491 | %% lease db, but make sure we only have one entry in the 492 | %% expire table. 493 | delete_expire(expires(Key), Key), 494 | 495 | ExpireAt = expire_at(LeaseLength), 496 | ets:insert(?DB, {Key, Value, ExpireAt}), 497 | schedule_expire(ExpireAt, Key), 498 | 499 | NewWaiters = notify_lock_waiter(Now, Key, Value, 500 | State#state.waiters), 501 | State#state{waiters = NewWaiters}; 502 | 503 | ({extend_lease, Key, Value, ExtendLength}, State) -> 504 | delete_expire(expires(Key), Key), 505 | 506 | ExpireAt = expire_at(ExtendLength), 507 | ets:insert(?DB, {Key, Value, ExpireAt}), 508 | schedule_expire(ExpireAt, Key), 509 | 510 | State; 511 | 512 | ({release, Key}, State) -> 513 | %% Due to replication lag, the key might already have 514 | %% been expired in which case we simply do nothing 515 | case ets:lookup(?DB, Key) of 516 | [{Key, _Value, ExpireAt}] -> 517 | delete_expire(ExpireAt, Key), 518 | ets:delete(?DB, Key); 519 | [] -> 520 | ok 521 | end, 522 | State; 523 | 524 | ({update, Key, Value}, State) -> 525 | delete_expire(expires(Key), Key), 526 | 527 | case ets:lookup(?DB, Key) of 528 | [{Key, _Value, ExpireAt}] -> 529 | ets:insert(?DB, {Key, Value, ExpireAt}), 530 | %% If removal of expired locks and updates were handled 531 | %% by multiple processes, i.e. in non-sequential order, 532 | %% then it would be possible to end up in a situation, 533 | %% in which expired lock has been re-inserted. 
Calling 534 | %% scedule_expire/2 after updating the lock prevents 535 | %% from that. 536 | schedule_expire(ExpireAt, Key); 537 | [] -> 538 | %% Lock has been expired 539 | ok 540 | end, 541 | 542 | State 543 | 544 | end, 545 | 546 | NewState = lists:foldl(ReplayF, State0, TransLog), 547 | 548 | {noreply, NewState}; 549 | 550 | handle_cast(Msg, State) -> 551 | {stop, {badmsg, Msg}, State}. 552 | 553 | %% 554 | %% SYSTEM EVENTS 555 | %% 556 | 557 | handle_info(expire_leases, State) -> 558 | %% Delete any leases that has expired. There might be writes in 559 | %% flight, but they have already been validated in the locking 560 | %% phase and will be written regardless of what is in the db. 561 | 562 | Now = now_to_seconds(), 563 | Expired = lists:flatmap(fun (T) -> ets:lookup(?EXPIRE_DB, T) end, 564 | lists:seq(State#state.prev_expire_point, Now)), 565 | 566 | ReleaseLockAndNotifyWaiters = 567 | fun ({At, Key}, RemainingWaiters) -> 568 | delete_expire(At, Key), 569 | case ets:lookup(?DB, Key) of 570 | [{Key, _Value, ExpAt}] when ExpAt =:= At -> 571 | ets:delete(?DB, Key), 572 | notify_release_waiter(Key, released, RemainingWaiters); 573 | _Other -> 574 | %% locker_expire_db is out of sync with locker_db 575 | %% resulting in one correct and one or more incorrect 576 | %% locker_exipre_db entries. 577 | %% This exipre entry is incorrect. 578 | RemainingWaiters 579 | end 580 | end, 581 | NewWaiters = lists:foldl(ReleaseLockAndNotifyWaiters, 582 | State#state.release_waiters, Expired), 583 | {noreply, State#state{prev_expire_point = Now, 584 | release_waiters = NewWaiters}}; 585 | 586 | handle_info(expire_locks, State) -> 587 | %% Make a table scan of the write locks. There should be very few 588 | %% (<1000) writes in progress at any time, so a full scan is 589 | %% ok. Optimize like the leases if needed. 590 | Now = now_to_seconds(), 591 | ets:select_delete(?LOCK_DB, 592 | [{ {'_', '_', '$1'}, [{'<', '$1', Now}], [true] }]), 593 | 594 | {noreply, State}; 595 | 596 | handle_info(push_trans_log, #state{trans_log = TransLog} = State) -> 597 | %% Push transaction log to *all* replicas. With multiple masters, 598 | %% each replica will receive the same write multiple times. 599 | Msg = {trans_log, node(), lists:reverse(TransLog)}, 600 | gen_server:abcast(get_meta_ets(replicas), locker, Msg), 601 | {noreply, State#state{trans_log = []}}; 602 | 603 | handle_info(_Info, State) -> 604 | {noreply, State}. 605 | 606 | terminate(_Reason, _State) -> 607 | ok. 608 | 609 | code_change(_OldVsn, State, _Extra) -> 610 | {ok, State}. 611 | 612 | %%%=================================================================== 613 | %%% Internal functions 614 | %%%=================================================================== 615 | 616 | %% Notify waiter on a lock that the lock has been taken. 617 | notify_lock_waiter(Now, Key, Value, AllWaiters) -> 618 | KeyWaiter = fun ({K, _, _}) when Key =:= K -> true; 619 | (_) -> false 620 | end, 621 | ReplyIfNotExpired = 622 | fun ({_, From, Expire}) when Expire > Now -> 623 | gen_server:reply(From, {ok, Value}); 624 | (_) -> 625 | ok 626 | end, 627 | {KeyWaiters, OtherWaiters} = lists:partition(KeyWaiter, AllWaiters), 628 | lists:foreach(ReplyIfNotExpired, KeyWaiters), 629 | OtherWaiters. 630 | 631 | %% Notify waiter of a release of a lock, even if it is expired. 
632 | notify_release_waiter(Key, Value, AllWaiters) -> 633 | KeyWaiter = fun ({K, _, _}) when Key =:= K -> true; 634 | (_) -> false 635 | end, 636 | Reply = fun ({_, From, _Expire}) -> gen_server:reply(From, {ok, Value}) end, 637 | {KeyWaiters, OtherWaiters} = lists:partition(KeyWaiter, AllWaiters), 638 | lists:foreach(Reply, KeyWaiters), 639 | OtherWaiters. 640 | 641 | now_to_seconds() -> 642 | now_to_seconds(os:timestamp()). 643 | 644 | now_to_seconds(Now) -> 645 | {MegaSeconds, Seconds, _} = Now, 646 | MegaSeconds * 1000000 + Seconds. 647 | 648 | now_to_ms() -> 649 | now_to_ms(os:timestamp()). 650 | 651 | now_to_ms({MegaSecs,Secs,MicroSecs}) -> 652 | (MegaSecs * 1000000 + Secs) * 1000 + MicroSecs div 1000. 653 | 654 | ok_responses(Replies) -> 655 | lists:partition(fun ({_, ok}) -> true; 656 | (_) -> false 657 | end, Replies). 658 | 659 | %% 660 | %% EXPIRATION 661 | %% 662 | 663 | schedule_expire(At, Key) -> 664 | true = ets:insert(?EXPIRE_DB, {At, Key}), 665 | ok. 666 | 667 | delete_expire(At, Key) -> 668 | ets:delete_object(?EXPIRE_DB, {At, Key}), 669 | ok. 670 | 671 | expire_at(Length) -> 672 | trunc(now_to_seconds() + (Length/1000)). 673 | 674 | expires(Key) -> 675 | case ets:lookup(?DB, Key) of 676 | [{Key, _Value, ExpireAt}] -> 677 | ExpireAt; 678 | [] -> 679 | [] 680 | end. 681 | 682 | %% 683 | %% WRITE-LOCKS 684 | %% 685 | 686 | is_locked(Key) -> 687 | ets:match(?LOCK_DB, {Key, '_', '_'}) =/= []. 688 | 689 | set_lock(Tag, Key) -> 690 | ets:insert_new(?LOCK_DB, {Key, Tag, now_to_seconds() + 10}). 691 | 692 | del_lock(Tag) -> 693 | ets:match_delete(?LOCK_DB, {'_', Tag, '_'}). 694 | --------------------------------------------------------------------------------