├── rebar ├── .gitignore ├── src ├── locker.app.src └── locker.erl ├── rebar.config ├── priv └── basho_bench_locker.config ├── LICENSE ├── test ├── locker_benchmark.erl ├── basho_bench_driver_locker.erl ├── locker_proper.erl └── locker_SUITE.erl └── README.md /rebar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wooga/locker/HEAD/rebar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | .eunit 3 | ebin 4 | deps 5 | *.beam 6 | *~ 7 | .DS_Store 8 | .rebar 9 | 10 | -------------------------------------------------------------------------------- /src/locker.app.src: -------------------------------------------------------------------------------- 1 | {application, locker, 2 | [ 3 | {description, ""}, 4 | {vsn, "6"}, 5 | {registered, []}, 6 | {applications, [ 7 | kernel, 8 | stdlib 9 | ]}, 10 | {env, []} 11 | ]}. 12 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {lib_dirs, ["deps"]}. 2 | {erl_opts, [debug_info]}. 3 | {clean_files, ["ebin/*.beam"]}. 4 | 5 | {xref_checks, [exports_not_used, undefined_function_calls]}. 6 | {deps, [ 7 | {proper, "", {git,"git://github.com/manopapad/proper.git"}} 8 | ]}. 9 | 10 | {ct_use_short_names, true}. 11 | -------------------------------------------------------------------------------- /priv/basho_bench_locker.config: -------------------------------------------------------------------------------- 1 | {mode, max}. 2 | %{mode, {rate, 100}}. 3 | 4 | {duration, 15}. 5 | 6 | {concurrent, 8}. 7 | 8 | {driver, basho_bench_driver_locker}. 9 | 10 | {code_paths, ["../locker/ebin"]}. 11 | 12 | {operations, [{get, 9}, {set,1}]}. 13 | 14 | {key_generator, {partitioned_sequential_int, 100000000}}. 15 | 16 | {value_generator, {fixed_bin, 1}}. 17 | 18 | {masters, [{'localhost', 'a'}, {'localhost', 'b'}, {'localhost', 'c'}]}. 19 | {w, 2}. 20 | {replicas, [{'localhost', 'r1'}, {'localhost', 'r2'}, {'localhost', 'r3'}]}. 21 | {start_nodes, true}. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2011 wooga GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /test/locker_benchmark.erl: -------------------------------------------------------------------------------- 1 | -module(locker_benchmark). 2 | -compile([export_all]). 3 | 4 | 5 | 6 | start() -> 7 | StatsConfig = [{docroot, filename:absname( 8 | filename:join(code:priv_dir(statman), 9 | "docroot"))}], 10 | elli:start_link([{callback, statman_elli}, {callback_args, StatsConfig}]), 11 | 12 | locker:start_link(1), 13 | locker:set_nodes([node()], [node()], []), 14 | 15 | statman_server:start_link(1000), 16 | statman_merger:start_link(), 17 | 18 | statman_elli_server:start_link(), 19 | statman_server:add_subscriber(statman_merger), 20 | statman_merger:add_subscriber(statman_elli_server). 21 | 22 | 23 | run(Start, End, ExtendInterval, LeaseLength) -> 24 | [begin 25 | timer:sleep(100), 26 | spawn(?MODULE, session, [N, ExtendInterval, LeaseLength]) 27 | end || N <- lists:seq(Start, End)]. 28 | 29 | 30 | 31 | session(Id, ExtendInterval, LeaseLength) -> 32 | Start = erlang:monotonic_time(micro_seconds), 33 | case locker:lock(Id, self(), LeaseLength) of 34 | {ok, _, _, _} -> 35 | statman_histogram:record_value(get_lease, (erlang:monotonic_time(micro_seconds) - Start)), 36 | ?MODULE:session_loop(Id, ExtendInterval, LeaseLength); 37 | {error, _} -> 38 | error_logger:info_msg("~p: could not get lock~n", [Id]) 39 | end. 40 | 41 | session_loop(Id, ExtendInterval, LeaseLength) -> 42 | erlang:send_after(ExtendInterval, self(), extend), 43 | 44 | receive 45 | extend -> 46 | Start = erlang:monotonic_time(micro_seconds), 47 | case locker:extend_lease(Id, self(), LeaseLength) of 48 | ok -> 49 | statman_histogram:record_value(extend_lease, (erlang:monotonic_time(micro_seconds) - Start)), 50 | ?MODULE:session_loop(Id, ExtendInterval, LeaseLength); 51 | {error, _} -> 52 | error_logger:info_msg("~p: could not extend lease~n", [Id]) 53 | end 54 | end. 55 | -------------------------------------------------------------------------------- /test/basho_bench_driver_locker.erl: -------------------------------------------------------------------------------- 1 | -module(basho_bench_driver_locker). 2 | 3 | -export([new/1, 4 | run/4]). 
5 | 6 | new(_Id) -> 7 |     case mark_setup_completed() of 8 |         true -> 9 |             error_logger:info_msg("setting up cluster~n"), 10 |             net_kernel:start([master, shortnames]), 11 |             {ok, _LocalLocker} = locker:start_link(2), 12 |             MasterNames = basho_bench_config:get(masters), 13 |             ReplicaNames = basho_bench_config:get(replicas), 14 |             W = basho_bench_config:get(w), 15 | 16 |             case basho_bench_config:get(start_nodes) of 17 |                 true -> 18 |                     Masters = setup(MasterNames, W), 19 |                     Replicas = setup(ReplicaNames, W), 20 | 21 |                     ok = locker:set_nodes(Masters ++ Replicas, Masters, Replicas), 22 |                     error_logger:info_msg("~p~n", 23 |                                           [rpc:call(hd(Replicas), locker, get_meta, [])]), 24 |                     {ok, {Masters, Replicas}}; 25 |                 false -> 26 |                     {ok, []} 27 |             end; 28 |         false -> 29 |             %%timer:sleep(30000), 30 |             Masters = [list_to_atom(atom_to_list(N) ++ "@" ++ atom_to_list(H)) 31 |                        || {H, N} <- basho_bench_config:get(masters)], 32 | 33 |             Replicas = [list_to_atom(atom_to_list(N) ++ "@" ++ atom_to_list(H)) 34 |                         || {H, N} <- basho_bench_config:get(replicas)], 35 | 36 |             {ok, {Masters, Replicas}} 37 |     end. 38 | 39 | setup(NodeNames, W) -> 40 |     Nodes = [begin 41 |                  element(2, slave:start_link(Hostname, N)) 42 |              end 43 |              || {Hostname, N} <- NodeNames], 44 | 45 |     [rpc:call(N, code, add_path, ["/home/knutin/git/locker/ebin"]) || N <- Nodes], 46 |     [rpc:call(N, locker, start_link, [W]) || N <- Nodes], 47 | 48 |     Nodes. 49 | 50 | 51 | mark_setup_completed() -> 52 |     case whereis(locker_setup) of 53 |         undefined -> 54 |             true = register(locker_setup, self()), 55 |             true; 56 |         _ -> 57 |             false 58 |     end. 59 | 60 | 61 | 62 | run(set, KeyGen, _ValueGen, {[M | Masters], Replicas}) -> 63 |     NewMasters = lists:reverse([M | lists:reverse(Masters)]), 64 | 65 |     Key = KeyGen(), 66 |     case rpc:call(M, locker, lock, [Key, Key]) of 67 |         {ok, _, _, _} -> 68 |             {ok, {NewMasters, Replicas}}; 69 |         {error, Error} -> 70 |             error_logger:info_msg("Key: ~p, ~p~n", [Key, Error]), 71 |             {error, Error, {NewMasters, Replicas}} 72 |     end; 73 | 74 | run(get, KeyGen, _, {[M | Masters], Replicas}) -> 75 |     NewMasters = lists:reverse([M | lists:reverse(Masters)]), 76 | 77 |     Key = KeyGen(), 78 |     case locker:dirty_read(Key) of 79 |         {ok, Key} -> 80 |             {ok, {NewMasters, Replicas}}; 81 |         {ok, _OtherValue} -> 82 |             {error, wrong_value, {NewMasters, Replicas}}; 83 |         {error, not_found} -> 84 |             {ok, {NewMasters, Replicas}} 85 |     end. 86 | 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## locker - atomic distributed "check and set" for short-lived keys 2 | 3 | `locker` is a distributed, decentralized, consistent in-memory 4 | key-value store written in Erlang. An entry expires after a certain 5 | amount of time, unless the lease is extended. This makes it a good 6 | practical option for locks, mutexes and leader election in a 7 | distributed system. 8 | 9 | In terms of the CAP theorem, `locker` chooses consistency by requiring 10 | a quorum for every write. For reads, `locker` chooses availability and 11 | always does a local read which can be inconsistent. Extensions of the 12 | lease are used as an anti-entropy mechanism to eventually propagate all 13 | leases. 14 | 15 | It is designed to be used inside your application on the Erlang VM, 16 | using the Erlang distribution to communicate with masters and 17 | replicas.
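A minimal single-node sketch of the API (one master, no replicas, `w = 1`); the key, value and lease length below are arbitrary examples, and a real deployment would pass several master and replica nodes to `locker:set_nodes/3`:

```erlang
%% Start locker on the local node with a write quorum of 1 and make
%% this node the only master, with no replicas.
{ok, _Pid} = locker:start_link(1),
ok = locker:set_nodes([node()], [node()], []),

%% Lock a key for 5000 ms, read it back, extend the lease and release it.
{ok, _W, _Votes, _Commits} = locker:lock(my_key, self(), 5000),
{ok, Value} = locker:dirty_read(my_key),
ok = locker:extend_lease(my_key, Value, 5000),
{ok, _, _, _} = locker:release(my_key, Value).
```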
18 | 19 | Operations: 20 | 21 | * `locker:lock/2,3,4` 22 | * `locker:update/3,4` 23 | * `locker:extend_lease/3` 24 | * `locker:release/2,3` 25 | * `locker:wait_for/2` 26 | * `locker:wait_for_release/1,2` 27 | 28 | 29 | ### Writes 30 | 31 | To achieve "atomic" updates, the write is done in two phases, voting and 32 | committing. 33 | 34 | In the voting phase, the client asks every master node for a promise 35 | that the node can later set the key. The promise is only granted if 36 | the current value is what the client expects. The promise will block 37 | any other clients from also receiving a promise for that key. 38 | 39 | If the majority of the master nodes gives the client the promise 40 | (quorum), the client can go ahead and commit the lock. If a positive 41 | majority was not reached, the client will abort and delete any 42 | promises it received. 43 | 44 | ### Reads 45 | 46 | `locker` currently only offers dirty reads from the local node. If we 47 | need consistent reads, a read quorum can be used. 48 | 49 | ### Failure 50 | 51 | "So, this is all fine and good, but what happens when something 52 | fails?". To make the implementation simple, there is a timeout on 53 | every promise and every lock. If a promise is not converted into a 54 | lock in time, it is simply deleted. 55 | 56 | If the user process fails to extend the lease of its lock, the lock 57 | expires without consulting any other node. If a node is partitioned 58 | away from the rest of the cluster, the lock might expire too soon 59 | resulting in reads returning the empty value. However, a new lock 60 | cannot be created as a quorum cannot be reached. 61 | 62 | Calling `locker:wait_for_release/2` will block until a lock expires, 63 | either by manual release or from an expired lease. 64 | 65 | ### Lease expiration 66 | 67 | Synchronized clocks are not required for correct expiration of a 68 | lease. It is only required that the clocks progress at roughly the 69 | same speed. When a lock is created or extended, the node will set the 70 | expiration to `now() + lease_length`, which means that the user needs 71 | to account for the skew when extending the lease. With leases in the 72 | order of minutes, the skew should be very small. 73 | 74 | When a lease is extended, it is replicated to the other nodes in the 75 | cluster, which will update their local copy if they don't already have 76 | the key. This is used to bring new nodes in sync. 77 | 78 | ### Replication 79 | 80 | A `locker` cluster consists of masters and replicas. The masters 81 | participate in the quorum and accept writes from the clients. The 82 | masters implement strong consistency. Periodically the masters send 83 | off their transaction log to the replicas where it is replayed to 84 | create the same state. Replication is thus asynchronous and reads on 85 | the replicas might be inconsistent. Replication is done in batches to 86 | improve performance by reducing the number of messages each replica 87 | needs to handle. Calling `locker:wait_for/2` after a successful write 88 | will block until the key is replicated to the local node. If the local 89 | node is a master, it will return immediately. 90 | 91 | ### Adding new nodes 92 | 93 | New nodes may first be added as replicas to sync up before being 94 | promoted to master. Every operation happening after the replica 95 | joined will also be propagated to the replica. The time to catch up 96 | is then determined by how long it takes for all leases to be extended.
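As a sketch, assuming an existing cluster with masters `A` and `B` and a freshly started node `C` that should eventually become a master (the node variables are placeholders for node names):

```erlang
%% Let C catch up as a replica first; A and B keep forming the quorum.
ok = locker:set_nodes([A, B, C], [A, B], [C]),

%% Once C has seen all leases extended at least once, promote it to master.
ok = locker:set_nodes([A, B, C], [A, B, C], []).
```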
97 | 98 | New nodes might also be set directly as masters, in which case the new 99 | node might give negative votes in the quorum. As long as a quorum can 100 | be reached, the out-of-sync master will still accept writes and catch 101 | up as fast as a replica. 102 | 103 | Using `locker:set_nodes/3` masters and replicas can be set across the 104 | entire cluster in a "send-and-pray" operation. If something happens 105 | during this operation, the locker cluster might be in an inconsistent 106 | state. 107 | -------------------------------------------------------------------------------- /test/locker_proper.erl: -------------------------------------------------------------------------------- 1 | -module(locker_proper). 2 | -compile([export_all]). 3 | 4 | -include_lib("proper/include/proper.hrl"). 5 | 6 | -record(state, {master_leases, replicated_leases}). 7 | 8 | -define(MASTERS, [host_name("a")]). 9 | -define(REPLICAS, [host_name("b")]). 10 | 11 | test() -> 12 | proper:quickcheck(prop_lock_release()). 13 | 14 | prop_lock_release() -> 15 | ?FORALL(Commands, parallel_commands(?MODULE), 16 | ?TRAPEXIT( 17 | begin 18 | [A, B] = Cluster = setup([a, b]), 19 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A], [B]]), 20 | {Seq, P, Result} = run_parallel_commands(?MODULE, Commands), 21 | teardown(Cluster), 22 | ?WHENFAIL( 23 | io:format("Sequential: ~p\nParallel: ~p\nRes: ~p\n", 24 | [Seq, P, Result]), 25 | Result =:= ok) 26 | end)). 27 | 28 | key() -> 29 | elements([1]). 30 | 31 | value() -> 32 | elements([foo, bar]). 33 | 34 | get_master() -> 35 | elements(?MASTERS). 36 | 37 | get_replica() -> 38 | elements(?REPLICAS). 39 | 40 | get_node() -> 41 | elements(?MASTERS ++ ?REPLICAS). 42 | 43 | is_master(N) -> 44 | lists:member(N, ?MASTERS). 45 | 46 | is_replica(N) -> 47 | lists:member(N, ?REPLICAS). 48 | 49 | command(S) -> 50 | Leases = S#state.master_leases =/= [], 51 | oneof([{call, ?MODULE, lock, [get_node(), key(), value()]}] ++ 52 | [{call, ?MODULE, read, [get_node(), key()]}] ++ 53 | [?LET({Key, Value}, elements(S#state.master_leases), 54 | {call, ?MODULE, release, 55 | [get_node(), Key, Value]}) || Leases] ++ 56 | [{call, ?MODULE, update, [get_node(), key(), value(), value()]} 57 | || Leases] ++ 58 | [{call, ?MODULE, replicate, []}] 59 | ). 60 | 61 | lock(Node, Key, Value) -> 62 | rpc:call(Node, locker, lock, [Key, Value]). 63 | 64 | release(Node, Key, Value) -> 65 | rpc:call(Node, locker, release, [Key, Value]). 66 | 67 | update(Node, Key, Value, NewValue) -> 68 | rpc:call(Node, locker, update, [Key, Value, NewValue]). 69 | 70 | replicate() -> 71 | rpc:sbcast(?MASTERS, locker, push_trans_log). 72 | 73 | read(Node, Key) -> 74 | rpc:call(Node, locker, dirty_read, [Key]). 75 | 76 | 77 | initial_state() -> 78 | #state{master_leases = [], replicated_leases = []}. 79 | 80 | precondition(S, {call, _, release, [_, Key, _Value]}) -> 81 | lists:keymember(Key, 1, S#state.master_leases); 82 | 83 | precondition(_, _) -> 84 | true. 
85 | 86 | next_state(S, _V, {call, _, lock, [_, Key, Value]}) -> 87 | case lists:keymember(Key, 1, S#state.master_leases) of 88 | true -> 89 | S; 90 | false -> 91 | S#state{master_leases = [{Key, Value} | S#state.master_leases]} 92 | end; 93 | 94 | next_state(S, _V, {call, _, release, [_, Key, Value]}) -> 95 | case lists:member({Key, Value}, S#state.master_leases) of 96 | true -> 97 | S#state{master_leases = lists:delete({Key, Value}, 98 | S#state.master_leases), 99 | replicated_leases = 100 | lists:delete({Key, Value}, S#state.replicated_leases)}; 101 | false -> 102 | S 103 | end; 104 | 105 | next_state(S, _V, {call, _, update, [_, Key, Value, NewValue]}) -> 106 | case lists:member({Key, Value}, S#state.master_leases) of 107 | true -> 108 | S#state{master_leases = [{Key, NewValue} | 109 | lists:delete({Key, Value}, 110 | S#state.master_leases)]}; 111 | false -> 112 | S 113 | end; 114 | 115 | next_state(S, _V, {call, _, replicate, []}) -> 116 | S#state{replicated_leases = S#state.master_leases}; 117 | 118 | next_state(S, _V, {call, _, read, _}) -> 119 | S. 120 | 121 | postcondition(S, {call, _, lock, [_, Key, _Value]}, Result) -> 122 | case Result of 123 | {ok, _, _, _} -> 124 | not lists:keymember(Key, 1, S#state.master_leases); 125 | {error, no_quorum} -> 126 | lists:keymember(Key, 1, S#state.master_leases) 127 | end; 128 | 129 | postcondition(S, {call, _, release, [_, Key, Value]}, {ok, _, _, _}) -> 130 | lists:member({Key, Value}, S#state.master_leases); 131 | 132 | postcondition(S, {call, _, release, [_, Key, _Value]}, {error, no_quorum}) -> 133 | lists:keymember(Key, 1, S#state.master_leases); 134 | 135 | postcondition(S, {call, _, update, [_, Key, Value, _NewValue]}, 136 | {ok, _, _, _}) -> 137 | lists:member({Key, Value}, S#state.master_leases); 138 | 139 | postcondition(S, {call, _, update, [_, Key, Value, _NewValue]}, 140 | {error, no_quorum}) -> 141 | Val = lists:keymember(Key, 1, S#state.master_leases), 142 | Val orelse (Val =/= Value); 143 | 144 | postcondition(_S, {call, _, replicate, []}, _) -> 145 | true; 146 | 147 | postcondition(S, {call, _, read, [Node, Key]}, Result) -> 148 | case is_master(Node) of 149 | true -> 150 | case Result of 151 | {ok, Value} -> 152 | lists:member({Key, Value}, S#state.master_leases); 153 | {error, not_found} -> 154 | not lists:keymember(Key, 1, S#state.master_leases) 155 | end; 156 | false -> 157 | case Result of 158 | {ok, Value} -> 159 | lists:member({Key, Value}, S#state.replicated_leases); 160 | {error, not_found} -> 161 | not lists:keymember(Key, 1, S#state.replicated_leases) 162 | end 163 | end. 164 | 165 | %% 166 | %% SETUP 167 | %% 168 | 169 | setup(Name) when is_atom(Name) -> 170 | {ok, Node} = slave:start_link(list_to_atom(net_adm:localhost()), Name), 171 | true = rpc:call(Node, code, add_path, ["ebin"]), 172 | {ok, _} = rpc:call(Node, locker, start_link, [1]), 173 | 174 | {ok, _, _, R1, R2, R3} = rpc:call(Node, locker, get_debug_state, []), 175 | {ok, cancel} = rpc:call(Node, timer, cancel, [R1]), 176 | {ok, cancel} = rpc:call(Node, timer, cancel, [R2]), 177 | {ok, cancel} = rpc:call(Node, timer, cancel, [R3]), 178 | Node; 179 | 180 | setup(NodeNames) -> 181 | lists:map(fun setup/1, NodeNames). 182 | 183 | teardown(Nodes) -> 184 | lists:map(fun slave:stop/1, Nodes). 185 | 186 | %% @doc Return fully qualified name for local host node. 187 | host_name(Name) -> 188 | list_to_atom(Name ++ "@" ++ net_adm:localhost()). 
189 | -------------------------------------------------------------------------------- /test/locker_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(locker_SUITE). 2 | -compile([export_all]). 3 | -include_lib("test_server/include/test_server.hrl"). 4 | 5 | -define (EBIN_DIR, lists:flatten( 6 | filename:dirname(filename:dirname(filename:absname(""))) ++ 7 | ["/ebin"])). 8 | 9 | all() -> 10 | [ 11 | api, 12 | quorum, 13 | no_quorum_possible, 14 | release, 15 | lease_extend, 16 | expire_leases, 17 | one_node_down, 18 | extend_propagates, 19 | add_remove_node, 20 | replica, 21 | promote, 22 | wait_for, 23 | wait_for_release, 24 | update 25 | ]. 26 | 27 | api(_) -> 28 | [A, B, C] = Cluster = setup([a, b, c]), 29 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 30 | 31 | {Cluster, [], 2} = rpc:call(A, locker, get_meta, []), 32 | 33 | ok = rpc:call(A, locker, set_w, [[A], 3]), 34 | {Cluster, [], 3} = rpc:call(A, locker, get_meta, []), 35 | ok = rpc:call(A, locker, set_w, [[A], 2]), 36 | 37 | {ok, 2, 3, 3} = rpc:call(A, locker, lock, [123, self()]), 38 | %% slave:stop(C), 39 | Pid = rpc:call(C, erlang, whereis, [locker]), 40 | true = rpc:call(C, erlang, exit, [Pid, kill]), 41 | false = rpc:call(C, erlang, is_process_alive, [Pid]), 42 | {ok, 2, 2, 2} = rpc:call(A, locker, release, [123, self()]), 43 | {ok, 2, 2, 2} = rpc:call(B, locker, lock, [123, self()]), 44 | {error, no_quorum} = rpc:call(A, locker, update, [123, wrong_value, 45 | new_value]), 46 | 47 | teardown([A, B, C]). 48 | 49 | quorum(_) -> 50 | [A, B, C] = Cluster = setup([a, b, c]), 51 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 52 | 53 | Parent = self(), 54 | spawn(fun() -> 55 | Parent ! {1, catch rpc:call(A, locker, lock, [123, Parent])} 56 | end), 57 | spawn(fun() -> 58 | Parent ! {2, catch rpc:call(B, locker, lock, [123, Parent])} 59 | end), 60 | receive {1, P1} -> P1 after 1000 -> throw(timeout) end, 61 | receive {2, P2} -> P2 after 1000 -> throw(timeout) end, 62 | 63 | ?line {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 64 | ?line {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 65 | rpc:sbcast([A, B, C], locker, push_trans_log), 66 | ?line {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 67 | 68 | {ok, [], [{123, Pid, _}], _, _, _} = state(A), 69 | {ok, [], [{123, Pid, _}], _, _, _} = state(B), 70 | {ok, [], [{123, Pid, _}], _, _, _} = state(C), 71 | 72 | teardown([A, B, C]). 73 | 74 | no_quorum_possible(_) -> 75 | [A, B, C] = setup([a, b, c]), 76 | ok = rpc:call(A, locker, set_nodes, [[A, B], [A, B], []]), 77 | 78 | Parent = self(), 79 | spawn(fun() -> 80 | Parent ! {1, catch rpc:call(A, locker, lock, [123, Parent])} 81 | end), 82 | spawn(fun() -> 83 | Parent ! {2, catch rpc:call(B, locker, lock, [123, Parent])} 84 | end), 85 | 86 | {error, no_quorum} = receive {1, P1} -> P1 after 1000 -> throw(timeout) end, 87 | {error, no_quorum} = receive {2, P2} -> P2 after 1000 -> throw(timeout) end, 88 | 89 | {error, not_found} = rpc:call(A, locker, dirty_read, [123]), 90 | {error, not_found} = rpc:call(B, locker, dirty_read, [123]), 91 | rpc:sbcast([A, B, C], locker, push_trans_log), 92 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 93 | 94 | {ok, [], [], _, _, _} = state(A), 95 | {ok, [], [], _, _, _} = state(B), 96 | {ok, [], [], _, _, _} = state(C), 97 | 98 | teardown([A, B, C]). 
99 | 100 | release(_) -> 101 | [A, B, C] = Cluster = setup([a, b, c]), 102 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 103 | 104 | Value = self(), 105 | {ok, 2, 3, 3} = rpc:call(A, locker, lock, [123, Value]), 106 | 107 | {ok, Value} = rpc:call(A, locker, dirty_read, [123]), 108 | {ok, Value} = rpc:call(B, locker, dirty_read, [123]), 109 | rpc:sbcast([A, B, C], locker, push_trans_log), 110 | {ok, Value} = rpc:call(C, locker, dirty_read, [123]), 111 | slave:stop(A), 112 | slave:stop(B), 113 | 114 | {error, no_quorum} = rpc:call(C, locker, release, [123, Value]), 115 | rpc:sbcast([A, B, C], locker, push_trans_log), 116 | {ok, Value} = rpc:call(C, locker, dirty_read, [123]), 117 | 118 | teardown([A, B, C]). 119 | 120 | one_node_down(_) -> 121 | [A, B, C] = Cluster = setup([a, b, c]), 122 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 123 | slave:stop(C), 124 | 125 | Pid = self(), 126 | spawn(fun() -> 127 | Pid ! {1, catch rpc:call(A, locker, lock, [123, Pid])} 128 | end), 129 | receive {1, P1} -> P1 after 1000 -> throw(timeout) end, 130 | 131 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 132 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 133 | 134 | {ok, [], [{123, Pid, _}], _, _, _} = state(A), 135 | {ok, [], [{123, Pid, _}], _, _, _} = state(B), 136 | 137 | teardown([A, B, C]). 138 | 139 | extend_propagates(_) -> 140 | [A, B, C] = setup([a, b, c]), 141 | ok = rpc:call(A, locker, set_nodes, [[A, B], [A, B], []]), 142 | 143 | Pid = self(), 144 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 145 | 146 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 147 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 148 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 149 | 150 | {ok, [], [{123, Pid, _}], _, _, _} = state(A), 151 | {ok, [], [{123, Pid, _}], _, _, _} = state(B), 152 | rpc:sbcast([A, B, C], locker, push_trans_log), 153 | {ok, [], [], _, _, _} = state(C), 154 | 155 | ok = rpc:call(A, locker, set_nodes, [[A, B, C], [A, B], [C]]), 156 | 157 | ok = rpc:call(A, locker, extend_lease, [123, Pid, 2000]), 158 | 159 | 160 | {ok, [], [{123, Pid, _ExA}], _, _, _} = state(A), 161 | {ok, [], [{123, Pid, _ExB}], _, _, _} = state(B), 162 | rpc:sbcast([A, B, C], locker, push_trans_log), 163 | {ok, [], [{123, Pid, _ExC}], _, _, _} = state(C), 164 | 165 | %% abs((ExA - ExB)) < 3 orelse throw(too_much_drift), 166 | %% abs((ExB - ExC)) < 3 orelse throw(too_much_drift), 167 | %% abs((ExA - ExC)) < 3 orelse throw(too_much_drift), 168 | 169 | teardown([A, B, C]). 
170 | 171 | 172 | lease_extend(_) -> 173 | [A, B, C] = Cluster = setup([a, b, c]), 174 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 175 | 176 | Pid = self(), 177 | {ok, _, _, _} = rpc:call(A, locker, lock, [123, Pid]), 178 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 179 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 180 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 181 | 182 | timer:sleep(2000), 183 | rpc:sbcast([A, B, C], locker, expire_leases), 184 | 185 | {error, not_found} = rpc:call(A, locker, dirty_read, [123]), 186 | {error, not_found} = rpc:call(B, locker, dirty_read, [123]), 187 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 188 | 189 | {ok, _, _, _} = rpc:call(A, locker, lock, [123, Pid]), 190 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 191 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 192 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 193 | 194 | 195 | ok = rpc:call(B, locker, extend_lease, [123, Pid, 2000]), 196 | rpc:sbcast([A, B, C], locker, expire_leases), 197 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 198 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 199 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 200 | 201 | ok. 202 | 203 | expire_leases(_) -> 204 | [A, B, C] = Cluster = setup([a, b, c]), 205 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 206 | 207 | Pid = self(), 208 | {ok, _, _, _} = rpc:call(A, locker, lock, [123, Pid]), 209 | 210 | timer:sleep(1000), 211 | {ok, _, _, _} = rpc:call(A, locker, lock, [abc, Pid]), 212 | 213 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 214 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 215 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 216 | {ok, Pid} = rpc:call(A, locker, dirty_read, [abc]), 217 | {ok, Pid} = rpc:call(B, locker, dirty_read, [abc]), 218 | {ok, Pid} = rpc:call(C, locker, dirty_read, [abc]), 219 | 220 | timer:sleep(2000), 221 | rpc:sbcast([A, B, C], locker, expire_leases), 222 | 223 | {error, not_found} = rpc:call(A, locker, dirty_read, [123]), 224 | {error, not_found} = rpc:call(B, locker, dirty_read, [123]), 225 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 226 | {error, not_found} = rpc:call(A, locker, dirty_read, [abc]), 227 | {error, not_found} = rpc:call(B, locker, dirty_read, [abc]), 228 | {error, not_found} = rpc:call(C, locker, dirty_read, [abc]), 229 | 230 | teardown([A, B, C]). 231 | 232 | add_remove_node(_) -> 233 | [A, B, C] = Cluster = setup([a, b, c]), 234 | ok = rpc:call(A, locker, set_nodes, [Cluster, Cluster, []]), 235 | 236 | {ok, 2, 3, 3} = rpc:call(A, locker, lock, [123, self()]), 237 | {ok, 2, 3, 3} = rpc:call(B, locker, release, [123, self()]), 238 | 239 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], []]), 240 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, self()]), 241 | 242 | teardown([A, B, C]). 
243 | 244 | replica(_) -> 245 | [A, B, C] = Cluster = setup([a, b, c]), 246 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 247 | 248 | {[A, B], [C], 2} = rpc:call(A, locker, get_meta, []), 249 | {[A, B], [C], 2} = rpc:call(B, locker, get_meta, []), 250 | {[A, B], [C], 2} = rpc:call(C, locker, get_meta, []), 251 | 252 | Pid = self(), 253 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 254 | 255 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 256 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 257 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 258 | rpc:sbcast([A, B, C], locker, push_trans_log), 259 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 260 | 261 | slave:stop(B), 262 | 263 | {error, no_quorum} = rpc:call(A, locker, release, [123, Pid]), 264 | 265 | teardown([A, B, C]). 266 | 267 | promote(_) -> 268 | [A, B, C] = Cluster = setup([a, b, c]), 269 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 270 | 271 | Pid = self(), 272 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 273 | timer:sleep(200), 274 | {ok, Pid} = rpc:call(A, locker, dirty_read, [123]), 275 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 276 | rpc:sbcast([A, B, C], locker, push_trans_log), 277 | {ok, Pid} = rpc:call(C, locker, dirty_read, [123]), 278 | 279 | 280 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B, C], []]), 281 | {ok, 2, 3, 3} = rpc:call(A, locker, release, [123, Pid]), 282 | 283 | teardown([A, B, C]). 284 | 285 | 286 | wait_for(_) -> 287 | [A, B, C] = Cluster = setup([a, b, c]), 288 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 289 | 290 | Pid = self(), 291 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid]), 292 | 293 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 294 | {badrpc, {'EXIT', {timeout, _}}} = rpc:call(C, locker, wait_for, [123, 100]), 295 | 296 | rpc:sbcast([A, B, C], locker, push_trans_log), 297 | {ok, Pid} = rpc:call(C, locker, wait_for, [123, 5000]), 298 | 299 | teardown([A, B, C]). 300 | 301 | wait_for_release(_) -> 302 | [A, B, C] = Cluster = setup([a, b, c]), 303 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 304 | 305 | LeaseLength = 500, 306 | Pid = Parent = self(), 307 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [123, Pid, LeaseLength, 1000]), 308 | 309 | {ok, Pid} = rpc:call(B, locker, dirty_read, [123]), 310 | {error, not_found} = rpc:call(C, locker, dirty_read, [123]), 311 | {error, key_not_locked} = 312 | rpc:call(C, locker, wait_for_release, [123, 100]), 313 | 314 | rpc:sbcast([A, B, C], locker, push_trans_log), 315 | timer:sleep(100), 316 | 317 | P1 = spawn(fun() -> 318 | Parent ! {self(), (catch rpc:call(B, locker, wait_for_release, [123, 1000]))} 319 | end), 320 | P2 = spawn(fun() -> 321 | Parent ! {self(), (catch rpc:call(C, locker, wait_for_release, [123, 1000]))} 322 | end), 323 | timer:sleep(LeaseLength), 324 | rpc:sbcast([A, B, C], locker, expire_leases), 325 | 326 | {ok, released} = receive {P1, M1} -> M1 end, 327 | {ok, released} = receive {P2, M2} -> M2 end, 328 | 329 | teardown([A, B, C]). 
330 | 331 | update(_) -> 332 | [A, B, C] = Cluster = setup([a, b, c]), 333 | ok = rpc:call(A, locker, set_nodes, [Cluster, [A, B], [C]]), 334 | 335 | Key = 123, 336 | Value0 = 41, 337 | Value1 = 42, 338 | LeaseLength = 50, 339 | {ok, 2, 2, 2} = rpc:call(A, locker, lock, [Key, Value0, LeaseLength]), 340 | {ok, 2, 2, 2} = rpc:call(B, locker, update, [Key, Value0, Value1]), 341 | 342 | rpc:sbcast([A, B, C], locker, push_trans_log), 343 | {ok, Value1} = rpc:call(C, locker, dirty_read, [Key]), 344 | {ok, Value1} = rpc:call(B, locker, dirty_read, [Key]), 345 | 346 | {error, no_quorum} = rpc:call(A, locker, update, [Key, Value0, 347 | random_value]), 348 | 349 | timer:sleep(LeaseLength), 350 | rpc:sbcast([A, B, C], locker, expire_leases), 351 | 352 | Res = lists:duplicate(3, {error, not_found}), 353 | {Res, []} = rpc:multicall(Cluster, locker, dirty_read, [Key]), 354 | 355 | teardown([A, B, C]). 356 | 357 | %% 358 | %% HELPERS 359 | %% 360 | 361 | 362 | setup(Name) when is_atom(Name) -> 363 | {ok, Node} = slave:start_link(list_to_atom(net_adm:localhost()), Name), 364 | 365 | true = rpc:call(Node, code, add_path, [?EBIN_DIR]), 366 | {ok, _} = rpc:call(Node, locker, start_link, [2]), 367 | 368 | {ok, _, _, R1, R2, R3} = rpc:call(Node, locker, get_debug_state, []), 369 | {ok, cancel} = rpc:call(Node, timer, cancel, [R1]), 370 | {ok, cancel} = rpc:call(Node, timer, cancel, [R2]), 371 | {ok, cancel} = rpc:call(Node, timer, cancel, [R3]), 372 | Node; 373 | 374 | setup(NodeNames) -> 375 | lists:map(fun setup/1, NodeNames). 376 | 377 | 378 | teardown(Nodes) -> 379 | lists:map(fun slave:stop/1, Nodes). 380 | 381 | state(N) -> 382 | rpc:call(N, locker, get_debug_state, []). 383 | -------------------------------------------------------------------------------- /src/locker.erl: -------------------------------------------------------------------------------- 1 | %% @doc Distributed consistent key-value store 2 | %% 3 | %% Reads use the local copy, all data is replicated to all nodes. 4 | %% 5 | %% Writing is done in two phases, in the first phase the key is 6 | %% locked, if a quorum can be made, the value is written. 7 | 8 | -module(locker). 9 | -behaviour(gen_server). 10 | -author('Knut Nesheim '). 11 | 12 | %% API 13 | -export([start_link/1, start_link/4]). 14 | -export([set_w/2, set_nodes/3]). 15 | 16 | -export([lock/2, lock/3, lock/4, update/3, update/4, 17 | extend_lease/3,release/2, release/3]). 18 | -export([wait_for/2, wait_for_release/1, wait_for_release/2]). 19 | -export([dirty_read/1, master_dirty_read/1]). 20 | -export([lag/0, summary/0]). 21 | 22 | 23 | -export([get_write_lock/4, do_write/6, release_write_lock/3]). 24 | -export([get_meta/0, get_meta_ets/1, get_debug_state/0]). 25 | 26 | -export([now_to_seconds/0]). 27 | 28 | %% gen_server callbacks 29 | -export([init/1, handle_call/3, handle_cast/2, handle_info/2, 30 | terminate/2, code_change/3]). 31 | 32 | -record(state, { 33 | %% The masters queue writes in the trans_log for batching to 34 | %% the replicas, triggered every N milliseconds by the 35 | %% push_replica timer 36 | trans_log = [], 37 | 38 | %% Clients can wait for a key to become locked 39 | waiters = [], 40 | 41 | %% Clients can wait for a lock to be released 42 | release_waiters = [], 43 | 44 | %% Previous point of expiration, no keys older than this 45 | %% point should exist 46 | prev_expire_point, 47 | 48 | %% Timer references 49 | lease_expire_ref, 50 | write_locks_expire_ref, 51 | push_trans_log_ref 52 | }). 53 | 54 | -define(LEASE_LENGTH, 2000). 
55 | -define(DB, locker_db). 56 | -define(LOCK_DB, locker_lock_db). 57 | -define(META_DB, locker_meta_db). 58 | -define(EXPIRE_DB, locker_expire_db). 59 | 60 | %%%=================================================================== 61 | %%% API 62 | %%%=================================================================== 63 | 64 | start_link(W) -> 65 | start_link(W, 1000, 1000, 100). 66 | 67 | start_link(W, LeaseExpireInterval, LockExpireInterval, PushTransInterval) -> 68 | Args = [W, LeaseExpireInterval, LockExpireInterval, PushTransInterval], 69 | gen_server:start_link({local, ?MODULE}, ?MODULE, Args, []). 70 | 71 | lock(Key, Value) -> 72 | lock(Key, Value, ?LEASE_LENGTH). 73 | 74 | lock(Key, Value, LeaseLength) -> 75 | lock(Key, Value, LeaseLength, 5000). 76 | 77 | %% @doc: Tries to acquire the lock. In case of unreachable nodes, the 78 | %% timeout is 1 second per node which might need tuning. Returns {ok, 79 | %% W, V, C} where W is the number of agreeing nodes required for a 80 | %% quorum, V is the number of nodes that voted in favor of this lock 81 | %% in the case of contention and C is the number of nodes who 82 | %% acknowledged commit of the lock successfully. 83 | lock(Key, Value, LeaseLength, Timeout) -> 84 | Nodes = get_meta_ets(nodes), 85 | W = get_meta_ets(w), 86 | 87 | %% Try getting the write lock on all nodes 88 | {Tag, RequestReplies, _BadNodes} = get_write_lock(Nodes, Key, not_found, Timeout), 89 | 90 | case ok_responses(RequestReplies) of 91 | {OkNodes, _} when length(OkNodes) >= W -> 92 | %% Majority of nodes gave us the lock, go ahead and do the 93 | %% write on all masters. The write also releases the 94 | %% lock. Replicas are synced asynchronously by the 95 | %% masters. 96 | {WriteReplies, _} = do_write(Nodes, 97 | Tag, Key, Value, 98 | LeaseLength, Timeout), 99 | {OkWrites, _} = ok_responses(WriteReplies), 100 | {ok, W, length(OkNodes), length(OkWrites)}; 101 | _ -> 102 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 103 | {error, no_quorum} 104 | end. 105 | 106 | update(Key, Value, NewValue) -> 107 | update(Key, Value, NewValue, 5000). 108 | 109 | %% @doc: Tries to update the lock. The update only happens if an existing 110 | %% value of the lock corresponds to the given Value within the W number of 111 | %% master nodes. 112 | %% Returns the same tuple as in lock/4 case. 113 | update(Key, Value, NewValue, Timeout) -> 114 | Nodes = get_meta_ets(nodes), 115 | W = get_meta_ets(w), 116 | 117 | %% Try getting the write lock on all nodes 118 | {Tag, RequestReplies, _BadNodes} = get_write_lock(Nodes, Key, Value, 119 | Timeout), 120 | 121 | case ok_responses(RequestReplies) of 122 | {OkNodes, _} when length(OkNodes) >= W -> 123 | {UpdateReplies, _} = do_update(Nodes, Tag, Key, NewValue, Timeout), 124 | {OkUpdates, _} = ok_responses(UpdateReplies), 125 | {ok, W, length(OkNodes), length(OkUpdates)}; 126 | _ -> 127 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 128 | {error, no_quorum} 129 | end. 130 | 131 | %% @doc: Waits for the key to become available on the local node. If a 132 | %% value is already available, returns immediately, otherwise it will 133 | %% return within the timeout. In case of timeout, the caller might get 134 | %% a reply anyway if it sent at the same time as the timeout. 135 | wait_for(Key, Timeout) -> 136 | case dirty_read(Key) of 137 | {ok, Value} -> 138 | {ok, Value}; 139 | {error, not_found} -> 140 | gen_server:call(locker, {wait_for, Key, Timeout}, Timeout) 141 | end. 
142 | 143 | wait_for_release(Key) -> 144 | wait_for_release(Key, 5000). 145 | 146 | wait_for_release(Key, Timeout) -> 147 | case dirty_read(Key) of 148 | {ok, _Value} -> 149 | gen_server:call(locker, {wait_for_release, Key, Timeout}, Timeout); 150 | {error, not_found} -> 151 | {error, key_not_locked} 152 | end. 153 | 154 | release(Key, Value) -> 155 | release(Key, Value, 5000). 156 | 157 | release(Key, Value, Timeout) -> 158 | Nodes = get_meta_ets(nodes), 159 | Replicas = get_meta_ets(replicas), 160 | W = get_meta_ets(w), 161 | 162 | %% Try getting the write lock on all nodes 163 | {Tag, WriteLockReplies, _} = get_write_lock(Nodes, Key, Value, Timeout), 164 | 165 | case ok_responses(WriteLockReplies) of 166 | {OkNodes, _} when length(OkNodes) >= W -> 167 | Request = {release, Key, Value, Tag}, 168 | {ReleaseReplies, _BadNodes} = 169 | gen_server:multi_call(Nodes ++ Replicas, locker, Request, Timeout), 170 | 171 | {OkWrites, _} = ok_responses(ReleaseReplies), 172 | 173 | {ok, W, length(OkNodes), length(OkWrites)}; 174 | _ -> 175 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 176 | {error, no_quorum} 177 | end. 178 | 179 | 180 | extend_lease(Key, Value, LeaseLength) -> 181 | extend_lease(Key, Value, LeaseLength, 5000). 182 | 183 | %% @doc: Extends the lease for the lock on all nodes that are up. What 184 | %% really happens is that the expiration is scheduled for (now + lease 185 | %% time), to allow for nodes that just joined to set the correct 186 | %% expiration time without knowing the start time of the lease. 187 | extend_lease(Key, Value, LeaseLength, Timeout) -> 188 | Nodes = get_meta_ets(nodes), 189 | W = get_meta_ets(w), 190 | 191 | {Tag, WriteLockReplies, _} = get_write_lock(Nodes, Key, Value, Timeout), 192 | 193 | case ok_responses(WriteLockReplies) of 194 | {N, _E} when length(N) >= W -> 195 | 196 | Request = {extend_lease, Tag, Key, Value, LeaseLength}, 197 | {Replies, _} = gen_server:multi_call(Nodes, locker, Request, Timeout), 198 | {_, FailedExtended} = ok_responses(Replies), 199 | release_write_lock(FailedExtended, Tag, Timeout), 200 | ok; 201 | _ -> 202 | {_AbortReplies, _} = release_write_lock(Nodes, Tag, Timeout), 203 | {error, no_quorum} 204 | end. 205 | 206 | %% @doc: A dirty read does not create a read-quorum so consistency is 207 | %% not guaranteed. The value is read directly from a local ETS-table, 208 | %% so the performance should be very high. 209 | dirty_read(Key) -> 210 | case ets:lookup(?DB, Key) of 211 | [{Key, Value, _Lease}] -> 212 | {ok, Value}; 213 | [] -> 214 | {error, not_found} 215 | end. 216 | 217 | %% @doc: Execute a dirty read on the master. Same caveats as for 218 | %% dirty_read/1 219 | master_dirty_read(Key) -> 220 | Masters = get_meta_ets(nodes), 221 | case lists:member(node(), Masters) of 222 | true -> 223 | dirty_read(Key); 224 | false -> 225 | Master = lists:nth(random:uniform(length(Masters)), Masters), 226 | rpc:call(Master, locker, dirty_read, [Key]) 227 | end. 228 | 229 | %% 230 | %% Helpers for operators 231 | %% 232 | 233 | lag() -> 234 | Key = {'__lock_lag_probe', os:timestamp()}, 235 | {Time, Result} = timer:tc(fun() -> 236 | lock(Key, foo, 2000) 237 | end), 238 | release(Key, foo), 239 | {Time / 1000, Result}. 240 | 241 | summary() -> 242 | [{write_locks, ets:info(?LOCK_DB, size)}, 243 | {leases, ets:info(?DB, size)}]. 244 | 245 | get_meta() -> 246 | {get_meta_ets(nodes), get_meta_ets(replicas), get_meta_ets(w)}. 
247 | 248 | %% 249 | %% Helpers 250 | %% 251 | 252 | get_write_lock(Nodes, Key, Value, Timeout) -> 253 | Tag = make_ref(), 254 | Request = {get_write_lock, Key, Value, Tag}, 255 | {Replies, Down} = gen_server:multi_call(Nodes, locker, Request, Timeout), 256 | {Tag, Replies, Down}. 257 | 258 | do_write(Nodes, Tag, Key, Value, LeaseLength, Timeout) -> 259 | gen_server:multi_call(Nodes, locker, 260 | {write, Tag, Key, Value, LeaseLength}, 261 | Timeout). 262 | 263 | do_update(Nodes, Tag, Key, Value, Timeout) -> 264 | gen_server:multi_call(Nodes, locker, 265 | {update, Tag, Key, Value}, 266 | Timeout). 267 | 268 | release_write_lock(Nodes, Tag, Timeout) -> 269 | gen_server:multi_call(Nodes, locker, {release_write_lock, Tag}, Timeout). 270 | 271 | get_meta_ets(Key) -> 272 | case ets:lookup(?META_DB, Key) of 273 | [] -> 274 | throw({locker, no_such_meta_key}); 275 | [{Key, Value}] -> 276 | Value 277 | end. 278 | 279 | %% @doc: Replaces the primary and replica node list on all nodes in 280 | %% the cluster. Assumes no failures. 281 | set_nodes(Cluster, Primaries, Replicas) -> 282 | {_Replies, []} = gen_server:multi_call(Cluster, locker, 283 | {set_nodes, Primaries, Replicas}), 284 | ok. 285 | 286 | set_w(Cluster, W) when is_integer(W) -> 287 | {_Replies, []} = gen_server:multi_call(Cluster, locker, {set_w, W}), 288 | ok. 289 | 290 | get_debug_state() -> 291 | gen_server:call(?MODULE, get_debug_state). 292 | 293 | %%%=================================================================== 294 | %%% gen_server callbacks 295 | %%%=================================================================== 296 | init([W, LeaseExpireInterval, LockExpireInterval, PushTransInterval]) -> 297 | ?DB = ets:new(?DB, [named_table, protected, set, 298 | {read_concurrency, true}, 299 | {write_concurrency, true}]), 300 | 301 | ?LOCK_DB = ets:new(?LOCK_DB, [named_table, protected, set]), 302 | ?EXPIRE_DB = ets:new(?EXPIRE_DB, [named_table, protected, bag]), 303 | 304 | 305 | ?META_DB = ets:new(?META_DB, [named_table, protected, set, 306 | {read_concurrency, true}]), 307 | ets:insert(?META_DB, {w, W}), 308 | ets:insert(?META_DB, {nodes, []}), 309 | ets:insert(?META_DB, {replicas, []}), 310 | 311 | 312 | {ok, LeaseExpireRef} = timer:send_interval(LeaseExpireInterval, expire_leases), 313 | {ok, WriteLocksExpireRef} = timer:send_interval(LockExpireInterval, expire_locks), 314 | {ok, PushTransLog} = timer:send_interval(PushTransInterval, push_trans_log), 315 | {ok, #state{lease_expire_ref = LeaseExpireRef, 316 | write_locks_expire_ref = WriteLocksExpireRef, 317 | push_trans_log_ref = PushTransLog, 318 | prev_expire_point = now_to_seconds()}}. 319 | 320 | %% 321 | %% WRITE-LOCKS 322 | %% 323 | 324 | handle_call({get_write_lock, Key, Value, Tag}, _From, State) -> 325 | %% Phase 1: Grant a write lock on the key if the value in the 326 | %% database is what the coordinator expects. If the atom 327 | %% 'not_found' is given as the expected value, the lock is granted 328 | %% if the key does not exist. 329 | %% 330 | %% Only one lock per key is allowed. Timeouts are triggered when 331 | %% expiring leases. 
332 | 333 | case is_locked(Key) of 334 | true -> 335 | %% Key already has a write lock 336 | {reply, {error, already_locked}, State}; 337 | false -> 338 | case ets:lookup(?DB, Key) of 339 | [{Key, DbValue, _Expire}] when DbValue =:= Value -> 340 | set_lock(Tag, Key), 341 | {reply, ok, State}; 342 | [] when Value =:= not_found-> 343 | set_lock(Tag, Key), 344 | {reply, ok, State}; 345 | _Other -> 346 | {reply, {error, not_expected_value}, State} 347 | end 348 | end; 349 | 350 | handle_call({release_write_lock, Tag}, _From, State) -> 351 | del_lock(Tag), 352 | {reply, ok, State}; 353 | 354 | %% 355 | %% DATABASE OPERATIONS 356 | %% 357 | 358 | handle_call({write, LockTag, Key, Value, LeaseLength}, _From, 359 | #state{trans_log = TransLog} = State) -> 360 | %% Database write. LockTag might be a valid write-lock, in which 361 | %% case it is deleted to avoid the extra round-trip of explicit 362 | %% delete. If it is not valid, we assume the coordinator had a 363 | %% quorum before writing. 364 | del_lock(LockTag), 365 | ExpireAt = expire_at(LeaseLength), 366 | ets:insert(?DB, {Key, Value, ExpireAt}), 367 | schedule_expire(ExpireAt, Key), 368 | 369 | NewTransLog = [{write, Key, Value, LeaseLength} | TransLog], 370 | {reply, ok, State#state{trans_log = NewTransLog}}; 371 | 372 | handle_call({update, LockTag, Key, Value}, _From, 373 | #state{trans_log = TransLog} = State) -> 374 | del_lock(LockTag), 375 | 376 | case ets:lookup(?DB, Key) of 377 | [{Key, _Value, ExpireAt}] -> 378 | %% Update the lock 379 | ets:insert(?DB, {Key, Value, ExpireAt}); 380 | [] -> 381 | %% Lock not found (most likely it has expired after acquiring write 382 | %% lock) 383 | ok 384 | end, 385 | 386 | NewTransLog = [{update, Key, Value} | TransLog], 387 | {reply, ok, State#state{trans_log = NewTransLog}}; 388 | 389 | %% 390 | %% LEASES 391 | %% 392 | 393 | handle_call({extend_lease, LockTag, Key, Value, ExtendLength}, _From, 394 | #state{trans_log = TransLog} = State) -> 395 | %% Extending a lease sets a new expire time. As the coordinator 396 | %% holds a write lock on the key, it validation has already been 397 | %% done 398 | 399 | del_lock(LockTag), 400 | delete_expire(expires(Key), Key), 401 | 402 | ExpireAt = expire_at(ExtendLength), 403 | ets:insert(?DB, {Key, Value, ExpireAt}), 404 | schedule_expire(ExpireAt, Key), 405 | 406 | NewTransLog = [{extend_lease, Key, Value, ExtendLength} | TransLog], 407 | {reply, ok, State#state{trans_log = NewTransLog}}; 408 | 409 | 410 | handle_call({release, Key, Value, LockTag}, _From, 411 | #state{trans_log = TransLog} = State) -> 412 | {Reply, NewState} = 413 | case ets:lookup(?DB, Key) of 414 | [{Key, Value, ExpireAt}] -> 415 | del_lock(LockTag), 416 | ets:delete(?DB, Key), 417 | delete_expire(ExpireAt, Key), 418 | 419 | NewTransLog = [{release, Key} | TransLog], 420 | {ok, State#state{trans_log = NewTransLog}}; 421 | 422 | [{Key, _OtherValue, _}] -> 423 | {{error, not_owner}, State}; 424 | [] -> 425 | {{error, not_found}, State} 426 | end, 427 | NewWaiters = notify_release_waiter(Key, released, NewState#state.release_waiters), 428 | {reply, Reply, NewState#state{release_waiters = NewWaiters}}; 429 | 430 | %% 431 | %% WAIT-FOR 432 | %% 433 | 434 | handle_call({wait_for, Key, Timeout}, From, #state{waiters = Waiters} = State) -> 435 | %% 'From' waits for the given key to become available, using 436 | %% gen_server:call/3. We will reply when replaying the transaction 437 | %% log. If we do not have a response within the given timeout, the 438 | %% reply is discarded. 
439 | 440 | %% Possible race: wait_for/2 reads from ETS, finds nothing, sends 441 | %% this message. Before this message is processed, we have 442 | %% processed the transaction log, the waiter will time out. Fix: 443 | %% read again here? 444 | {noreply, State#state{waiters = [{Key, From, now_to_ms() + Timeout} | Waiters]}}; 445 | 446 | handle_call({wait_for_release, Key, Timeout}, From, 447 | #state{release_waiters = Waiters} = State) -> 448 | %% 'From' waits for the given key lock to become released, using 449 | %% gen_server:call/3. We will reply when replaying the transaction 450 | %% log. If we do not have a response within the given timeout, the 451 | %% reply is discarded. 452 | {noreply, State#state{release_waiters = [{Key, From, now_to_ms() + Timeout} | Waiters]}}; 453 | 454 | %% 455 | %% ADMINISTRATION 456 | %% 457 | 458 | handle_call({set_w, W}, _From, State) -> 459 | ets:insert(?META_DB, {w, W}), 460 | {reply, ok, State}; 461 | 462 | handle_call({set_nodes, Primaries, Replicas}, _From, State) -> 463 | ets:insert(?META_DB, {nodes, ordsets:to_list( 464 | ordsets:from_list(Primaries))}), 465 | ets:insert(?META_DB, {replicas, ordsets:to_list( 466 | ordsets:from_list(Replicas))}), 467 | {reply, ok, State}; 468 | 469 | handle_call(get_debug_state, _From, State) -> 470 | {reply, {ok, ets:tab2list(?LOCK_DB), 471 | ets:tab2list(?DB), 472 | State#state.lease_expire_ref, 473 | State#state.write_locks_expire_ref, 474 | State#state.push_trans_log_ref}, State}. 475 | 476 | %% 477 | %% REPLICATION 478 | %% 479 | 480 | handle_cast({trans_log, _FromNode, TransLog}, State0) -> 481 | %% Replay transaction log. 482 | 483 | %% In the future, we might want to offset the lease length in the 484 | %% master before writing it to the log to ensure the lease length 485 | %% is at least reasonably similar for all replicas. 486 | Now = now_to_ms(), 487 | ReplayF = 488 | fun ({write, Key, Value, LeaseLength}, State) -> 489 | %% With multiple masters, we will get multiple writes 490 | %% for the same key. The last write will win for the 491 | %% lease db, but make sure we only have one entry in the 492 | %% expire table. 493 | delete_expire(expires(Key), Key), 494 | 495 | ExpireAt = expire_at(LeaseLength), 496 | ets:insert(?DB, {Key, Value, ExpireAt}), 497 | schedule_expire(ExpireAt, Key), 498 | 499 | NewWaiters = notify_lock_waiter(Now, Key, Value, 500 | State#state.waiters), 501 | State#state{waiters = NewWaiters}; 502 | 503 | ({extend_lease, Key, Value, ExtendLength}, State) -> 504 | delete_expire(expires(Key), Key), 505 | 506 | ExpireAt = expire_at(ExtendLength), 507 | ets:insert(?DB, {Key, Value, ExpireAt}), 508 | schedule_expire(ExpireAt, Key), 509 | 510 | State; 511 | 512 | ({release, Key}, State) -> 513 | %% Due to replication lag, the key might already have 514 | %% been expired in which case we simply do nothing 515 | case ets:lookup(?DB, Key) of 516 | [{Key, _Value, ExpireAt}] -> 517 | delete_expire(ExpireAt, Key), 518 | ets:delete(?DB, Key); 519 | [] -> 520 | ok 521 | end, 522 | State; 523 | 524 | ({update, Key, Value}, State) -> 525 | delete_expire(expires(Key), Key), 526 | 527 | case ets:lookup(?DB, Key) of 528 | [{Key, _Value, ExpireAt}] -> 529 | ets:insert(?DB, {Key, Value, ExpireAt}), 530 | %% If removal of expired locks and updates were handled 531 | %% by multiple processes, i.e. in non-sequential order, 532 | %% then it would be possible to end up in a situation, 533 | %% in which expired lock has been re-inserted. 
Calling 534 | %% scedule_expire/2 after updating the lock prevents 535 | %% from that. 536 | schedule_expire(ExpireAt, Key); 537 | [] -> 538 | %% Lock has been expired 539 | ok 540 | end, 541 | 542 | State 543 | 544 | end, 545 | 546 | NewState = lists:foldl(ReplayF, State0, TransLog), 547 | 548 | {noreply, NewState}; 549 | 550 | handle_cast(Msg, State) -> 551 | {stop, {badmsg, Msg}, State}. 552 | 553 | %% 554 | %% SYSTEM EVENTS 555 | %% 556 | 557 | handle_info(expire_leases, State) -> 558 | %% Delete any leases that has expired. There might be writes in 559 | %% flight, but they have already been validated in the locking 560 | %% phase and will be written regardless of what is in the db. 561 | 562 | Now = now_to_seconds(), 563 | Expired = lists:flatmap(fun (T) -> ets:lookup(?EXPIRE_DB, T) end, 564 | lists:seq(State#state.prev_expire_point, Now)), 565 | 566 | ReleaseLockAndNotifyWaiters = 567 | fun ({At, Key}, RemainingWaiters) -> 568 | delete_expire(At, Key), 569 | case ets:lookup(?DB, Key) of 570 | [{Key, _Value, ExpAt}] when ExpAt =:= At -> 571 | ets:delete(?DB, Key), 572 | notify_release_waiter(Key, released, RemainingWaiters); 573 | _Other -> 574 | %% locker_expire_db is out of sync with locker_db 575 | %% resulting in one correct and one or more incorrect 576 | %% locker_exipre_db entries. 577 | %% This exipre entry is incorrect. 578 | RemainingWaiters 579 | end 580 | end, 581 | NewWaiters = lists:foldl(ReleaseLockAndNotifyWaiters, 582 | State#state.release_waiters, Expired), 583 | {noreply, State#state{prev_expire_point = Now, 584 | release_waiters = NewWaiters}}; 585 | 586 | handle_info(expire_locks, State) -> 587 | %% Make a table scan of the write locks. There should be very few 588 | %% (<1000) writes in progress at any time, so a full scan is 589 | %% ok. Optimize like the leases if needed. 590 | Now = now_to_seconds(), 591 | ets:select_delete(?LOCK_DB, 592 | [{ {'_', '_', '$1'}, [{'<', '$1', Now}], [true] }]), 593 | 594 | {noreply, State}; 595 | 596 | handle_info(push_trans_log, #state{trans_log = TransLog} = State) -> 597 | %% Push transaction log to *all* replicas. With multiple masters, 598 | %% each replica will receive the same write multiple times. 599 | Msg = {trans_log, node(), lists:reverse(TransLog)}, 600 | gen_server:abcast(get_meta_ets(replicas), locker, Msg), 601 | {noreply, State#state{trans_log = []}}; 602 | 603 | handle_info(_Info, State) -> 604 | {noreply, State}. 605 | 606 | terminate(_Reason, _State) -> 607 | ok. 608 | 609 | code_change(_OldVsn, State, _Extra) -> 610 | {ok, State}. 611 | 612 | %%%=================================================================== 613 | %%% Internal functions 614 | %%%=================================================================== 615 | 616 | %% Notify waiter on a lock that the lock has been taken. 617 | notify_lock_waiter(Now, Key, Value, AllWaiters) -> 618 | KeyWaiter = fun ({K, _, _}) when Key =:= K -> true; 619 | (_) -> false 620 | end, 621 | ReplyIfNotExpired = 622 | fun ({_, From, Expire}) when Expire > Now -> 623 | gen_server:reply(From, {ok, Value}); 624 | (_) -> 625 | ok 626 | end, 627 | {KeyWaiters, OtherWaiters} = lists:partition(KeyWaiter, AllWaiters), 628 | lists:foreach(ReplyIfNotExpired, KeyWaiters), 629 | OtherWaiters. 630 | 631 | %% Notify waiter of a release of a lock, even if it is expired. 
632 | notify_release_waiter(Key, Value, AllWaiters) -> 633 | KeyWaiter = fun ({K, _, _}) when Key =:= K -> true; 634 | (_) -> false 635 | end, 636 | Reply = fun ({_, From, _Expire}) -> gen_server:reply(From, {ok, Value}) end, 637 | {KeyWaiters, OtherWaiters} = lists:partition(KeyWaiter, AllWaiters), 638 | lists:foreach(Reply, KeyWaiters), 639 | OtherWaiters. 640 | 641 | now_to_seconds() -> 642 | now_to_seconds(os:timestamp()). 643 | 644 | now_to_seconds(Now) -> 645 | {MegaSeconds, Seconds, _} = Now, 646 | MegaSeconds * 1000000 + Seconds. 647 | 648 | now_to_ms() -> 649 | now_to_ms(os:timestamp()). 650 | 651 | now_to_ms({MegaSecs,Secs,MicroSecs}) -> 652 | (MegaSecs * 1000000 + Secs) * 1000 + MicroSecs div 1000. 653 | 654 | ok_responses(Replies) -> 655 | lists:partition(fun ({_, ok}) -> true; 656 | (_) -> false 657 | end, Replies). 658 | 659 | %% 660 | %% EXPIRATION 661 | %% 662 | 663 | schedule_expire(At, Key) -> 664 | true = ets:insert(?EXPIRE_DB, {At, Key}), 665 | ok. 666 | 667 | delete_expire(At, Key) -> 668 | ets:delete_object(?EXPIRE_DB, {At, Key}), 669 | ok. 670 | 671 | expire_at(Length) -> 672 | trunc(now_to_seconds() + (Length/1000)). 673 | 674 | expires(Key) -> 675 | case ets:lookup(?DB, Key) of 676 | [{Key, _Value, ExpireAt}] -> 677 | ExpireAt; 678 | [] -> 679 | [] 680 | end. 681 | 682 | %% 683 | %% WRITE-LOCKS 684 | %% 685 | 686 | is_locked(Key) -> 687 | ets:match(?LOCK_DB, {Key, '_', '_'}) =/= []. 688 | 689 | set_lock(Tag, Key) -> 690 | ets:insert_new(?LOCK_DB, {Key, Tag, now_to_seconds() + 10}). 691 | 692 | del_lock(Tag) -> 693 | ets:match_delete(?LOCK_DB, {'_', Tag, '_'}). 694 | --------------------------------------------------------------------------------