├── rebar ├── .gitignore ├── src ├── dht_ring.app.src └── dht_ring.erl ├── Makefile ├── rebar.config ├── LICENSE ├── README └── test └── dht_ring_test.erl /rebar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EchoTeam/dht_ring/HEAD/rebar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .eunit 2 | deps 3 | *.o 4 | *.beam 5 | *.plt 6 | ebin/* 7 | -------------------------------------------------------------------------------- /src/dht_ring.app.src: -------------------------------------------------------------------------------- 1 | {application, dht_ring, 2 | [ 3 | {description, "Consistent hashing ring for Distributed Hash Tables (DHT)"}, 4 | {vsn, "1.0.0"}, 5 | {registered, []}, 6 | {applications, [kernel, stdlib]}, 7 | {env, []} 8 | ]}. 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all get-deps compile clean test-unit test-ct check distclean 2 | 3 | REBAR := $(shell which ./rebar || which rebar) 4 | 5 | all: get-deps compile 6 | 7 | compile: 8 | $(REBAR) compile 9 | 10 | get-deps: 11 | $(REBAR) get-deps 12 | 13 | test-unit: all 14 | $(REBAR) eunit skip_deps=true 15 | 16 | test-ct: all 17 | $(REBAR) ct skip_deps=true 18 | 19 | check: test-unit test-ct 20 | 21 | clean: 22 | $(REBAR) clean 23 | rm -rf ./ebin 24 | rm -rf ./logs 25 | rm -f ./erl_crash.dump 26 | rm -rf ./.eunit 27 | rm -f ./test/*.beam 28 | 29 | distclean: clean 30 | rm -rf ./deps 31 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | %%% vim: set ts=4 sts=4 sw=4 et: 2 | 3 | {deps_dir, "deps"}. 
4 | 5 | {eunit_opts, [verbose, {report,{eunit_surefire,[{dir,"."}]}}]}. 6 | {eunit_compile_opts, [export_all]}. 7 | {cover_enabled, true}. 8 | {cover_export_enabled, true}. 9 | 10 | {erl_opts, [ 11 | bin_opt_info, 12 | 13 | warn_unused_vars, 14 | warn_export_all, 15 | warn_shadow_vars, 16 | warn_unused_import, 17 | warn_unused_function, 18 | warn_bif_clash, 19 | warn_unused_record, 20 | warn_deprecated_function, 21 | warn_obsolete_guard, 22 | strict_validation, 23 | warn_export_vars, 24 | warn_exported_vars, 25 | warn_untyped_record, 26 | 27 | {parse_transform, lager_transform}, 28 | {lager_truncation_size, 4096} 29 | ]}. 30 | 31 | {deps, [ 32 | % let it always be the first 33 | {lager, "2.0.1", 34 | {git, "git://github.com/basho/lager.git", {tag, "2.0.1"}}} 35 | ]}. 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2014 JackNyfe, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | An implementation of a consistent hashing ring for distributed hash tables as 2 | a gen_server. 3 | 4 | Here is an informal explanation of the technique: 5 | http://www.spiteful.com/2008/03/17/programmers-toolbox-part-3-consistent-hashing/ 6 | 7 | Exports: 8 | 9 | start_link(Peers) -> {ok, ServerRef} 10 | start_link(ServerName, Peers) -> {ok, ServerRef} 11 | Types: 12 | Peers = [{Node, Opaque, Weight}] 13 | Node = atom() 14 | Opaque = term() 15 | Weight = int(), >= 1 16 | 17 | Create a ring according to the peer configuration Peers. Opaque is opaque 18 | data associated with the node. A greater weight makes the node own a bigger 19 | portion of the ring. 20 | 21 | The ring creation time is O(N^2), where N is the sum of the nodes' weights. 22 | 23 | 24 | add(ServerRef, Peers) -> ok | {error, already_there, [Node]} 25 | 26 | Add Peers to the ring. If any of the nodes being added is already in the 27 | ring (the check is done by comparing node *names*), an error is returned and 28 | the ring remains intact. 29 | 30 | 31 | add(ServerRef, {Node, Opaque, Weight}) 32 | 33 | Equivalent to add(ServerRef, [{Node, Opaque, Weight}]). 34 | 35 | 36 | delete(ServerRef, [Node]) -> ok | {error, unknown_nodes, [Node]} 37 | 38 | Removes nodes from the ring by their names. 
If any of the names does not 39 | reference an existing node in the ring, an error is returned and the ring 40 | remains intact. 41 | 42 | 43 | delete(ServerRef, Node) 44 | 45 | Equivalent to delete(ServerRef, [Node]). 46 | 47 | 48 | get_config(ServerRef) -> Peers 49 | 50 | Returns the current configuration of the ring. The order of nodes within 51 | the returned list is not specified. 52 | 53 | 54 | lookup(ServerRef, Key) -> [{Node, Opaque}] 55 | 56 | Types: 57 | Key = term() 58 | 59 | Look up the list of nodes corresponding to Key in the ring. The returned 60 | list is guaranteed to contain no duplicates. 61 | 62 | The lookup time is O(log N), where N is the sum of the nodes' weights. 63 | 64 | 65 | nodes(ServerRef) -> [{Node, Opaque}] 66 | 67 | Return the list of nodes in the ring. The order of nodes in the returned 68 | list is not specified, but it is guaranteed that the list contains no 69 | duplicates. 70 | 71 | This operation's execution time is linear in the number of nodes; it does not depend on their weights. 72 | -------------------------------------------------------------------------------- /test/dht_ring_test.erl: -------------------------------------------------------------------------------- 1 | -module(dht_ring_test). 2 | 3 | -compile(export_all). 4 | 5 | -include_lib("eunit/include/eunit.hrl"). 6 | 7 | 8 | run_test_() -> 9 | [{"Basic test with three peers", fun basic_test/0}, 10 | {"Test with one peer", fun vacuous_test/0}, 11 | {"Dynamic ring", fun dynamic_test/0}]. 12 | 13 | check_distribution(Nodes, Expected) -> 14 | {ok, RingServer} = dht_ring:start_link(Nodes), 15 | 16 | true = (length(Nodes) == length(dht_ring:nodes(RingServer))), 17 | 18 | KeysQty = length(Expected), 19 | 20 | true = KeysQty >= 1, 21 | 22 | Keys = lists:seq(1, KeysQty), 23 | 24 | [?assertMatch(E, dht_ring:lookup(RingServer, integer_to_list(Key))) || {Key, E} <- lists:zip(Keys, Expected)]. 

%% Three weighted peers. check_distribution/2 looks up the keys "1".."10" and
%% matches each result against the corresponding preference list below, so
%% this pins the exact ring layout.
basic_test() ->
    check_distribution(
        [{a, a, 5}, {b, b, 3}, {c, c, 2}],
        [
            [{a,a},{b,b},{c,c}],
            [{a,a},{c,c},{b,b}],
            [{c,c},{a,a},{b,b}],
            [{a,a},{b,b},{c,c}],
            [{a,a},{b,b},{c,c}],
            [{c,c},{a,a},{b,b}],
            [{a,a},{b,b},{c,c}],
            [{c,c},{a,a},{b,b}],
            [{b,b},{a,a},{c,c}],
            [{b,b},{a,a},{c,c}]
        ]
    ).

%% A ring with a single peer must answer every lookup with that peer.
vacuous_test() ->
    check_distribution([{n, n, 1}], lists:duplicate(10, [{n, n}])).

%% Grow and shrink one ring step by step and, after every step, compare it
%% with a reference ring that was started directly in the target
%% configuration.
%%
%% Every step is an explicit statement followed by an assertion: the steps
%% mutate the ring server, so they must not depend on the evaluation order of
%% list elements, and a mismatch must raise so that EUnit reports it (a
%% returned {fail, _} tuple is silently treated as success).
dynamic_test() ->
    A = {a, a, 5},
    B = {b, b, 3},
    C = {c, c, 10},
    {ok, Ring} = dht_ring:start_link([A]),
    {ok, Ring0} = dht_ring:start_link([]),
    {ok, RingA} = dht_ring:start_link([A]),
    {ok, RingAB} = dht_ring:start_link([A, B]),
    {ok, RingAC} = dht_ring:start_link([A, C]),
    {ok, RingABC} = dht_ring:start_link([A, B, C]),
    Keys = lists:seq(1, 10),

    %% Adding a node that is already present is rejected.
    ?assertEqual({error, already_there, [a]}, dht_ring:add(Ring, [A])),
    ?assertEqual(1, length(dht_ring:nodes(Ring))),

    %% A + B must be indistinguishable from a ring started as [A, B].
    ?assertEqual(ok, dht_ring:add(Ring, [B])),
    ?assertEqual(2, length(dht_ring:nodes(Ring))),
    ?assertEqual(pass, compare_rings(Ring, RingAB, Keys)),
    ?assertEqual(pass, compare_configs(Ring, RingAB)),

    %% A + B + C vs [A, B, C].
    ?assertEqual(ok, dht_ring:add(Ring, [C])),
    ?assertEqual(3, length(dht_ring:nodes(Ring))),
    ?assertEqual(pass, compare_rings(Ring, RingABC, Keys)),
    ?assertEqual(pass, compare_configs(Ring, RingABC)),

    %% Check that nodes/1 returns the right thing.
    ?assertEqual([{a, a}, {b, b}, {c, c}],
                 lists:keysort(1, dht_ring:nodes(Ring))),

    %% Deleting unknown nodes is rejected; deleting b leaves [A, C].
    ?assertEqual({error, unknown_nodes, [d, e]}, dht_ring:delete(Ring, [d, e])),
    ?assertEqual(ok, dht_ring:delete(Ring, [b])),
    ?assertEqual(pass, compare_rings(Ring, RingAC, Keys)),
    ?assertEqual(pass, compare_configs(Ring, RingAC)),

    %% Emptying the ring.
    ?assertEqual(ok, dht_ring:delete(Ring, [a, c])),
    ?assertEqual(pass, compare_rings(Ring, Ring0, Keys)),

    %% Single-peer forms of add/2 and delete/2.
    ?assertEqual(ok, dht_ring:add(Ring, A)),
    ?assertEqual(pass, compare_configs(Ring, RingA)),
    ?assertEqual(ok, dht_ring:delete(Ring, a)),
    ?assertEqual(pass, compare_configs(Ring, Ring0)),

    %% Do not leave the linked ring servers running after the test.
    lists:foreach(fun dht_ring:stop/1,
                  [Ring, Ring0, RingA, RingAB, RingAC, RingABC]),
    pass.

%% Look every key up in both rings; return pass when all answers agree,
%% otherwise {fail, Keys} with the keys whose answers differ.
compare_rings(Ring1, Ring2, Keys) ->
    Mismatched = [Key || Key <- Keys,
                         dht_ring:lookup(Ring1, Key) /= dht_ring:lookup(Ring2, Key)],
    case Mismatched of
        [] -> pass;
        FailedKeys -> {fail, FailedKeys}
    end.

%% Compare the peer configurations of two rings, ignoring their order.
%% Returns pass, or {fail, Config1, Config2} with both raw configurations.
compare_configs(Ring1, Ring2) ->
    Config1 = dht_ring:get_config(Ring1),
    Config2 = dht_ring:get_config(Ring2),
    Sorted1 = lists:keysort(1, Config1),
    Sorted2 = lists:keysort(1, Config2),
    if
        Sorted1 == Sorted2 -> pass;
        true -> {fail, Config1, Config2}
    end.
--------------------------------------------------------------------------------
/src/dht_ring.erl:
--------------------------------------------------------------------------------
%%% vim: ts=4 sts=4 sw=4 expandtab:

-module(dht_ring).

-behaviour(gen_server).

% Public API
-export([
    add/2,
    delete/2,
    get_config/1,
    lookup/2,
    lookup_index/2,
    node_shares/1,
    nodes/1,
    partitions/1,
    partitions_if_node_added/2,
    set_opaque/3,
    start_link/1,
    start_link/2,
    stop/1
]).

% gen_server callbacks
-export([
    code_change/3,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    init/1,
    terminate/2
]).

% ring:  array of {Hash, [{Node, Opaque}]} points, ordered by Hash
% nodes: the peer configuration, [{Node, Opaque, Weight}]
-record(state, { ring, nodes }).

%% Public API

%% Add peers to the ring. A single {Node, Opaque, Weight} tuple is treated as
%% a one-element peer list. Returns the ring server's reply to {add, Peers}.
add(Ring, {_Node, _Opaque, _Weight} = Peer) ->
    add(Ring, [Peer]);
add(Ring, Peers) ->
    gen_server:call(Ring, {add, Peers}).

%% Remove nodes from the ring by name. Anything that is not a list is treated
%% as a single node name. Returns the ring server's reply to {delete, Names}.
delete(Ring, Node) when not is_list(Node) ->
    delete(Ring, [Node]);
delete(Ring, Names) ->
    gen_server:call(Ring, {delete, Names}).

%% Return the current peer configuration, [{Node, Opaque, Weight}].
get_config(Ring) ->
    gen_server:call(Ring, {get_config}).

%% Return the node list stored for the ring point that owns Key.
lookup(Ring, Key) ->
    lookup_index(Ring, index(Key)).

%% Same as lookup/2, but takes an already computed ring index.
%%
%% Index must be a number in [0, 65536). The range is checked here, in the
%% calling process: the server asserts the same range, and letting a bad index
%% reach it would crash the shared ring process instead of just the caller.
lookup_index(Ring, Index) when is_number(Index), Index >= 0, Index < 65536 ->
    gen_server:call(Ring, {lookup, Index}, 15000).

%% Return the nodes of the ring as [{Node, Opaque}].
nodes(Ring) ->
    gen_server:call(Ring, {nodes}).

%% Return a printable report (a flat string) with one line per configured
%% node: its name, its weight and the percentage of the 65536-slot index
%% space covered by the partitions it owns.
node_shares(Ring) ->
    Partitions = partitions(Ring),
    OwnedSlots = fun(Node) ->
        lists:sum([To - From || {Owner, From, To} <- Partitions, Owner == Node])
    end,
    lists:flatten([
        io_lib:format("\t~p weight ~p share ~.2f%~n",
                      [Node, Weight, 100 * OwnedSlots(Node) / 65536])
        || {Node, _, Weight} <- get_config(Ring)
    ]).

%% Return the partitions of the ring as a list of {Node, From, To} ranges.
partitions(Ring) ->
    partitions_from_ring(gen_server:call(Ring, {ring})).

%% Return the partitions the ring would have if Node, given as a
%% {Name, Opaque, Weight} peer, were added. The candidate ring is built in the
%% calling process by init/1; the ring server itself is only asked for its
%% current configuration and is not modified.
partitions_if_node_added(Ring, Node) ->
    {ok, #state{ ring = Candidate }} = init([Node | get_config(Ring)]),
    partitions_from_ring(Candidate).

%% Replace the opaque term associated with Node.
set_opaque(Ring, Node, Opaque) ->
    gen_server:call(Ring, {set_opaque, {Node, Opaque}}).

%% Start an anonymous ring server for the given peer configuration.
start_link(Peers) ->
    gen_server:start_link(?MODULE, Peers, []).

%% Start a ring server registered locally as ServerName.
start_link(ServerName, Peers) ->
    gen_server:start_link({local, ServerName}, ?MODULE, Peers, []).

%% Stop the ring server; the server replies `stopped'.
stop(RingPid) ->
    gen_server:call(RingPid, {stop}).

%% gen_server callbacks

%% add: rebuild the ring with the new peers appended. The request is refused,
%% and the ring left untouched, when a name is already in the ring or is
%% listed more than once in the request itself -- either case would put a
%% duplicate node into the configuration.
handle_call({add, Nodes}, _From, #state{ nodes = OldNodes } = State) ->
    Names = [N || {N, _, _} <- Nodes],
    Existing = [N || N <- Names, lists:keymember(N, 1, OldNodes)],
    Repeated = lists:usort(Names -- lists:usort(Names)),
    case Existing ++ (Repeated -- Existing) of
        [] ->
            {ok, NewState} = init(OldNodes ++ Nodes),
            {reply, ok, NewState};
        Overlaps ->
            {reply, {error, already_there, Overlaps}, State}
    end;

%% delete: rebuild the ring without the named nodes; refused when any of the
%% names is not in the ring.
handle_call({delete, Nodes}, _From, #state{ nodes = OldNodes } = State) ->
    case [ N || N <- Nodes, not lists:keymember(N, 1, OldNodes) ] of
        [] ->
            Remaining = [Node || {N, _, _} = Node <- OldNodes,
                                 not lists:member(N, Nodes)],
            {ok, NewState} = init(Remaining),
            {reply, ok, NewState};
        NotThere ->
            {reply, {error, unknown_nodes, NotThere}, State}
    end;

handle_call({get_config}, _From, #state{ nodes = Nodes } = State) ->
    {reply, Nodes, State};

handle_call({ring}, _From, #state{ ring = Ring } = State) ->
    {reply, Ring, State};

%% lookup: reply with the node list stored for the ring point that owns
%% KeyIndex, or [] when the ring is empty.
handle_call({lookup, KeyIndex}, _From, #state{ ring = Ring } = State) ->
    true = (KeyIndex >= 0) andalso (KeyIndex < 65536),
    case bsearch(Ring, KeyIndex) of
        empty -> {reply, [], State};
        PartIdx ->
            {_Hash, NodeList} = array:get(PartIdx, Ring),
            {reply, NodeList, State}
    end;

handle_call({nodes}, _From, #state{ nodes = Nodes } = State) ->
    {reply, [{Name, Opaque} || {Name, Opaque, _} <- Nodes], State};

%% set_opaque: replace the opaque term of Name both in the configuration and
%% in every node list stored in the ring. The ring layout does not change.
handle_call({set_opaque, {Name, Opaque}}, _From, State) ->
    NewNodes = lists:map(fun
            ({N, _OldOpaque, Weight}) when N == Name -> {N, Opaque, Weight};
            (V) -> V
        end, State#state.nodes),
    NewRing = array:from_list(lists:map(fun({Hash, Data}) ->
            {Hash, lists:map(fun
                    ({N, _OldOpaque}) when N == Name -> {N, Opaque};
                    (V) -> V
                end, Data)}
        end, array:to_list(State#state.ring))),
    NewState = State#state{
        ring = NewRing,
        nodes = NewNodes
    },
    {reply, ok, NewState};

handle_call({stop}, _From, State) ->
    {stop, normal, stopped, State};

%% Always answer: returning noreply here would leave the caller blocked in
%% gen_server:call/2,3 until its timeout fires.
handle_call(Request, _From, State) ->
    {reply, {error, {unknown_call, Request}}, State}.

handle_cast(_Request, State) ->
    {noreply, State}.

handle_info(_Request, State) ->
    {noreply, State}.

%% Build the ring for a peer configuration [{Node, Opaque, Weight}].
%%
%% Each node contributes Weight points; point N of a node is placed at
%% index([atom_to_list(Node), integer_to_list(N)]), so node names must be
%% atoms. Besides being the gen_server init callback, this function is also
%% called directly to rebuild the ring on add/delete and by
%% partitions_if_node_added/2.
init(Peers) ->
    RawRing = lists:keysort(1,
        [{H, {Node, Opaque}} || {Node, Opaque, Weight} <- Peers,
            N <- lists:seq(1, Weight),
            H <- [index([atom_to_list(Node), integer_to_list(N)])]
        ]
    ),
    Ring = array:from_list(assemble_ring([], lists:reverse(RawRing), [], length(Peers))),
    lager:info("Created a ring with ~b points in it.", [array:sparse_size(Ring)]),
    {ok, #state{ ring = Ring, nodes = Peers }}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%% Internal functions

%% assemble_ring(Following, Points, Acc, NodeCount)
%%
%% Points are the ring points in DESCENDING hash order; Following is the node
%% list computed for the previously processed point (the next higher hash).
%% For every point the result holds {Hash, Nodes}, where Nodes starts with the
%% node owning the point, followed by the distinct nodes of the points with
%% higher hashes. When that does not yet cover all NodeCount nodes, the list
%% is completed by wrapping around: the remaining points are scanned from the
%% lowest hash upwards and unseen nodes are appended. The accumulated result
%% is in ascending hash order.
assemble_ring(_, [], Acc, _) -> Acc;
assemble_ring(Following, [{Hash, {Name, _} = Owner} | Lower], Acc, NodeCount) ->
    Clockwise = [Owner | [E || {Other, _} = E <- Following, Other /= Name]],
    Known = length(Clockwise),
    Preference =
        case Known == NodeCount of
            true -> Clockwise;
            false ->
                {_, Wrapped} =
                    try
                        %% foldr visits Lower from its last element, i.e. from
                        %% the lowest hash upwards; the throw stops the scan as
                        %% soon as every node has been collected.
                        lists:foldr(
                            fun (_, {Count, Seen}) when Count == NodeCount ->
                                    throw({Count, Seen});
                                ({_, {Candidate, _} = E}, {Count, Seen}) ->
                                    case lists:keymember(Candidate, 1, Seen) of
                                        true -> {Count, Seen};
                                        false -> {Count + 1, Seen ++ [E]}
                                    end
                            end, {Known, Clockwise}, Lower)
                    catch
                        throw:Result -> Result
                    end,
                Wrapped
        end,
    %% Only the non-wrapped list is handed to the next (lower) point.
    assemble_ring(Clockwise, Lower, [{Hash, Preference} | Acc], NodeCount).

%% Turn the ascending list of ring points into {Node, From, To} ranges: each
%% point's first node owns the range from its own index up to the next
%% point's index. The first node of the last point owns both the tail
%% {Node, Idx, 65536} and the head {Node, 0, FirstIdx} in front of the first
%% point.
calc_partitions([{Idx, [{Node, _} | _]}], FirstIdx, Acc) ->
    [{Node, 0, FirstIdx}, {Node, Idx, 65536} | Acc];
calc_partitions([{Idx1, [{Node, _} | _]}, {Idx2, _} = E | T], FirstIdx, Acc) ->
    calc_partitions([E|T], FirstIdx, [{Node, Idx1, Idx2} | Acc]).

%% Return the partitions of a ring array as a list of {Node, From, To}.
%% An empty ring has no partitions.
partitions_from_ring(Ring) ->
    case array:to_list(Ring) of
        [] -> [];
        [{Idx, _} | _] = ArrL -> calc_partitions(ArrL, Idx, [])
    end.

%% Map an arbitrary term onto the ring index space 0..65535: the first two
%% bytes of the MD5 digest of the term's external format, combined as
%% A * 256 + B (`bsl' and `+' share a precedence level and associate to the
%% left, so this is (A bsl 8) + B).
%% NOTE(review): the result depends on the output of term_to_binary/1 for
%% Key -- confirm it is stable across the OTP releases that share a ring.
index(Key) ->
    <<A, B, _/binary>> = erlang:md5(term_to_binary(Key)),
    A bsl 8 + B.

% We rely on the fact that the array is kept intact after creation, e.g. no
% undefined entries exist in the middle.
%
% Find the position of the ring point that owns index K: the point with the
% greatest hash that is =< K. A K below the first point's hash wraps around
% to the last point. Returns `empty' for an empty ring.
bsearch(Arr, K) ->
    case array:sparse_size(Arr) of
        0 -> empty;
        Size -> bsearch(Arr, Size, 0, Size - 1, K)
    end.

% Binary search over positions LIdx..RIdx. Each step probes the pair of
% neighbouring points (MIdx - 1, MIdx) and either accepts one of them or
% narrows the range towards the side reported by key_fits/5.
bsearch(Arr, Size, LIdx, RIdx, K) ->
    MIdx = LIdx + (RIdx - LIdx + 1) div 2,
    true = (MIdx >= LIdx) andalso (MIdx =< RIdx),
    case key_fits(Arr, Size, MIdx - 1, MIdx, K) of
        {yes, Idx} -> Idx;
        {no, lt} -> bsearch(Arr, Size, LIdx, MIdx, K);
        {no, gt} ->
            if
                % K is at or beyond the last point: the last point owns it.
                MIdx == (Size - 1) -> Size - 1;
                true -> bsearch(Arr, Size, MIdx, RIdx, K)
            end
    end.

% key_fits(Arr, Size, L, R, K) decides whether K belongs to the point at
% position L, given its right neighbour R = L + 1:
%   {yes, Idx} - the owner is the point at position Idx;
%   {no, lt}   - K lies below the point at L, search further left;
%   {no, gt}   - K lies at or above the point at R, search further right.

% A ring with a single point: that point owns everything.
key_fits(_Arr, 1, -1, 0, _K) ->
    {yes, 0};

% Probe at position 0 of a larger ring (there is no left neighbour).
% NOTE(review): bsearch/5 only seems to probe position 0 when Size is 1, so
% this clause looks unreachable from it -- confirm before relying on it.
key_fits(Arr, Size, -1, 0, K) ->
    {Hash0, _} = array:get(0, Arr),
    {HashS, _} = array:get(Size - 1, Arr),
    true = K < HashS,
    if
        K < Hash0 -> {yes, Size - 1};
        true -> {no, gt}
    end;

key_fits(Arr, Size, L, R, K) ->
    {LHash, _} = array:get(L, Arr),
    {RHash, _} = array:get(R, Arr),
    if
        % Below the very first point: wrap around to the last one.
        K < LHash -> if L == 0 -> {yes, Size - 1}; true -> {no, lt} end;
        (K >= LHash) andalso (K < RHash) -> {yes, L};
        K >= RHash -> {no, gt}
    end.