├── rebar3 ├── .gitignore ├── src ├── kv_index_tictactree.app.src ├── aae_runner.erl ├── aae_util.erl └── aae_treecache.erl ├── test ├── end_to_end │ ├── testutil.hrl │ ├── fold_SUITE.erl │ ├── testutil.erl │ ├── basic_SUITE.erl │ └── mock_kv_vnode.erl ├── timeout_test.erl └── property │ └── aae_eqc.erl ├── .github └── workflows │ └── erlang.yml ├── rebar.config ├── include └── aae.hrl ├── docs ├── RIAK_2_AAE.md ├── SEGMENT_FILTERED_SST.md ├── TICTAC.md ├── GENERAL_TICTACAAE_FOR_RIAK.md ├── RIAK_3_AAE.md └── DESIGN.md ├── README.md └── LICENSE /rebar3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martinsumner/kv_index_tictactree/HEAD/rebar3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .eunit 2 | deps 3 | *.o 4 | *.beam 5 | *.plt 6 | erl_crash.dump 7 | ebin/*.beam 8 | rel/example_project 9 | .concrete/DEV_MODE 10 | .rebar 11 | .DS_Store 12 | _build/* 13 | rebar.lock 14 | aae_data/* 15 | -------------------------------------------------------------------------------- /src/kv_index_tictactree.app.src: -------------------------------------------------------------------------------- 1 | {application, kv_index_tictactree, [ 2 | {description, "AAE helper service for KV vnode"}, 3 | {vsn, git}, 4 | {registered, []}, 5 | {applications, [ 6 | kernel, 7 | stdlib, 8 | leveled 9 | ]}, 10 | {maintainers, ["Martin Sumner"]}, 11 | {licenses, ["Apache"]}, 12 | {links, [{"Github", "https://github.com/martinsumner/kv_index_tictactree"}]}, 13 | {env, [{root_path, "test"}]} 14 | ]}. 15 | -------------------------------------------------------------------------------- /test/end_to_end/testutil.hrl: -------------------------------------------------------------------------------- 1 | 2 | -record(r_content, { 3 | metadata, 4 | value :: term() 5 | }). 
6 | 7 | -record(r_object, { 8 | bucket, 9 | key, 10 | contents :: [#r_content{}], 11 | vclock = [], 12 | updatemetadata=dict:store(clean, true, dict:new()), 13 | updatevalue :: term()}). 14 | 15 | 16 | -define(BUCKET_TYPE, <<"BucketType">>). -------------------------------------------------------------------------------- /.github/workflows/erlang.yml: -------------------------------------------------------------------------------- 1 | name: Erlang CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - develop-3.4 7 | pull_request: 8 | branches: 9 | - develop-3.4 10 | 11 | jobs: 12 | 13 | build: 14 | 15 | name: Test on ${{ matrix.os }} with OTP ${{ matrix.otp }} 16 | runs-on: ${{ matrix.os }} 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | otp: [24, 26] 22 | os: [ubuntu-latest] 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Install Erlang/OTP 27 | uses: erlef/setup-beam@v1 28 | with: 29 | otp-version: ${{ matrix.otp }} 30 | - name: Compile 31 | run: ./rebar3 compile 32 | - name: Check format 33 | run: ./rebar3 fmt --check 34 | - name: Run tests 35 | run: ./rebar3 do xref, dialyzer, eunit, ct 36 | -------------------------------------------------------------------------------- /test/timeout_test.erl: -------------------------------------------------------------------------------- 1 | -module(timeout_test). 2 | -behaviour(gen_server). 3 | -include_lib("eunit/include/eunit.hrl"). 4 | 5 | -export([start_link/0, stop/0]). 6 | -export([ 7 | init/1, 8 | handle_call/3, 9 | handle_cast/2, 10 | handle_info/2, 11 | code_change/3, 12 | terminate/2 13 | ]). 14 | 15 | start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 16 | stop() -> gen_server:call(?MODULE, stop). 17 | 18 | init([]) -> {ok, []}. 19 | 20 | handle_call({test}, _From, State) -> {reply, timer:sleep(1000), State}; 21 | handle_call(stop, _From, State) -> {stop, normal, ok, State}. 22 | 23 | handle_cast(_Request, State) -> {noreply, State}. 
24 | handle_info(_, State) -> {noreply, State}. 25 | code_change(_Old, State, _Extra) -> {ok, State}. 26 | terminate(_Reason, _State) -> ok. 27 | 28 | -ifdef(TEST). 29 | 30 | wait_on_sync_test() -> 31 | {ok, P} = start_link(), 32 | ?assertMatch( 33 | timeout, 34 | aae_controller:wait_on_sync(gen_server, call, P, {test}, 100) 35 | ), 36 | ?assertMatch( 37 | ok, 38 | aae_controller:wait_on_sync(gen_server, call, P, {test}, 2000) 39 | ), 40 | stop(). 41 | 42 | -endif. 43 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {minimum_otp_vsn, "22.3"}. 2 | 3 | {erl_opts, [warnings_as_errors]}. 4 | 5 | {cover_excl_mods, [ 6 | testutil, 7 | basic_SUITE, 8 | fold_SUITE, 9 | mockvnode_SUITE, 10 | mock_kv_vnode 11 | ]}. 12 | 13 | {erlfmt, [ 14 | write, 15 | {print_width, 80}, 16 | {files, [ 17 | "{src,include}/*.{hrl,erl,app.src}", 18 | "test/end_to_end/*.erl", 19 | "rebar.config" 20 | ]}, 21 | {exclude_files, ["src/erlfmt_parse.erl"]} 22 | ]}. 23 | 24 | {project_plugins, [ 25 | {eqwalizer_rebar3, 26 | {git_subdir, "https://github.com/OpenRiak/eqwalizer.git", 27 | {branch, "openriak-3.4"}, "eqwalizer_rebar3"}}, 28 | {erlfmt, {git, "https://github.com/OpenRiak/erlfmt.git", {branch, "main"}}} 29 | ]}. 30 | 31 | {profiles, [ 32 | {eqc, [ 33 | {deps, [meck, fqc]}, 34 | {erl_opts, [debug_info, {d, 'EQC'}]}, 35 | {extra_src_dirs, ["test/end_to_end", "test/property"]}, 36 | {plugins, [rebar_eqc]} 37 | ]}, 38 | {test, [{extra_src_dirs, ["test/end_to_end", "test/property"]}]} 39 | ]}. 40 | 41 | {xref_checks, [undefined_function_calls, undefined_functions, locals_not_used]}. 42 | 43 | {ct_opts, [{dir, ["test/end_to_end"]}]}. 
44 | 45 | {deps, [ 46 | {leveled, 47 | {git, "https://github.com/martinsumner/leveled", 48 | {branch, "develop-3.4"}}}, 49 | {eqwalizer_support, 50 | {git_subdir, "https://github.com/OpenRiak/eqwalizer.git", 51 | {branch, "openriak-3.4"}, "eqwalizer_support"}} 52 | ]}. 53 | -------------------------------------------------------------------------------- /include/aae.hrl: -------------------------------------------------------------------------------- 1 | %%%============================================================================ 2 | %%% Non-configurable defaults 3 | %%%============================================================================ 4 | 5 | -define(TREE_SIZE, large). 6 | -define(MAGIC, 53). 7 | 8 | %%%============================================================================ 9 | %%% Tags 10 | %%%============================================================================ 11 | -define(HEAD_TAG, h). 12 | -define(RIAK_TAG, o_rkv). 13 | 14 | -if(?OTP_RELEASE < 26). 15 | -type dynamic() :: any(). 16 | -endif. 17 | 18 | %%%============================================================================ 19 | %%% Helper Functions 20 | %%%============================================================================ 21 | 22 | -define(LOG_LOCATION, #{ 23 | mfa => {?MODULE, ?FUNCTION_NAME, ?FUNCTION_ARITY}, 24 | line => ?LINE, 25 | file => ?FILE 26 | }). 27 | 28 | -define(STD_LOG(LogRef, Subs), 29 | ?STD_LOG_INT( 30 | element(1, aae_util:get_log(LogRef)), 31 | LogRef, 32 | Subs, 33 | leveled_log:get_opts() 34 | ) 35 | ). 36 | 37 | -define(STD_LOG_INT(LogLevel, LogRef, Subs, LogOpts), 38 | case 39 | logger:allow(LogLevel, ?MODULE) andalso 40 | leveled_log:should_i_log(LogLevel, LogRef, LogOpts) 41 | of 42 | true -> 43 | erlang:apply( 44 | logger, 45 | macro_log, 46 | [ 47 | ?LOG_LOCATION 48 | | aae_util:log(LogLevel, LogRef, LogOpts, Subs) 49 | ] 50 | ); 51 | false -> 52 | ok 53 | end 54 | ). 
55 | -------------------------------------------------------------------------------- /src/aae_runner.erl: -------------------------------------------------------------------------------- 1 | %% -------- Overview --------- 2 | %% 3 | %% Runner used for fetch_clock queries on this AAE vnode 4 | 5 | -module(aae_runner). 6 | 7 | -behaviour(gen_server). 8 | 9 | -include("aae.hrl"). 10 | 11 | -export([ 12 | init/1, 13 | handle_call/3, 14 | handle_cast/2, 15 | handle_info/2, 16 | terminate/2, 17 | code_change/3 18 | ]). 19 | 20 | -export([ 21 | runner_start/1, 22 | runner_work/2, 23 | runner_stop/1 24 | ]). 25 | 26 | -record(state, { 27 | result_size = 0 :: integer(), 28 | query_count = 0 :: integer(), 29 | query_time = 0 :: integer(), 30 | aae_controller :: pid() | undefined 31 | }). 32 | 33 | -define(LOG_FREQUENCY, 10). 34 | 35 | -define(PROMPT_MILLISECONDS, 2000). 36 | 37 | %%%============================================================================ 38 | %%% API 39 | %%%============================================================================ 40 | 41 | -spec runner_start(aae_util:log_levels()) -> {ok, pid()}. 42 | %% @doc 43 | %% Start an AAE runner to manage folds 44 | runner_start(LogLevels) -> 45 | {ok, Pid} = gen_server:start_link(?MODULE, [LogLevels, self()], []), 46 | {ok, Pid}. 47 | 48 | -spec runner_work(pid(), aae_controller:runner_work() | queue_empty) -> ok. 49 | %% @doc 50 | %% Be cast some work 51 | runner_work(Runner, Work) -> 52 | gen_server:cast(Runner, Work). 53 | 54 | -spec runner_stop(pid()) -> ok. 55 | %% @doc 56 | %% Close the runner 57 | runner_stop(Runner) -> 58 | gen_server:call(Runner, close, 30000). 
59 | 60 | %%%============================================================================ 61 | %%% gen_server callbacks 62 | %%%============================================================================ 63 | 64 | init([LogLevels, Controller]) -> 65 | ok = aae_util:set_loglevel(LogLevels), 66 | {ok, #state{aae_controller = Controller}, ?PROMPT_MILLISECONDS}. 67 | 68 | handle_call(close, _From, State) -> 69 | {stop, normal, ok, State}. 70 | 71 | handle_cast(queue_empty, State) -> 72 | {noreply, State, ?PROMPT_MILLISECONDS}; 73 | handle_cast({work, Folder, ReturnFun, SizeFun}, State) -> 74 | SW = os:timestamp(), 75 | State0 = 76 | try Folder() of 77 | query_backlog -> 78 | ?STD_LOG(r0002, []), 79 | ReturnFun({error, query_backlog}), 80 | State; 81 | Results -> 82 | QueryTime = timer:now_diff(os:timestamp(), SW), 83 | ?STD_LOG(r0003, [QueryTime]), 84 | RS0 = State#state.result_size + SizeFun(Results), 85 | QT0 = State#state.query_time + QueryTime, 86 | QC0 = State#state.query_count + 1, 87 | {RS1, QT1, QC1} = 88 | maybe_log( 89 | RS0, 90 | QT0, 91 | QC0, 92 | ?LOG_FREQUENCY 93 | ), 94 | 95 | ReturnFun(Results), 96 | 97 | State#state{ 98 | result_size = RS1, 99 | query_time = QT1, 100 | query_count = QC1 101 | } 102 | catch 103 | Error:Pattern -> 104 | ?STD_LOG(r0005, [Error, Pattern]), 105 | ReturnFun({error, Error}), 106 | State 107 | end, 108 | {noreply, State0, 0}. 109 | 110 | handle_info(timeout, State = #state{aae_controller = C}) when C =/= undefined -> 111 | ?STD_LOG(r0004, []), 112 | ok = aae_controller:aae_runnerprompt(C), 113 | {noreply, State}. 114 | 115 | terminate(_Reason, State) -> 116 | _ = maybe_log( 117 | State#state.result_size, 118 | State#state.query_time, 119 | State#state.query_count, 120 | 1 121 | ), 122 | ok. 123 | 124 | code_change(_OldVsn, State, _Extra) -> 125 | {ok, State}. 
126 | 127 | %%%============================================================================ 128 | %%% Internal functions 129 | %%%============================================================================ 130 | 131 | maybe_log(RS_Acc, QT_Acc, QC_Acc, LogFreq) when QC_Acc < LogFreq -> 132 | {RS_Acc, QT_Acc, QC_Acc}; 133 | maybe_log(RS_Acc, QT_Acc, QC_Acc, _LogFreq) -> 134 | ?STD_LOG(r0001, [RS_Acc, QT_Acc, QC_Acc]), 135 | {0, 0, 0}. 136 | 137 | %%%============================================================================ 138 | %%% Test 139 | %%%============================================================================ 140 | 141 | -ifdef(TEST). 142 | 143 | -include_lib("eunit/include/eunit.hrl"). 144 | 145 | runner_fail_test() -> 146 | {ok, R} = runner_start(undefined), 147 | TestProcess = self(), 148 | CheckFun = 149 | fun(ReturnTuple) -> 150 | ?assertMatch(error, element(1, ReturnTuple)), 151 | TestProcess ! error 152 | end, 153 | ReturnFun = aae_controller:generate_returnfun("ABCD", CheckFun), 154 | FoldFun = fun() -> throw(noproc) end, 155 | SizeFun = fun(_Results) -> 0 end, 156 | runner_work(R, {work, FoldFun, ReturnFun, SizeFun}), 157 | error = start_receiver(), 158 | ok = runner_stop(R). 159 | 160 | start_receiver() -> 161 | receive 162 | error -> 163 | error 164 | end. 165 | 166 | coverage_cheat_test() -> 167 | {ok, _State1} = code_change(null, #state{}, null). 168 | 169 | -endif. 170 | -------------------------------------------------------------------------------- /docs/RIAK_2_AAE.md: -------------------------------------------------------------------------------- 1 | # AAE Implementation in Riak 2.2.5 2 | 3 | - Each riak vnode has a kv_index_hashtree process if AAE is enabled. 4 | 5 | - That kv_index_hashtree process keeps a single key store (that contains all the keys and hashes of all the keys and values within that vnode store) - so it is a parallel key store duplicating the data (but with just hashes and not the whole object). 
6 | 7 | - The AAE key store is ordered by Segment ID - with Segment ID being the hash of the Key representing the Key's location in the Merkle Tree. 8 | 9 | - So if the object key is {Bucket1, Key1}, the Key in the AAE store is something like (hash{Bucket1, Key1}, Bucket1, Key1) - and the value is the hash for that object. 10 | 11 | - "the hash for that object" used to mean the hash of the whole object, but now it is just the hash of the version vector. 12 | 13 | - As well as the parallel key store, the kv_index_hashtree process keeps a merkle tree for each IndexN that the vnode supports. 14 | 15 | - IndexN is a reference to a combination of a n-val and a preflist supported by the vnode. If all buckets in the store are n-val 3, then there will be 3 IndexNs per vnode (and hence 3 merkle trees). If there are some buckets with an n-val of 3 and some with an n-val of 4 - there will be 7 merkle trees. 16 | 17 | - Each merkle tree also has an associated list of "Dirty Segments" - with a dirty segment meaning a SegmentId within the tree which has had a change since it was last calculated, and so shouldn't be trusted any more. 18 | 19 | - When a vnode receives a PUT request, it passes the new key, the new value, and the IndexN to the kv_index_hashtree process after the vnode has been updated. 20 | the kv_index_hashtree process hashes the value (actually the version vector), and hashes the Key to get the Segment ID; and queues up an insert into the Key store that represents this new Key and Hash. 21 | 22 | - The hashtree process then marks the SegmentID as being dirty for the Merkle tree whose IndexN matches the update. 23 | 24 | - The Riak cluster has a kv_entropy_manager process, and this will determine what vnodes have common IndexNs with what other vnodes - and it will schedule exchanges to take place between these vnodes to compare the merkle trees for their common IndexNs. 
25 | 26 | - When an exchange request is received by the kv_index_hashtree process, it first must update the Merkle tree for that IndexN. 27 | 28 | - To do that it looks at the list of dirty segments, and for each dirty segment it fold over all keys in that segment in its local AAE keystore to get a list of [{K, H}] for that SegmentID and IndexN. 29 | 30 | - It then does a sha hash on that list - and that represents the new hash value for that segment in the tree. Once all leaves have been updated, it works up the tree recalculating branch and root hashes as necessary. 31 | 32 | - the AAE processes then exchange and compare the trees - to find a list of segment IDs that differ between the vnodes for that IndexN. Hopefully the list is small. 33 | 34 | - Now for each segmentID in this list of mismatched segments it has to fold over the key store again to find all the [{K, H}] for those SegmentIds and the relevant IndexN - and this is then compared with exchange vnodes. 35 | 36 | - If any keys are found to have different hashes, then read repair is invoked for those keys. This essential is managed just by doing a normal GET request, with a subtle difference. 37 | 38 | - If the difference in hashes is because of a real difference between the values in the vnodes, then read_repair should fix it ... however if no difference is found, it prompts a rehash at each riak_kv_vnode for that Key. 39 | 40 | - The rehash just takes the current value for the key out of the vnode backend and passes it as if it were a new PUT to the kv_index_hashtree process. 41 | 42 | - This means that if there are discrepancies between the kv_index_hashtree key store and the vnode store (normally due to uncontrolled shutdown), they don't keep prompting read repairs over and over again - the rehash should fix the kv_index_hashtree store. 
43 | 44 | - The anti-entropy process described so far fixes differences, when an update is never received by the vnode, but doesn't handle the situation where an update is received, but subsequently lost by a vnode (e.g. disk corruption). 45 | 46 | - To protect against disk-based loss, the AAE keystore is periodically (normally once per month) rebuilt from the actual vnode store. This is a carefully scheduled and throttled process, as it requires a full vnode scan to complete, with page cache side effects etc. 47 | 48 | - For the period of the rebuild, which can be many hours, AAE exchanges stop for that vnode (but continue for other vnodes). 49 | 50 | This covers off most of what happens for intra-cluster AAE. The same mechanism can also be used for inter-cluster AAE - but the two clusters have to be partitioned identically. 51 | 52 | - This is used for some Riak <-> Riak DC synchronisation (the alternative is a key-listing comparison), and also I think for the riak <-> solr synchronisation 53 | the current Riak <-> Riak one is mangled though, and has all kinds of issues (although not ones that are too hard to resolve or workaround) 54 | 55 | - There are some issues with this setup: 56 | 57 | - there is an overhead of running a parallel keystore (which is necessary due to the need to frequently find things by segment) 58 | 59 | - every distinct key updated ultimately leeds to a range query in the keystore (because of dirty segments) - this has an overhead 60 | 61 | - it is possible for the rebuild to cause a "one slow node" scenario 62 | 63 | - although it is much, much more efficient than key-listing for full-sync multi-DC replication - it requires the n-vals and ring-sizes to be consistent (which is not helpful when trying to find a way to safely change your ring size, or trying to find an efficient backup process) 64 | 65 | Those are the issues scheduled to be addressed in Riak 3.0 66 | 
-------------------------------------------------------------------------------- /docs/SEGMENT_FILTERED_SST.md: -------------------------------------------------------------------------------- 1 | # Segment filtering in LSM trees 2 | 3 | In data stores designed around Log Structured Merge Trees, data it stored in a tree of sorted files (SST files). To find a key in the tree, the SST file whose key range 4 | covers the key is checked starting at the top level, and working down until the first instance of the key is found. 5 | 6 | It is important to read performance that SST files that don't contain the key can provide a negative response in an efficient manner, so that the level can be skipped through without delay. In order to achieve this, some form of [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) is generally used. 7 | 8 | Within the [leveled LSM tree](https://github.com/martinsumner/leveled/tree/master/src), there has been attempt to align these filters with the hashing to a position in a (Tictac) Merkle tree - to allow for the same index to be used to both accelerate fetch misses, but also to skip blocks within an SST when scanning an LSM tree in key order to find a subset of keys associated with particular leaves within a Merkle Tree that represents the data in the store. 9 | 10 | There are two methods which have been investigated for implement this capability: 11 | 12 | - A simple slot-based segment-index (the actual method currently implemented in Leveled); 13 | - A potentially more efficient rice-encoded filter. 14 | 15 | The idea for both is that a single hash function is used (rather than multiple hash functions as in a bloom filter), and that hash function produces the position in the Merkle tree. The same filter can then be used to check for presence, of an individual keys, or for multiple keys located in a subset of segment IDs. 16 | 17 | ## Slot-based Segment Index 18 | 19 | Within Leveled, two bloom filters are used. 
Each file has its own multi-hash bloom filter, which is designed to be small with a relatively high false positive rate. This is used as an initial filter to prevent lookups to a file. 20 | 21 | Once this has been passed, the SST file is divided into compressed blocks of (24 to 28 keys and values), each set of five blocks is held within a slot, and the SST file maintains a mapping of key ranges to slots. A slot can contain up to 128 Keys/Values, some of which may be non-lookup keys (e.g. index entries) which don't require creation of a bloom entry as they will never be directly access outside of folds. 22 | 23 | The second level of bloom filter, the simple segment index, is kept in-memory for each slot. The segment index is built from either 2-byte positions, or 1-byte gaps. 24 | 25 | 2-byte positions are of the form `<<1:1/integer, SegID:15/integer>>`, where SegID is 15 bits of the Merkle tree segment ID for the key. 26 | 27 | 1-byte gaps are of the form `<<0:1/integer, Gap:7/integer>>`, where Gap is the count of entries in the slot between the last indexed entry and this which have no index entry. 28 | 29 | This is less efficient than a bloom filter, as for the size the false positive rate is 1:256 as opposed to 1:2180 for an optimised bloom of equivalent size. However, it yields additional information, notably the block or blocks which contain the Key, and the position of the Key within the block. 30 | 31 | ## Rice-encoded Segment Filter 32 | 33 | A Rice-encoded filter is a way of providing improved memory efficiency compared to the Slot-based Segment Index. Rice-encoding filter is a bloom filter based on a single hash function (as above), but now the filter is packed using [rice encoding](https://en.wikipedia.org/wiki/Golomb_coding), which encodes the bloom an array of deltas based on the assumption of roughly equal spacing between deltas - which should be an expected outcome of using a 'good' hash function. 
34 | 35 | So if there are 15-bits to the hash in the bloom, and 128 keys in the bloom, then it can be assumed that the deltas are around 256 numbers apart, and so an 8-bit remainder is used, and: 36 | 37 | - A delta of 255 would be represented as 0 1111 1111; 38 | - A delta of 257 would be represented as 10 0000 0001; 39 | - A delta of 1000 would be represented as 1110 1101 1000. 40 | 41 | So the approximate overall size of the filter will be around 10 bits per key, for the same false positive rate as using a 16-bit per key Segment Index. 42 | 43 | This no longer reveals the position of the entry in the slot - but, most of the value in revealing the position comes from knowing just the block identifier (as the difference in small blocks between lists:nth/2 and lists:keyfind/3 is expected to be marginal). 44 | 45 | The block knowledge problem could be resolved by bit-shifting the segment ID and adding a 3-bit position ID to the segmentID before encoding, and this would allow for either a 3-bit reduction in size, or a 8x improvement in false positive rate. 46 | 47 | 48 | ## Efficiency of Segment-filtering 49 | 50 | In the slot-based segment index there is a false-positive rate per slot of 1:256. With the same size Rice-encoded filter this can be improved to 1:2048 if the exact position is dropped from the requirement, and the same size filter is used. 51 | 52 | If there is a LSM tree containing 10M keys, than across the tree there will be approximately 90K slots, and 450K blocks. 53 | 54 | If we want to scan for 32 different segment IDs - there will be around 600 blocks which need to be opened to find all the keys within those segments. Using a slot-based segment index around 90% of the blocks will be skipped, but 100 times more blocks will be opened than is optimal. Using the 4-bits of improvement in the rice encoded example will lead to an improvement greater than one order of magnitude - with > 98% skipped, and only around 6K blocks unnecessarily being opened. 
55 | 56 | However, this assumes that the Segment IDs we're looking for are evenly distributed. In the current implementation when we look for a subset of SegmentIDs, it will look for the subset SegmentIDs that are numerically closest. 57 | 58 | In these filters we use only 15 or 19-bits of the SegmentID. If we chose those bits by performing a bsr operation on the 20-bit SegmentID, we can gain further efficiency by finding sets of SegmentIDs that overlap to the common SegmentIDs within filters. 59 | -------------------------------------------------------------------------------- /test/end_to_end/fold_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(fold_SUITE). 2 | -include_lib("common_test/include/ct.hrl"). 3 | -export([all/0, init_per_suite/1, end_per_suite/1]). 4 | -export([ 5 | aae_fold_keyorder/1, 6 | aae_fold_segmentorder/1 7 | ]). 8 | 9 | all() -> 10 | [ 11 | aae_fold_keyorder, 12 | aae_fold_segmentorder 13 | ]. 14 | 15 | init_per_suite(Config) -> 16 | testutil:init_per_suite([{suite, "fold"} | Config]), 17 | Config. 18 | 19 | end_per_suite(Config) -> 20 | testutil:end_per_suite(Config). 21 | 22 | aae_fold_keyorder(_Config) -> 23 | aae_fold_tester(leveled_ko, 50000). 24 | 25 | aae_fold_segmentorder(_Config) -> 26 | aae_fold_tester(leveled_so, 50000). 
27 | 28 | aae_fold_tester(ParallelStoreType, KeyCount) -> 29 | RootPath = testutil:reset_filestructure(), 30 | FoldPath1 = filename:join(RootPath, "folder1/"), 31 | SplitF = 32 | fun(X) -> 33 | T = binary_to_term(X), 34 | {rand:uniform(1000), 1, 0, element(1, T), element(2, T)} 35 | end, 36 | 37 | {ok, Cntrl1} = 38 | aae_controller:aae_start( 39 | {parallel, ParallelStoreType}, 40 | true, 41 | {1, 300}, 42 | [{2, 0}, {2, 1}], 43 | FoldPath1, 44 | SplitF 45 | ), 46 | 47 | BKVListXS = testutil:gen_keys([], KeyCount), 48 | 49 | {SWLowMegaS, SWLowS, _SWLowMicroS} = os:timestamp(), 50 | timer:sleep(1000), 51 | ok = testutil:put_keys(Cntrl1, 2, BKVListXS, none), 52 | timer:sleep(1000), 53 | {SWHighMegaS, SWHighS, _SWHighMicroS} = os:timestamp(), 54 | BucketList = [integer_to_binary(1), integer_to_binary(3)], 55 | FoldElements = [{clock, null}, {md, null}], 56 | FoldFun = 57 | fun(B, _K, ElementList, {B1Count, B3Count}) -> 58 | {clock, FoldClock} = lists:keyfind(clock, 1, ElementList), 59 | {md, FoldMD} = lists:keyfind(md, 1, ElementList), 60 | case binary_to_term(FoldMD) of 61 | [{clock, FoldClock}] -> 62 | case B of 63 | <<"1">> -> 64 | {B1Count + 1, B3Count}; 65 | <<"3">> -> 66 | {B1Count, B3Count + 1} 67 | end 68 | end 69 | end, 70 | InitAcc = {0, 0}, 71 | {async, Runner1} = 72 | aae_controller:aae_fold( 73 | Cntrl1, 74 | {buckets, BucketList}, 75 | all, 76 | all, 77 | false, 78 | FoldFun, 79 | InitAcc, 80 | FoldElements 81 | ), 82 | true = {KeyCount div 5, KeyCount div 5} == Runner1(), 83 | 84 | {async, Runner2} = 85 | aae_controller:aae_fold( 86 | Cntrl1, 87 | {buckets, BucketList}, 88 | all, 89 | {SWLowMegaS * 1000000 + SWLowS, SWHighMegaS * 1000000 + SWHighS}, 90 | false, 91 | FoldFun, 92 | InitAcc, 93 | FoldElements 94 | ), 95 | true = {KeyCount div 5, KeyCount div 5} == Runner2(), 96 | 97 | {async, Runner3} = 98 | aae_controller:aae_fold( 99 | Cntrl1, 100 | {buckets, BucketList}, 101 | all, 102 | {0, SWLowMegaS * 1000000 + SWLowS}, 103 | false, 104 | FoldFun, 
105 | InitAcc, 106 | FoldElements 107 | ), 108 | 109 | {0, 0} = Runner3(), 110 | 111 | {async, Runner4} = 112 | aae_controller:aae_fold( 113 | Cntrl1, 114 | {buckets, BucketList}, 115 | all, 116 | { 117 | SWHighMegaS * 1000000 + SWHighS, 118 | SWHighMegaS * 1000000 + SWHighS + 60 119 | }, 120 | false, 121 | FoldFun, 122 | InitAcc, 123 | FoldElements 124 | ), 125 | {0, 0} = Runner4(), 126 | 127 | {async, Runner5} = 128 | aae_controller:aae_fold( 129 | Cntrl1, 130 | {buckets, BucketList}, 131 | all, 132 | all, 133 | 2000, 134 | FoldFun, 135 | InitAcc, 136 | FoldElements 137 | ), 138 | case ParallelStoreType of 139 | leveled_ko -> 140 | {0, {2000, 0}} = Runner5(); 141 | leveled_so -> 142 | true = 143 | {-1, {KeyCount div 5, KeyCount div 5}} == Runner5() 144 | end, 145 | 146 | {async, Runner6} = 147 | aae_controller:aae_fold( 148 | Cntrl1, 149 | {buckets, BucketList}, 150 | all, 151 | {SWLowMegaS * 1000000 + SWLowS, SWHighMegaS * 1000000 + SWHighS}, 152 | 2000, 153 | FoldFun, 154 | InitAcc, 155 | FoldElements 156 | ), 157 | case ParallelStoreType of 158 | leveled_ko -> 159 | {0, {2000, 0}} = Runner6(); 160 | leveled_so -> 161 | true = 162 | {-1, {KeyCount div 5, KeyCount div 5}} == Runner6() 163 | end, 164 | 165 | BKVSL = lists:sublist(BKVListXS, KeyCount - 1000, 128), 166 | SegMapFun = 167 | fun({B, K, _VV}) -> 168 | BinK = aae_util:make_binarykey(B, K), 169 | Seg32 = leveled_tictac:keyto_segment32(BinK), 170 | leveled_tictac:get_segment(Seg32, small) 171 | end, 172 | SegList = lists:map(SegMapFun, BKVSL), 173 | BKVSL_ByBL = 174 | lists:filter( 175 | fun({B, _K, _V}) -> lists:member(B, BucketList) end, 176 | BKVSL 177 | ), 178 | FoldClocksElements = [{clock, null}], 179 | FoldClocksFun = 180 | fun(B, K, ElementList, Acc) -> 181 | {clock, FoldClock} = lists:keyfind(clock, 1, ElementList), 182 | [{B, K, FoldClock} | Acc] 183 | end, 184 | 185 | {async, Runner7} = 186 | aae_controller:aae_fold( 187 | Cntrl1, 188 | {buckets, BucketList}, 189 | {segments, SegList, small}, 
190 | all, 191 | false, 192 | FoldClocksFun, 193 | [], 194 | FoldClocksElements 195 | ), 196 | 197 | FetchedClocks = Runner7(), 198 | io:format( 199 | "Fetched ~w clocks with segment filter~n", 200 | [length(FetchedClocks)] 201 | ), 202 | true = 203 | [] == lists:subtract(BKVSL_ByBL, FetchedClocks), 204 | % Found all the Keys and clocks in the list 205 | true = 206 | (KeyCount div 64) > length(lists:subtract(FetchedClocks, BKVSL_ByBL)), 207 | % Didn't find "too many" others due to collisions on segment 208 | 209 | {async, Runner8} = aae_controller:aae_bucketlist(Cntrl1), 210 | ListOfBuckets = Runner8(), 211 | true = length(ListOfBuckets) == 5, 212 | true = lists:usort(ListOfBuckets) == ListOfBuckets, 213 | % There are five buckets - they are found in the expected order 214 | 215 | ok = aae_controller:aae_close(Cntrl1), 216 | RootPath = testutil:reset_filestructure(). 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KV Tictac Tree 2 | 3 | [![Build Status](https://github.com/martinsumner/kv_index_tictactree/actions/workflows/erlang.yml/badge.svg?branch=develop-3.4)](https://github.com/martinsumner/kv_index_tictactree/actions) 4 | 5 | An Active Anti-Entropy library for Key-Value stores in Erlang. 6 | 7 | This is currently a working prototype with basic testing. The target for the library is to be fully integrated with [Riak KV](https://github.com/basho/riak_kv) for Release 3.0 (Autumn 2018). 8 | 9 | The library could in theory be used by any Erlang application wanting to use Merkle trees to compare different data stores, it is designed for Riak but not coupled to Riak. It is not though a general substitute for Merkle trees when the cryptographic strength of Merkle trees is of importance (e.g. a blockchain implementation). 10 | 11 | ## Overview 12 | 13 | Library to provide an Active-Anti-Entropy (AAE) capability in a KV store. 
The AAE functionality is based on that normally provided through [Merkle Trees](https://github.com/basho/riak_core/blob/2.1.9/src/hashtree.erl), but with two changes from standard practice: 14 | 15 | - The Merkle trees are not cryptographically secure (as it is assumed that the system will use them only for comparison between trusted actors over secure channels). This relaxation of security reduces significantly the cost of maintenance, without reducing their effectiveness for comparison over private channels. To differentiate from secure Merkle trees the name TicTac Merkle trees is used. [Further details on Tictac trees can be found here](docs/TICTAC.md). 16 | 17 | - Indexing of key stores within the AAE system can be 2-dimensional, where the store supports scanning by segment within the store as well as the natural order for the store (e.g. key order). The key store used is a Log-Structured Merge tree but the bloom-style indexes that are used within the store to accelerate normal access have been dual-purposed to align with the hashes used to map to a key into the Merkle tree, and therefore to accelerate access per-segment without requiring ordering by segment. [Further details on making bloom-based indexes in LSM trees dual prupose can be found here](docs/SEGMENT_FILTERED_SST.md) 18 | 19 | The purpose of these changes, and other small improvements to standard Merkle tree anti-entropy, are to allow for: 20 | 21 | - Supporting Active Anti-Entropy without the need to maintain and synchronise additional `parallel` key stores to provide a tree-ordered view of the store. this depends on the primary store having `native` AAE support, and a `parallel` store may still be used where this support is not available. 22 | 23 | - Cached views of TicTac Merkle trees to be maintained in memory by applying deltas to the trees, so as to avoid the scanning of dirty segments at the point of exchange and allow for immediate exchanges. 
Also the cache will be maintained and kept up-to-date during rebuild activity, to prevent loss of anti-entropy validation during any background rebuild processes. 24 | 25 | - False positive avoidance by double-checking each stage of the exchange (separated by a pause), utilising the low cost of querying the tree, and avoiding the false-negative exchanges associated with timing differences between changes reaching different vnodes. 26 | 27 | - The rapid merging of TicTac Merkle trees across data partitions - so a tree for the whole store can be quickly built from cached views of partitions within the store, and be compared with a matching store that may be partitioned using a different layout. 28 | 29 | - A consistent set of features to be made available between AAE in both `parallel` and `native` key store mode - including the ability to query the AAE store to discover information which otherwise would require expensive object folds. 30 | 31 | - Fully asynchronous API to the AAE controller so that the actual partition (vnode) management process can run an AAE controller without being blocked by AAE activity. 32 | 33 | - Allow for AAE exchanges to compare Keys and Clocks for mismatched segments, not just Keys and Hashes, so repair functions can be targeted at the side of the exchange which is behind - avoiding needlessly duplicated 2-way repairs. 34 | 35 | 36 | ## Primary Actors 37 | 38 | The primary actor in the library is the controller (`aae_controller`) - which provides the API to startup and shutdown a server for which will manage TicTac tree caches (`aae_treecache`) and a parallel Key Store (`aae_keystore` - which may be empty when run in `native` mode). The `aae_controller` can be updated by the actual vnode (partition) manager, and accessed by AAE Exchanges (either directly or also via the vnode manager). 39 | 40 | The AAE exchanges (`aae_exchange`) are finite-state machines which are initialised with a Blue List and a Pink List to compare. 
In the simplest form the two lists can be a single vnode and partition identifier each - or they could be different coverage plans consisting of multiple vnodes and multiple partition identifiers by vnode. The exchanges pass through two root comparison stages (to compare the root of the trees, taking the intersection of branch mismatches from both comparisons), two branch comparison stages, and then a Key and logical identifier exchange based on the leaf segment ID differences found, and finally a repair. 41 | 42 | The AAE exchange should work the same way if two partitions are being compared, or two coverage queries across multiple partitions are being compared. 43 | 44 | [More detail on the design can be found here](docs/DESIGN.md). 45 | 46 | [Some further background information can be found here](https://github.com/martinsumner/leveled/blob/master/docs/ANTI_ENTROPY.md). 47 | 48 | ## Using the Library 49 | 50 | Following the [current tests](https://github.com/martinsumner/kv_index_tictactree/blob/master/test/end_to_end/basic_SUITE.erl) presently provides the simplest guide to using the library. There is also a [`mock_kv_vnode`](https://github.com/martinsumner/kv_index_tictactree/blob/master/test/end_to_end/mock_kv_vnode.erl) process used in these tests, which provides a sample view of how an `aae_controller` could be integrated. 51 | 52 | There are two main branches: 53 | 54 | [`develop-3.1 - default`](https://github.com/martinsumner/kv_index_tictactree/tree/develop-3.1): Target for the Riak 3.1 release with support for OTP 22 and OTP 24; 55 | 56 | [`develop-3.0`](https://github.com/martinsumner/kv_index_tictactree/tree/develop-3.0): Used in the Riak 3.0 release with support for OTP 20 and OTP 22; 57 | 58 | [`develop-2.9`](https://github.com/martinsumner/kv_index_tictactree/tree/develop-2.9): Used in the Riak 2.9 release with support for OTP R16 through to OTP 20.
59 | 60 | ### Contributing and Testing 61 | 62 | The acceptance criteria for updating kv_index_tictactree is that it passes rebar3 dialyzer, xref, eunit, and ct with 100% coverage. 63 | 64 | To have rebar3 execute the full set of tests, run: 65 | 66 | `rebar3 as test do xref, dialyzer, cover --reset, eunit --cover, ct --cover, cover --verbose` 67 | 68 | For those with a Quickcheck license, property-based tests can also be run using: 69 | 70 | `rebar3 as eqc do eunit --module=aae_eqc` 71 | 72 | 73 | ### Riak KV 74 | 75 | [This overview](docs/RIAK_2_AAE.md) details how the current (Riak KV 2.2.5) AAE implementation works. 76 | 77 | [This overview](docs/RIAK_3_AAE.md) details how the target (Riak KV 3.0) AAE implementation is expected to work utilising KV Tictac Trees. 78 | -------------------------------------------------------------------------------- /docs/TICTAC.md: -------------------------------------------------------------------------------- 1 | # Merkle Trees and Tictac Trees 2 | 3 | A [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree) is a tree of hashes arranges so that the value of the hash of any branch, is the hash of the accumulation of hashes below it. This allows for trees which represent the same data, to confirm this synchronisation by transferring only the root of the tree. Also, where there are small deltas between the trees, for the tree to be traversed to quickly identify which tree-positions those deltas are in. 4 | 5 | 6 | ## Standard Merkle Tree (Riak) 7 | 8 | In Riak KV 2.2.5, the hashtree has two levels, each 1024 hashes wide - meaning o(1m) overall segments within each tree. The position of a key in the tree is determined by taking the `erlang:phash2/1` hash of the Bucket and Key. The hash of an individual element to be added to the tree is found by taking a `erlang:phash2/1` [hash of the sorted version vector](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_object.erl#L667-L670). 
To calculate the hash of a segment, the hashtree process takes all of the Keys and Hashes in the segment and performs: 9 | 10 | `hash([{K1, H1}, {K2, H2} .... {Kn, Hn}])` 11 | 12 | This time using a sha hash from the erlang crypto library. This hashing of the list of all sub-elements is then used up to the root of the tree to calculate the parent hashes. 13 | 14 | ## Alternative Merkle Tree (Tictac) 15 | 16 | The Tictac trees still use as the value to hash the sorted list of the version vector, but now the hash for the segment ID is built up as follows: 17 | 18 | `hash({K1, VV1}) xor hash({K2, VV2}) xor .... hash({Kn, VVn})` 19 | 20 | This change weakens the cryptographic security of the Merkle tree, in that it directly exposes deltas i.e. the addition of the same Key and Version Vector will always result in the same hash delta in the tree, regardless of the starting point of the tree. Whereas, if the same change has been made to two different trees with strong Merkle trees, the delta in the tree would not be predictable. 21 | 22 | In this context, it is determined that the cryptographic strength isn't important. All actors already have access to data, have a secure communication path, and the purpose of the tree is simply to identify deltas and not to determine the integrity of a change. 23 | 24 | ### Supporting PUT 25 | 26 | The result of this change is that if we know for K1, the old version vector (VV1a) and a new version vector (VV1b), we can determine the hash change to be applied to the tree with just this knowledge: 27 | 28 | `Delta = hash({K1, VV1a}) xor hash({K1, VV1b})` 29 | 30 | This Delta can then be applied to each level of the tree up to the root, and the tree reflects the change. Whereas, with the traditional Merkle tree it is first necessary to find **all** the Keys and Hashes within the changed segment, so that a *new* hash for that segment can be calculated (rather than a *delta* applied).
31 | 32 | As well as changing the process of combining hashes, the hash algorithm is changed to (4-bytes of) md5 for both keys and version vectors (relaxing the unnecessary cryptographic strength, and making it easier to produce the Trees outside of Riak, by not depending on the erlang hash function). 33 | 34 | In the current Merkle tree implementation, every change to a tree requires a scan of a key store, but with a TicTac tree, prior knowledge of the old version vector and the current version vector is all that is required to produce the delta. 35 | 36 | Within Riak, for most PUTs the `riak_kv_vnode` will read before write, and so the old version vector is already known - so no extra read cost is required to update the tree. If the update is following the write once path, by definition (assuming developer competence) the previous version vector can be assumed to be empty. 37 | 38 | The exceptional scenario is for updates using Last Write Wins, with a backend not supporting secondary indexes (currently only Bitcask), where the old version vector will not be known by the `riak_kv_vnode` and cannot be assumed to be empty. There are four options for handling this scenario: 39 | 40 | - Do not support AAE for such buckets; 41 | - Force the riak_kv_vnode to end the read-less write optimisation if Tictac AAE is enabled; 42 | - Pass the old version vector through as undefined, and require the Tictac AAE process to use its own Keystore to discover the old version vector before updating; 43 | - Pass the old version vector through as undefined, and require the Tictac AAE process to seek all Keys and Hashes in the segment to recalculate the hash in this case. 44 | 45 | The best approach is to be determined, but it is assumed it would be better to be one that places responsibility for change on the AAE process, not the existing vnode code. 
46 | 47 | ### Coverage Implications 48 | 49 | In most cases, the cost of altering the tree on PUT is reduced dramatically by switching to Tictac trees. However, the biggest benefit is with regards to merging trees. 50 | 51 | Currently, when trying to perform a full-sync operation between two Riak clusters, this can be done either through key-listing or AAE exchange. The key-listing approach compares the two clusters one vnode at a time (over a covering set of vnodes), and in this case this is an implementation choice to throttle the process. 52 | 53 | The AAE exchange approach also runs the comparison one vnode at a time, but there is no choice in this regard - as the AAE trees are separated out on a per-vnode basis, and it is impossible to merge two Merkle trees without access to all the underlying keys and hashes within both Merkle Trees. 54 | 55 | However, to merge two Tictac trees that cover non-overlapping sets of data, for each segment the result is simply: 56 | 57 | `hash(SegA) xor hash(SegB)` 58 | 59 | So it would be possible to take just the trees from a covering set of vnodes, and without any knowledge of the underlying Keys and Hashes merge those trees (or indeed just the roots of those trees). This means that a covering set of vnodes can efficiently combine all their Tictac trees to produce a single Tictac tree to represent the whole cluster. 60 | 61 | Crucially, this allows for synchronisation between database clusters with different patterns of data partitioning. This would mean that an AAE full-sync process could be run: 62 | 63 | - To aid in migration between clusters of different ring size, working around the issues of ring re-sizing being deprecated in Riak. 64 | - As part of a backup approach, as it will be possible to AAE full-sync replicate to a cluster that not just has a different node count (e.g. a node count of 1), backends (e.g.
to one that is rsync friendly), ring-size (one that is optimal for a smaller cluster size) but also different n-vals (e.g. n-val of 1). 65 | - To make synchronisation between Riak and an alternate database management system easier, assuming that alternative database can also maintain a database-wide Tictac tree. 66 | 67 | 68 | ## Naming 69 | 70 | The name Tictac is taken from the [Tic-Tac language used by on-course bookmakers](https://en.wikipedia.org/wiki/Tic-tac), which was a non-secure but efficient way of communicating deltas in a wide market to a participant in the market. 71 | 72 | This variation in Merkle trees is not novel, in that the use of the less secure XOR operation is known to be used within the [Cassandra database](http://distributeddatastore.blogspot.co.uk/2013/07/cassandra-using-merkle-trees-to-detect.html). However, the overall pattern of anti-entropy in Cassandra is different, with [trees being built and destroyed on demand](https://wiki.apache.org/cassandra/AntiEntropy) rather than being cached and merged. 73 | -------------------------------------------------------------------------------- /test/end_to_end/testutil.erl: -------------------------------------------------------------------------------- 1 | -module(testutil). 2 | 3 | -export([ 4 | gen_keys/2, 5 | gen_keys/3, 6 | put_keys/3, 7 | put_keys/4, 8 | remove_keys/3, 9 | gen_riakobjects/3, 10 | get_modify_functions/1 11 | ]). 12 | -export([calc_preflist/2]). 13 | -export([ 14 | start_receiver/0, 15 | exchange_sendfun/1, 16 | exchange_vnodesendfun/1, 17 | repair_fun/3 18 | ]). 19 | -export([ 20 | reset_filestructure/0, 21 | reset_filestructure/2 22 | ]). 23 | 24 | -export([init_per_suite/1, end_per_suite/1]). 25 | 26 | -include("testutil.hrl"). 27 | 28 | -define(ROOT_PATH, "test/"). 
29 | 30 | init_per_suite(Config) -> 31 | LogTemplate = 32 | [ 33 | time, 34 | " [", 35 | level, 36 | "] ", 37 | {pid, [pid, "@"], []}, 38 | {mfa, [mfa, ":"], []}, 39 | " ", 40 | msg, 41 | "\n" 42 | ], 43 | LogFormatter = 44 | { 45 | logger_formatter, 46 | #{ 47 | time_designator => $\s, 48 | template => LogTemplate 49 | } 50 | }, 51 | {suite, SUITEName} = lists:keyfind(suite, 1, Config), 52 | FileName = "kvtictac_" ++ SUITEName ++ "_ct.log", 53 | LogConfig = 54 | #{ 55 | config => 56 | #{ 57 | file => FileName, 58 | max_no_files => 5 59 | } 60 | }, 61 | 62 | ok = logger:add_handler(logfile, logger_std_h, LogConfig), 63 | ok = logger:set_handler_config(logfile, formatter, LogFormatter), 64 | ok = logger:set_handler_config(logfile, level, info), 65 | 66 | ok = logger:set_handler_config(default, level, notice), 67 | ok = logger:set_handler_config(cth_log_redirect, level, notice), 68 | 69 | ok = logger:set_primary_config(level, info), 70 | 71 | Config. 72 | 73 | end_per_suite(_Config) -> 74 | ok = logger:remove_handler(logfile), 75 | ok = logger:set_primary_config(level, notice), 76 | ok = logger:set_handler_config(default, level, all), 77 | ok = logger:set_handler_config(cth_log_redirect, level, all), 78 | 79 | ok. 80 | 81 | reset_filestructure() -> 82 | reset_filestructure(0, ?ROOT_PATH). 83 | 84 | reset_filestructure(Wait, RootPath) -> 85 | io:format( 86 | "Waiting ~w ms to give a chance for all file closes " ++ 87 | "to complete~n", 88 | [Wait] 89 | ), 90 | timer:sleep(Wait), 91 | clear_all(RootPath), 92 | RootPath. 93 | 94 | clear_all(RootPath) -> 95 | ok = filelib:ensure_dir(RootPath), 96 | {ok, FNs} = file:list_dir(RootPath), 97 | FoldFun = 98 | fun(FN) -> 99 | FFP = filename:join(RootPath, FN), 100 | case filelib:is_dir(FFP) of 101 | true -> 102 | clear_all(FFP ++ "/"); 103 | false -> 104 | case filelib:is_file(FFP) of 105 | true -> 106 | file:delete(FFP); 107 | false -> 108 | ok 109 | end 110 | end 111 | end, 112 | lists:foreach(FoldFun, FNs). 
113 | 114 | gen_keys(KeyList, Count) -> 115 | gen_keys(KeyList, Count, spread_over_buckets). 116 | 117 | gen_keys(KeyList, Count, Floor) when is_integer(Floor) -> 118 | gen_keys(KeyList, Count, spread_over_buckets, Floor); 119 | gen_keys(KeyList, Count, BucketSpec) -> 120 | gen_keys(KeyList, Count, BucketSpec, 0). 121 | 122 | gen_keys(KeyList, Count, _, Floor) when Count == Floor -> 123 | KeyList; 124 | gen_keys(KeyList, Count, BucketSpec, Floor) -> 125 | Bucket = 126 | case BucketSpec of 127 | spread_over_buckets -> integer_to_binary(Count rem 5); 128 | _ -> BucketSpec 129 | end, 130 | Key = list_to_binary(string:right(integer_to_list(Count), 6, $0)), 131 | VersionVector = add_randomincrement([]), 132 | gen_keys( 133 | [{Bucket, Key, VersionVector} | KeyList], 134 | Count - 1, 135 | BucketSpec, 136 | Floor 137 | ). 138 | 139 | put_keys(Cntrl, NVal, KL) -> 140 | put_keys(Cntrl, NVal, KL, none). 141 | 142 | put_keys(_Cntrl, _Nval, [], _PrevVV) -> 143 | ok; 144 | put_keys(Cntrl, Nval, [{Bucket, Key, VersionVector} | Tail], PrevVV) -> 145 | ok = aae_controller:aae_put( 146 | Cntrl, 147 | calc_preflist(Key, Nval), 148 | Bucket, 149 | Key, 150 | VersionVector, 151 | PrevVV, 152 | term_to_binary( 153 | {[os:timestamp()], term_to_binary([{clock, VersionVector}])} 154 | ) 155 | ), 156 | put_keys(Cntrl, Nval, Tail, PrevVV). 157 | 158 | remove_keys(_Cntrl, _Nval, []) -> 159 | ok; 160 | remove_keys(Cntrl, Nval, [{Bucket, Key, _VV} | Tail]) -> 161 | ok = aae_controller:aae_put( 162 | Cntrl, 163 | calc_preflist(Key, Nval), 164 | Bucket, 165 | Key, 166 | none, 167 | undefined, 168 | <<>> 169 | ), 170 | remove_keys(Cntrl, Nval, Tail). 
171 | 172 | gen_riakobjects(0, ObjectList, _TupleBuckets) -> 173 | ObjectList; 174 | gen_riakobjects(Count, ObjectList, TupleBuckets) -> 175 | Bucket = 176 | case TupleBuckets of 177 | true -> 178 | {?BUCKET_TYPE, integer_to_binary(Count rem 5)}; 179 | false -> 180 | integer_to_binary(Count rem 5) 181 | end, 182 | Key = list_to_binary(string:right(integer_to_list(Count), 6, $0)), 183 | Value = crypto:strong_rand_bytes(512), 184 | MD = [ 185 | {last_modified_date, os:timestamp()}, 186 | {random, rand:uniform(3)} 187 | ], 188 | Obj = #r_object{ 189 | bucket = Bucket, 190 | key = Key, 191 | contents = [#r_content{metadata = MD, value = Value}] 192 | }, 193 | gen_riakobjects(Count - 1, [Obj | ObjectList], TupleBuckets). 194 | 195 | get_modify_functions(PreflistFun) -> 196 | PutFun = 197 | fun(Store1, Store2) -> 198 | OtherStores = 199 | case Store2 of 200 | none -> []; 201 | Store2 -> [Store2] 202 | end, 203 | fun(Object) -> 204 | PL = PreflistFun(null, Object#r_object.key), 205 | mock_kv_vnode:put(Store1, Object, PL, OtherStores) 206 | end 207 | end, 208 | DeleteFun = 209 | fun(Stores) -> 210 | fun(Object) -> 211 | PL = PreflistFun(null, Object#r_object.key), 212 | lists:foreach( 213 | fun(Store) -> 214 | mock_kv_vnode:backend_delete( 215 | Store, 216 | Object#r_object.bucket, 217 | Object#r_object.key, 218 | PL 219 | ) 220 | end, 221 | Stores 222 | ) 223 | end 224 | end, 225 | RehashFun = 226 | fun(Stores) -> 227 | fun(Object) -> 228 | PL = PreflistFun(null, Object#r_object.key), 229 | lists:foreach( 230 | fun(Store) -> 231 | mock_kv_vnode:rehash( 232 | Store, 233 | Object#r_object.bucket, 234 | Object#r_object.key, 235 | PL 236 | ) 237 | end, 238 | Stores 239 | ) 240 | end 241 | end, 242 | {PutFun, DeleteFun, RehashFun}. 
243 | 244 | add_randomincrement(Clock) -> 245 | RandIncr = rand:uniform(100), 246 | RandNode = 247 | lists:nth( 248 | rand:uniform(9), 249 | [ 250 | <<"a">>, 251 | <<"b">>, 252 | <<"c">>, 253 | <<"d">>, 254 | <<"e">>, 255 | <<"f">>, 256 | <<"g">>, 257 | <<"h">>, 258 | <<"i">> 259 | ] 260 | ), 261 | UpdClock = 262 | case lists:keytake(RandNode, 1, Clock) of 263 | false -> 264 | [{RandNode, RandIncr} | Clock]; 265 | {value, {RandNode, Incr0}, Rest} -> 266 | [{RandNode, Incr0 + RandIncr} | Rest] 267 | end, 268 | lists:usort(UpdClock). 269 | 270 | calc_preflist(Key, 2) -> 271 | case erlang:phash2(Key) band 3 of 272 | 0 -> 273 | {2, 0}; 274 | _ -> 275 | {2, 1} 276 | end; 277 | calc_preflist(Key, 3) -> 278 | case erlang:phash2(Key) band 3 of 279 | 0 -> 280 | {3, 0}; 281 | 1 -> 282 | {3, 1}; 283 | _ -> 284 | {3, 2} 285 | end. 286 | 287 | start_receiver() -> 288 | receive 289 | {result, Reply} -> 290 | Reply 291 | end. 292 | 293 | exchange_sendfun(Cntrl) -> 294 | SendFun = 295 | fun(Msg, Preflists, Colour) -> 296 | RPid = self(), 297 | ReturnFun = 298 | fun(R) -> 299 | aae_exchange:reply(RPid, R, Colour) 300 | end, 301 | case Msg of 302 | fetch_root -> 303 | aae_controller:aae_mergeroot( 304 | Cntrl, 305 | Preflists, 306 | ReturnFun 307 | ); 308 | {fetch_branches, BranchIDs} -> 309 | aae_controller:aae_mergebranches( 310 | Cntrl, 311 | Preflists, 312 | BranchIDs, 313 | ReturnFun 314 | ); 315 | {fetch_clocks, SegmentIDs} -> 316 | aae_controller:aae_fetchclocks( 317 | Cntrl, 318 | Preflists, 319 | SegmentIDs, 320 | ReturnFun, 321 | null 322 | ) 323 | end 324 | end, 325 | SendFun. 326 | 327 | exchange_vnodesendfun(VN) -> 328 | fun(Msg, Preflists, Colour) -> 329 | RPid = self(), 330 | ReturnFun = 331 | fun(R) -> 332 | aae_exchange:reply(RPid, R, Colour) 333 | end, 334 | mock_kv_vnode:exchange_message(VN, Msg, Preflists, ReturnFun) 335 | end. 
336 | 337 | repair_fun(SourceList, Cntrl, NVal) -> 338 | Lookup = lists:map(fun({B, K, V}) -> {{B, K}, V} end, SourceList), 339 | RepairFun = 340 | fun(BucketKeyL) -> 341 | FoldFun = 342 | fun({{B0, K0}, _VCDelta}, Acc) -> 343 | {{B0, K0}, V0} = lists:keyfind({B0, K0}, 1, Lookup), 344 | [{B0, K0, V0} | Acc] 345 | end, 346 | KVL = lists:foldl(FoldFun, [], BucketKeyL), 347 | ok = put_keys(Cntrl, NVal, KVL) 348 | end, 349 | RepairFun. 350 | -------------------------------------------------------------------------------- /docs/GENERAL_TICTACAAE_FOR_RIAK.md: -------------------------------------------------------------------------------- 1 | # Background 2 | 3 | Further helpful details on the background to this work can be found in the previous [Anti-Entropy](ANTI_ENTROPY.md) write-up. 4 | 5 | The aim is to provide a better answer to the active anti-entropy in Riak. Specifically, it would be preferable to resolve the following issues: 6 | 7 | - Rebuild times. Both the cost of rebuilds but also the cost in the failure of AAE-dependent processes during rebuilds. For example, due to the [rate-limiting of rebuilds](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_kv_index_hashtree.erl#L98-L101), rebuilding a single vnode can take o(10) hours. during this rebuild time, these partitions are not subject to internal AAE, and Multi-Data Centre AAE [may be blocked altogether](https://github.com/basho/riak_repl/issues/772). 8 | 9 | - Version inconsistencies. The process of trying to make the transition from one version of AAE to another smooth, is potentially [too disruptive](https://github.com/basho/riak_kv/issues/1659), and leaves a long legacy in [future versions](https://github.com/basho/riak_kv/issues/1656). 10 | 11 | - Cost of AAE. Every AAE exchange requires in effect a [range scan](https://github.com/basho/riak_core/blob/2.1.9/src/hashtree.erl#L65-L72) in the key-store for every key updated since the last exchange for that partition. 
This contributes to a 10% performance overhead associated with running AAE. 12 | 13 | - Support for native AAE support within backends. The Leveled backend can support optimisations for by-segment scanning over its native key-store, potentially rendering the need to keep (and periodically rebuild) a dedicated key-store for AAE unnecessary. It would be beneficial to have an improved AAE that can exploit this advantage, without preventing the anti-entropy solution from being used on backends that would require a dedicated anti-entropy store. 14 | 15 | # Overview Of Needs 16 | 17 | The high level changes proposed are: 18 | 19 | - Have an AAE solution per vnode where the key-store is both optional (and so can be avoided where native support renders it unnecessary), and has swappable backends (including a pure Erlang alternative to Leveldb). 20 | 21 | - Keep the actual AAE Merkle Trees cached using TicTac trees to support updates to the tree without scanning. 22 | 23 | - Use per-partition TicTac trees so that the Merkle trees can be merged across vnodes, to make AAE backed synchronisation possible between clusters of different ring sizes. 24 | 25 | - Allow rebuilds to take place in parallel to maintaining the old store and cache of the Merkle tree - so exchanges can continue through the rebuild process. 26 | 27 | - Formalise the use of dotted version vector as the basis for the object hash to reduce the cost of object binary changes and copying. Also allow for intelligent comparisons between clusters by exchanging keys & clocks, not just keys & hashes. 28 | 29 | - Have the new AAE solution work in parallel to the legacy solution, so that migration is controlled through administration/configuration, and the legacy solution can be permanently forgotten by the cluster. 30 | 31 | - Externalise the AAE functions, so that the same functions can be used for synchronisation with different database platforms, without requiring internal changes to Riak.
32 | 33 | # AAE design 34 | 35 | ## Actors, States and Messages 36 | 37 | The primary actors: 38 | 39 | - AAEController (1 per vnode) - gen_fsm 40 | 41 | - KeyStore (1 per Controller) - gen_server 42 | 43 | - TreeCache (n per Controller) - gen_fsm 44 | 45 | - DiskLog (temporary - 1 per Controller) - gen_server 46 | 47 | ### AAEController 48 | 49 | The AAEController will have 4 states: `starting`, `replacing-store`, `replacing-tree` and `steady`. In all states except `starting` an exchange will be possible. 50 | 51 | The AAEController can receive data updates from the vnode in four forms: 52 | 53 | - {IndexN, Bucket, Key, CurrentClock, unidentified} for PUTs marshalled via the blind_put (for LWW buckets without 2i support in the backend e.g. LWW -> Bitcask), or when a rehash request has been made for a single object; 54 | 55 | - {IndexN, Bucket, Key, CurrentClock, PreviousClock} for standard object updates (PreviousClock will be none for fresh objects); 56 | 57 | - {IndexN, Bucket, Key, none, PreviousClock} for actual backend deletes (e.g. post tombstone). 58 | 59 | The AAE Controller will handle the casting or calling of these messages by casting a message to the appropriate TreeCache to prompt an update, and then adding the update to a queue to be batch written to the KeyStore. There is an additional penalty for changes where the PreviousClock is unidentified in that they will require a range scan of the KeyStore to generate the TreeCache update message. 60 | 61 | The AAE controller may also receive requests to retrieve the branch or leaf hashes for a given partition TreeCache, as well as trigger rebuilds or rehashes.
62 | 63 | ### KeyStore 64 | 65 | The KeyStore needs to support four operations: 66 | 67 | - A batch PUT of objects 68 | 69 | - An object fold bounded by a range 70 | 71 | - An is_empty check 72 | 73 | - A GET of a single object 74 | 75 | On startup the AAEController must be informed by the vnode the is_empty status of the actual vnode key store, and this should match the is_empty status of the AAE key store. If there is a mismatch then the KeyStore must be rebuilt before the AAEController can exit the `starting` state. 76 | 77 | As vnode changes are made, these changes should be reflected in batches in the KeyStore. The Key for the entry in the KeyStore should be a tuple of `{IndexN, SegmentID, Bucket, Key}` where SegmentID is the hash of the Bucket and Key used to map the identifier to a location in the merkle tree. The Value of the object should be a tuple of `{VectorClock, Hash}`. 78 | 79 | Activity in the KeyStore should be optimised for the vast majority of traffic being PUTs. Queries are only used for the KeyStore when: 80 | 81 | - Folding over all objects by IndexN and SegmentID to return Keys/Clocks for a given segment; 82 | 83 | - Folding over all objects to recalculate an AAE tree for each IndexN; 84 | 85 | - Fetching of a specific object by IndexN, SegmentID, Bucket and Key to recalculate a specific hash in the AAE tree when the update to the AAEController has a PreviousClock of `unidentified`. 86 | 87 | When a KeyStore needs to be rebuilt, a new KeyStore is started, but the old KeyStore should continue to receive updates, and be used to fulfil requests for Keys and Clocks and to read `unidentified` Clocks. Only once the new store is complete, should the old store be destroyed. 88 | 89 | A manifest file should be kept to indicate which is the current active store to be used on a restart. 
90 | 91 | If the vnode backend has native support for the queries required by the AAE KeyStore, then the KeyStore can be run in native mode - ignoring the batch puts, and re-directing the queries to the actual vnode backend. In native mode `unidentified` previous clocks cannot be supported (and should not be needed). 92 | 93 | ### TreeCache 94 | 95 | There is a TreeCache process for each IndexN managed by the AAEController. The TreeCache receives changes in the form {SegmentID, HashChange}. The HashChange is calculated by performing an XOR operation on the hash of the current clock, and the hash of the previous clock. The SegmentID is calculated from the hash of the concatenated Bucket and Key binary (`<<Bucket/binary, Key/binary>>`). 96 | 97 | The TreeCache process should respond to each update by changing the tree to reflect that update. 98 | 99 | The TreeCache can be in a `starting` state, for instance when a new cache is being built by the AAEController in the `replacing-tree` state. In the starting state the TreeCache should not be forwarded requests for AAE tree information. 100 | 101 | 102 | ### DiskLog 103 | 104 | When both replacing a store and replacing a tree, batches of updates need to be cached until the store or tree is ready to receive them. For example, rebuilding the store will start a new KeyStore backend and take a snapshot of the real vnode backend to fold and populate the store. However, the store being rebuilt cannot receive new updates during this rebuild process (without requiring all the updates from the fold to require a read before the PUT) - so the batches of new updates need to be cached in a log, to be applied only once the fold is complete. 105 | 106 | ## Startup and Shutdown 107 | 108 | On shutdown any incomplete batches should be passed to the KeyStore and the KeyStore shutdown. All functioning TreeCaches should be shutdown, and on shutdown should write a CRC-checked file containing the serialised tree.
At the point the shutdown is requested, the TreeCache may be at a more advanced state than the KeyStore, and if sync_on_write is not enabled in the vnode backend the KeyStore could be in advance of the backend. To try and protect against situations on startup where the TreeCache reflects a more advanced state than the actual vnode - the TreeCache should not be persisted until the vnode backend and the AAE KeyStore have both successfully closed. 109 | 110 | On startup, if shutdown was completed normally, the TreeCaches should be restored from disk, as well as the KeyStore. Any partially rebuilt KeyStore should be destroyed. 111 | 112 | On recovering a TreeCache from disk, the TreeCache process should delete the TreeCache from disk before receiving any update. 113 | 114 | If the shutdown was unclean, and there is a KeyStore, but no persisted TreeCache, then before completing startup the AAEController should enforce a fold over the KeyStore to rebuild the TreeCaches. 115 | 116 | If the KeyStore has missing updates due to an abrupt shutdown, this will cause (potentially false) repairs of the keys, and the repair will also trigger a rehash. The rehash should prompt a correction in the AAE KeyStore (through an `unidentified` previous clock) to bring the TreeCache and KeyStore back into line. 117 | 118 | ## Rebuilds and Rehashes 119 | 120 | If an AAE KeyStore is used in non-native mode, periodically the Keystore should be rebuilt, should there be entropy from disk in the actual KeyStore. This is achieved using the `replacing-store` state in the AAEController. 121 | 122 | When replacing a store, the previous version of the store will be kept up to date and used throughout the rebuild process, in order to prevent the blocking of exchanges. The only exception to this is when a rebuild has been prompted by a conflict of `is_empty` properties on startup - in which case the vnode startup process should be paused to allow for the rebuild to complete.
123 | 124 | To avoid the need to do reads before writes when updating the AAE KeyStore from the vnode backend fold (so as not to replace a new update with an older snapshot value from the backend) new updates must be parked in a DiskLog process whilst the fold completes. Once the fold is complete, the rebuild of store can be finished by catching up on updates from the DiskLog. 125 | 126 | At this stage the old Keystore can be deleted, and the new KeyStore be used. At this stage though, the TreeCache does not necessarily reflect the state of the new KeyStore - the `replacing-tree` state is used to resolve this. When replacing the tree, new empty TreeCaches are started and maintained in parallel to the existing TreeCaches (which continue to be used in exchanges). A fold of the KeyStore is now commenced, whilst new updates are cached in a DiskLog. Once the fold is complete, the new updates are applied and the TreeCache can be migrated from the old cache to the new cache. 127 | 128 | 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/RIAK_3_AAE.md: -------------------------------------------------------------------------------- 1 | # Proposed AAE Implementation in Riak 3.0 2 | 3 | - Each riak vnode has a kv_index_tictactree `aae_controller` process if Tictac AAE is enabled. It may also run [Riak 2 AAE](RIAK_2_AAE.md) in parallel - the two AAE implementations should not care about the existence of the other. 4 | 5 | - That `aae_controller` keeps a single `aae_keystore` process which is responsible for keeping all the keys and AAE metadata for the store. The `aae_keystore` process can be any backend that supports query by Merkle tree segment that duplicates the storage of keys; this is `parallel` mode. The `aae_keystore` could also be a reference to the vnode backend PID, if the vnode backend can itself support query by AAE segment; this is `native` mode and doesn't require the duplictaion of key/metadata storage. 
6 | 7 | - As well as the key store, the `aae_controller` process keeps a `aae_treecache` process for each combination of preflist and n_val the vnode supports (or IndexN). This process is an in-memory cache of the current state of the Tictac Merkle tree for that IndexN. 8 | 9 | - IndexN is a reference to a combination of a n-val and a preflist supported by the vnode. If all buckets in the store are n-val 3, then there will be 3 IndexNs per vnode (and hence 3 merkle trees). If there are some buckets with an n-val of 3 and some with an n-val of 4 - there will be 7 merkle trees. 10 | 11 | - There is no longer a concept of dirty segments, each tree cache represents the current live state of the vnode. 12 | 13 | ## On PUT 14 | 15 | - When a vnode receives a PUT request, it passes a change note to the `aae_controller`. The change consists of the {Bucket, Key}, the IndexN, the CurrentClock and the PreviousClock for the PUT - as well as some useful metadata about the value (e.g. sibling count, index_hash, perhaps the whole object head). 16 | 17 | - If the change is a delete the CurrentClock should be none. 18 | 19 | - If the change is a fresh put, the PreviousClock should be none (this includes any PUt on the write once path). 20 | 21 | - If the change is a LWW PUT in an non-index backend, the PreviousClock should be undefined. 22 | 23 | - If the change is a rehash, then the PreviousClock should be undefined. 24 | 25 | - The `aae_controller` receives this update as a cast. It has two tasks now: 26 | 27 | - If the `aae_keystore` is running in `parallel` mode, cast the change on to the keystore. The `aae_keystore` should then queue up the new version for the next batch of keys/metadata to be stored. In `native` mode, no keystore change is required. 28 | 29 | - Based on the IndexN, the `aae_controller` should cast the delta to the appropriate `aae_treecache`. 
This should update the Merkle tree by removing the old version from the tree (by XORing the hash of the {Key, PreviousClock} again), and adding the new version (by XORing the segment by the hash of the {Key, CurrentClock}). 30 | 31 | - The exception to this is when the PreviousClock is undefined - meaning there was no read before write. In this case, the PreviousClock needs to be filled in with a read against the `aae_keystore` before processing. 32 | 33 | - One scenario where the previous clock is `undefined` is when Last Write Wins is used with a non-indexing backend. This removes some of the efficiency advantages of Last Write Wins writes, though doesn't eliminate the latency improvement (as the AAE read does not block the update from proceeding). 34 | 35 | - The other scenario is on a `rehash` request (when a read_repair GET request has not discovered an expected anomaly between the vnode values). 36 | 37 | - The `aae_keystore` may fill-in this information two ways. It could simple read the previous object in the keystore, or it could fold over all objects in that segment and IndexN to calculate an entirely new hash value for that segment. Perhaps the latter should be a fallback for `rehash` requests (i.e. on a dice roll on a rehash request, so rehash eventually causes this) 38 | 39 | - Before any fold operation on the `aae_keystore` the batch of changes waiting to be written are flushed. 40 | 41 | ## On Exchange 42 | 43 | - Riak can then be prompted to do exchanges (maybe via an entropy manager, maybe through scheduled external requests). An exchange could be: 44 | 45 | - An exchange for a given n_val between two coverage plans (with different offsets); 46 | 47 | - An exchange between each locally stored Preflist, and another remote Preflist for that n_val. 48 | 49 | - An exchange between a randomly chosen set of pairs of common IndexNs between vnodes. 50 | 51 | - An exchange is done by starting an `aae_exchange` process. 
The `aae_exchange` is a FSM and should be initiated by the calling service via sidejob. The `aae_exchange` process takes as input: 52 | 53 | - A BlueList and a PinkList - lists of {SendFun, [IndexN]} tuples, where the SendFun is a function that can send a message to a given controller, and the list of IndexNs are the preflist/n_val pairs relative to this exchange at that destination. The SendFun in this case should use the riak_core message passing to reach the riak_kv_vnode - and the riak_kv_vnode will be extended to detect AAE commands and forward them to the `aae_controller`. 54 | 55 | - A RepairFun - that will be passed any deltas, and in the case of intra-cluster anti-entropy the RepairFun should just send a throttled stream of GET requests to invoke read_repair 56 | 57 | - A ReplyFun - to send a response back to the client (giving the state at which the FSM exited, and the number of KeyDeltas discovered. 58 | 59 | - The exchange will request all the tree roots to be fetched from the Blue List and the Pink List - merging to give a Blue root and a Pink root, and comparing those roots. This will provide a list of branch IDs that may potentially have deltas. If the list is empty, the process will reply and exit. 60 | 61 | - The exchange will then pause, and then re-request all the tree roots. This will produce a new list of BranchID deltas from the comparison of the merged roots, and this will be intersected with the first list. If the list is empty, the process will reply and exit. 62 | 63 | - The exchange will then pause, and then request all the branches that were in the intersected list of BranchIDs from the Blue and Pink lists. This will again be a repeated request, with the intersection of the SegmentIDs that differ being taken forward to the next stage, and the process will reply and exit if the list is empty. 
64 | 65 | - The number of SegmentIDs that are taken forward for the clock comparison is bounded, and the code will attempt to choose the set of SegmentIDs that are closest together as the subset to be used. Those SegmentIDs will then be forwarded in a request to `fetch_clocks`. These requests will be passed by the `aae_controller` to the `aae_keystore`, and this will fold over the store (and this will be the vnode store if the `aae_keystore` is running in native mode), looking for all Keys and Version Vectors within those segments and IndexNs. If the keystore is segment-ordered, this will be a series of range folds on the snapshot. If the keystore is key-ordered, then there will be a single fold across the whole store, but before a slot of keys is passed into the fold process it will be checked to see if it contains any key in the segment - and skipped if not. 66 | 67 | - When the Keys and Clocks are returned they are compared, and then deltas are passed to the RepairFun for read repair. 68 | 69 | 70 | ## On startup and shutdown 71 | 72 | - Before a vnode backend is shut down, a special object should be stored where the value is a Shutdown GUID. When a vnode backend is started, the object should be read, and if present should then be deleted. 73 | 74 | - When the `aae_controller` is started it is passed the IsEmpty status of the vnode backend as well as the shutdown GUID. The `aae_keystore` should likewise have the Shutdown GUID on shutdown (and erase it on startup), and on startup confirm that the IsEmpty status and Shutdown GUIDs match between the vnode and the `parallel` keystore. 75 | 76 | - If there is no match on startup, then it cannot be assumed that the two stores are consistent, and the next rebuild time should be set into the past. This should then prompt a rebuild. 
Until the rebuild is complete, the `aae_controller` should continue on a best endeavours basis, assuming that the data in the `aae_treecache` and `aae_keystore` is good enough until the rebuild completes. 77 | 78 | 79 | ## On rebuild 80 | 81 | - Rebuilds are not prompted by the `aae_controller`, they require an external actor to prompt them. 82 | 83 | - The `aae_controller` tracks the next rebuild time, the time when it should next be rebuilt. This is based on adding a rebuild schedule (a fixed time between rebuilds and a random variable time to reduce the chances of rebuild synchronisation across vnodes) to the last rebuild time. The last rebuild time is persisted to be preserved across restarts. 84 | 85 | - The next rebuild time is reset to the past on startup, if a failure to shut down cleanly and consistently either the vnode or the aae service is detected through a mismatch on the Shutdown GUID. 86 | 87 | - The vnode should check the next rebuild time after startup of the `aae_controller`, and schedule a callback to prompt the rebuild at that time. 88 | 89 | - When the rebuild time is reached, the vnode should prompt the rebuild of the store via the `aae_controller`. 90 | 91 | - the prompt should pass in a SplitObjFun which can extract/calculate the IndexN and version vector for a Riak object in the store (this is required only in `parallel` mode). 92 | 93 | - the prompt should first flush all batched updates to the `aae_keystore`, and trigger the `aae_keystore` to enter the `loading` state. 94 | 95 | - in the `loading` state a separate backend for the keystore is started. All updates received from that point are added in batches to the main keystore as normal, but also queued up for the new keystore. 96 | 97 | - a fold objects function and a finish function is returned in response to the prompt request. 
98 | 99 | - the fold fun will load all passed objects in batches as object_specs (by extracting out the vector clock etc) directly into the new load store of the `aae_keystore`. 100 | 101 | - the finish fun will prompt the worker running the fold to prompt the keystore to complete the load. This will involve loading all the queued object specs (of updates received since the fold was started) into the store, deleting the old key store, and making the newly rebuilt key store the master. 102 | 103 | - the vnode process takes these fold functions, and starts an object fold using the functions (if snapshot fold is supported in the backend, then via a riak_core_node_worker so as to avoid multiple parallel folds). 104 | 105 | - the vnode should also request a callback from the worker when the work is completed, to prompt it to prompt the `aae_controller` to now rebuild the tree caches. 106 | 107 | - If the `aae_keystore` is in `native` mode, none of the above happens, as the store is the store, and so there is no need for a rebuild. 108 | 109 | - the `aae_controller` should be prompted to rebuild_trees, and for this IndexNs (the list of IndexNs the vnode currently manages), PreflistFun (a fun to calculate the IndexN from a {B, K} pair - required only in `native` mode), and a WorkerFun (a node/vnode worker to run the next fold) is passed. 110 | 111 | - the `aae_controller` should inform the `aae_treecaches` to start loading, this requires them to queue new updates as well as making the changes in the cache. 112 | 113 | - A fold is then run over the key store (or the vnode store in the case of `native` backends), using the WorkerFun. 114 | 115 | - the fold should incrementally build a separate TicTac tree for each IndexN. 116 | 117 | - when the fold is complete, the trees are sent to the `aae_treecache` processes for loading. 
Loading discards the previous tree, takes the new tree, and loads all the changes which have been queued since the point the snapshot for the fold to build the trees was taken. 118 | 119 | - This completes the rebuild process. It is important that the folds in the rebuild process use snapshots which are co-ordinated with the formation of the load queues - so that the deltas being applied to the load queues takes the system to a consistent point. 120 | 121 | - If there is a shutdown during a rebuild process, all the partially built elements are discarded, and the rebuild will be due again at startup. 122 | 123 | - Scheduling of rebuilds is not centrally managed (so the locks required to reduce concurrency in the existing AAE process are discarded). There is instead a combination of some random factor added to the schedule time, plus the use of the core_node_worker_pool - which prevents more than M folds being on a node concurrently (where M is the size of the pool, normally 1). 124 | 125 | ## Secondary Uses 126 | 127 | - A proof of concept on coverage folds showed that there were some interesting operations that could be managed more safely and efficiently than Map Reduce, using folds over the heads of objects, using a `core_node_worker_pool` and a backend which stores heads separate to objects. In introducing an AAE store where the `aae_keystore` can store additional metadata, perhaps including the whole object head - there exists the potential to bring these features to all backends. 128 | 129 | - Another possibility is the efficient handling of `HEAD` not `GET` requests (for example where only version vector is required). This is only supported in the leveled backend at present, in other backends it can be supported by still reading the object, and just stripping to the head to avoid the network overhead. 
It may be possible for a `riak_kv_vnode` with a bitcask backend to handle `HEAD` requests in this way, unless co-ordination between backend and AAE store is confirmed (because of matching Shutdown GUIDs at startup or a rebuild since startup). In this case the `HEAD` request could instead be handled by the AAE store, avoiding the actula object read. 130 | -------------------------------------------------------------------------------- /src/aae_util.erl: -------------------------------------------------------------------------------- 1 | %% -------- Overview --------- 2 | %% 3 | %% Centralised definition of log functions. To make switching to Lager in 4 | %% the future a bit easier, and avoid repeated codes across modules 5 | 6 | -module(aae_util). 7 | 8 | -include("aae.hrl"). 9 | 10 | -export([ 11 | log/4, 12 | get_log/1, 13 | get_opt/2, 14 | get_opt/3, 15 | make_binarykey/2, 16 | safe_open/1, 17 | set_loglevel/1, 18 | maybe_include_key/2, 19 | check_rootpath/1 20 | ]). 21 | 22 | -export([ 23 | clean_subdir/1, 24 | test_key_generator/1, 25 | flip_byte/3 26 | ]). 27 | 28 | -ifdef(TEST). 29 | -export([get_segmentid/2]). 30 | -endif. 31 | 32 | -type log_level() :: debug | info | warning | error | critical. 33 | -type log_levels() :: list(log_level()) | undefined. 34 | 35 | -export_type([log_levels/0]). 36 | 37 | -define(DOMAIN, [background, tictacaae]). 
38 | 39 | %% erlfmt:ignore-begin 40 | -define(LOGBASE, 41 | #{ 42 | g0001 => 43 | {info, <<"Generic log point">>}, 44 | g0002 => 45 | {info, <<"Generic log point with term ~w">>}, 46 | d0001 => 47 | {debug, <<"Generic debug log">>}, 48 | aae01 => 49 | {warning, 50 | << 51 | "AAE Key Store rebuild required on startup due to mismatch between vnode store state ~w " 52 | "and AAE key store state of ~w maybe restart with node excluded from coverage " 53 | "queries to improve AAE operation until rebuild is complete" 54 | >> 55 | }, 56 | aae02 => 57 | {info, <<"Native KeyStore type ~w startup request">>}, 58 | aae03 => 59 | {debug, 60 | <<"Unexpected Bucket ~w Key ~w passed with IndexN ~w that does not match any of ~w">> 61 | }, 62 | aae04 => 63 | {warning, <<"Misrouted request for IndexN ~w">>}, 64 | aae06 => 65 | {info, <<"Received rebuild trees request for IndexNs ~w">>}, 66 | aae07 => 67 | {info, <<"Dispatching test fold">>}, 68 | aae08 => 69 | {info, <<"Spawned worker receiving test fold">>}, 70 | aae09 => 71 | {info, <<"Change in IndexNs detected at rebuild - new IndexN ~w">>}, 72 | aae10 => 73 | {info, <<"AAE controller started with IndexNs ~w and StoreType ~w">>}, 74 | aae11 => 75 | {info, <<"Next rebuild scheduled for ~w">>}, 76 | aae12 => 77 | {info, <<"Received rebuild store for parallel store ~w">>}, 78 | aae13 => 79 | {info, <<"Completed tree rebuild with rebuild_time_ms=~w">>}, 80 | aae14 => 81 | {debug, <<"Mismatch finding unexpected IndexN in fold of ~w">>}, 82 | aae15 => 83 | {info, <<"Ping showed time difference of ~w ms">>}, 84 | aae16 => 85 | {info, <<"Keystore ~w when tree rebuild requested">>}, 86 | aae17 => 87 | {warning, <<"Corrupted object with B=~p K=~p for ~w ~w">>}, 88 | ex001 => 89 | {info, <<"Exchange id=~s with target_count=~w expected purpose=~w">>}, 90 | ex002 => 91 | {error, <<"~w with pending_state=~w and missing_count=~w for exchange id=~s purpose=~w">>}, 92 | ex003 => 93 | {info, 94 | << 95 | "Normal exit for full exchange purpose=~w 
in_sync=~w pending_state=~w for exchange id=~s " 96 | "scope of mismatched_segments=~w root_compare_loops=~w branch_compare_loops=~w keys_passed_for_repair=~w" 97 | >> 98 | }, 99 | ex004 => 100 | {info, <<"Exchange id=~s purpose=~w led to prompting of repair_count=~w">>}, 101 | ex005 => 102 | {info, <<"Exchange id=~s throttled count=~w at state=~w">>}, 103 | ex006 => 104 | {debug, <<"State change to ~w for exchange id=~s">>}, 105 | ex007 => 106 | {debug, <<"Reply received for colour=~w in exchange id=~s">>}, 107 | ex008 => 108 | {debug, <<"Comparison between BlueList ~w and PinkList ~w">>}, 109 | ex009 => 110 | {info, 111 | << 112 | "Normal exit for full exchange purpose=~w in_sync=~w pending_state=~w for exchange id=~s " 113 | "scope of mismatched_segments=~w tree_compare_loops=~w keys_passed_for_repair=~w" 114 | >> 115 | }, 116 | ex010 => 117 | {warning, <<"Exchange not_supported in exchange id=~s for colour=~w purpose=~w">>}, 118 | ex011 => 119 | {info, <<"Filtered clocks before comparison removing blue=~w pink=~w">>}, 120 | ex012 => 121 | {info, <<"Bucket counts for blue ~0p pink ~0p">>}, 122 | ks001 => 123 | {info, <<"Key Store loading with id=~w has reached deferred count=~w">>}, 124 | ks002 => 125 | {warning, <<"No valid manifest found for AAE keystore at ~s reason ~s">>}, 126 | ks003 => 127 | {info, <<"Storing manifest with current GUID ~s">>}, 128 | ks004 => 129 | {info, <<"Key Store building with id=~w has reached loaded_count=~w">>}, 130 | ks005 => 131 | {info, <<"Clean opening of manifest with current GUID ~s">>}, 132 | ks006 => 133 | {warning, <<"Pending store is garbage and should be deleted at ~s">>}, 134 | ks007 => 135 | {info, <<"Rebuild prompt ~w with GUID ~s">>}, 136 | ks008 => 137 | {info, <<"Rebuild queue load backlog_items=~w loaded_count=~w">>}, 138 | r0001 => 139 | {info, <<"AAE fetch clock runner has seen results=~w query_time=~w for a query_count=~w queries">>}, 140 | r0002 => 141 | {info, <<"Query backlog resulted in dummy fold">>}, 142 | 
r0003 => 143 | {debug, <<"Query complete in time ~w">>}, 144 | r0004 => 145 | {debug, <<"Prompting controller">>}, 146 | r0005 => 147 | {warning, <<"Query lead to error ~w pattern ~w">>}, 148 | c0001 => 149 | {info, <<"Pending filename ~s found and will delete">>}, 150 | c0002 => 151 | {warning, <<"File ~w opened with error=~w so will be ignored">>}, 152 | c0003 => 153 | {info, <<"Saving tree cache to path ~s and filename ~s">>}, 154 | c0004 => 155 | {info, <<"Destroying tree cache for partition ~w">>}, 156 | c0005 => 157 | {info, <<"Starting cache with is_restored=~w and IndexN of ~w">>}, 158 | c0006 => 159 | {debug, <<"Altering segment for PartitionID=~w ID=~w Hash=~w">>}, 160 | c0007 => 161 | {warning, <<"Treecache exiting after trapping exit from Pid=~w">>}, 162 | c0008 => 163 | {info, <<"Complete load of tree with length of change_queue=~w">>}, 164 | c0009 => 165 | {info, <<"During cache rebuild reached length of change_queue=~w">>} 166 | 167 | }). 168 | %% erlfmt:ignore-end 169 | 170 | %%%============================================================================ 171 | %%% External functions 172 | %%%============================================================================ 173 | 174 | -spec get_log(atom()) -> {log_level(), binary()}. 175 | get_log(LogRef) -> 176 | maps:get(LogRef, ?LOGBASE). 177 | 178 | -spec log(log_level(), atom(), leveled_log:log_options(), list()) -> list(). 179 | log(LogLevel, LogRef, LogOpts, Subs) -> 180 | leveled_log:log( 181 | LogLevel, 182 | LogRef, 183 | LogOpts, 184 | Subs, 185 | ?LOGBASE, 186 | ?DOMAIN 187 | ). 188 | 189 | -spec set_loglevel(list() | undefined) -> ok. 
190 | set_loglevel(undefined) -> 191 | ok; 192 | set_loglevel(Inputs) when Inputs =/= undefined -> 193 | LogLevel = 194 | lists:foldl( 195 | fun(Check, Acc) -> 196 | case {Check, Acc} of 197 | {_, none} -> 198 | case {Check, lists:member(Check, Inputs)} of 199 | {_Check, true} -> 200 | Check; 201 | {critical, false} -> 202 | % no valid input - set to info 203 | info; 204 | {_Check, false} -> 205 | none 206 | end; 207 | {_, Level} when Level /= none -> 208 | Level 209 | end 210 | end, 211 | none, 212 | [debug, info, warning, error, critical] 213 | ), 214 | case LogLevel of 215 | LogLevel when LogLevel /= none -> 216 | leveled_log:set_loglevel(LogLevel) 217 | end. 218 | 219 | -spec check_rootpath(list()) -> string(). 220 | check_rootpath(RootPath) -> 221 | case io_lib:printable_list(RootPath) of 222 | true -> 223 | RootPath 224 | end. 225 | 226 | -spec get_opt(atom(), list()) -> any(). 227 | %% @doc 228 | %% Return an option from a KV list 229 | get_opt(Key, Opts) -> 230 | get_opt(Key, Opts, undefined). 231 | 232 | -spec get_opt(atom(), list(), any()) -> any(). 233 | %% @doc 234 | %% Return an option from a KV list, or a default if not present 235 | get_opt(Key, Opts, Default) -> 236 | case proplists:get_value(Key, Opts) of 237 | undefined -> 238 | Default; 239 | Value -> 240 | Value 241 | end. 242 | 243 | -spec make_binarykey(aae_keystore:bucket(), aae_keystore:key()) -> binary(). 244 | %% @doc 245 | %% Convert Bucket and Key into a single binary 246 | make_binarykey({Type, Bucket}, Key) when 247 | is_binary(Type), is_binary(Bucket), is_binary(Key) 248 | -> 249 | <>; 250 | make_binarykey(Bucket, Key) when is_binary(Bucket), is_binary(Key) -> 251 | <>. 252 | 253 | -spec maybe_include_key( 254 | aae_controller:key_include_fun(), 255 | {aae_keystore:bucket(), aae_keystore:key()} | reset 256 | ) -> 257 | boolean(). 
258 | maybe_include_key(none, _Input) -> 259 | true; 260 | maybe_include_key(KeyFilterFun, {Bucket, Key}) -> 261 | KeyFilterFun({Bucket, Key}); 262 | maybe_include_key(KeyFilterFun, reset) -> 263 | KeyFilterFun(reset). 264 | 265 | %%%============================================================================ 266 | %%% Internal functions 267 | %%%============================================================================ 268 | 269 | -spec safe_open(string()) -> {ok, binary()} | {error, atom()}. 270 | safe_open(FileName) -> 271 | case filelib:is_file(FileName) of 272 | true -> 273 | case file:read_file(FileName) of 274 | {ok, <<CRC32:32/integer, BinContent/binary>>} -> 275 | case erlang:crc32(BinContent) of 276 | CRC32 -> 277 | {ok, BinContent}; 278 | _ -> 279 | {error, crc_wonky} 280 | end; 281 | _ -> 282 | {error, no_crc} 283 | end; 284 | false -> 285 | {error, not_present} 286 | end. 287 | 288 | %%%============================================================================ 289 | %%% Test 290 | %%%============================================================================ 291 | 292 | flip_byte(Binary, Offset, Length) -> 293 | Byte1 = rand:uniform(Length) + Offset - 1, 294 | <<Pre:Byte1/binary, A:8/integer, Post/binary>> = Binary, 295 | case A of 296 | 0 -> 297 | <<Pre/binary, 255:8/integer, Post/binary>>; 298 | _ -> 299 | <<Pre/binary, 0:8/integer, Post/binary>> 300 | end. 301 | 302 | test_key_generator(hash) -> 303 | ValueFun = 304 | fun() -> 305 | V = rand:uniform(1000), 306 | <<Hash:32/integer, _Rest/binary>> = 307 | crypto:hash(md5, <<V:32/integer>>), 308 | Hash 309 | end, 310 | internal_generator(ValueFun); 311 | test_key_generator(v1) -> 312 | ValueFun = 313 | fun() -> 314 | Clock = [{rand:uniform(1000), rand:uniform(1000)}], 315 | BClock = term_to_binary(Clock), 316 | Size = rand:uniform(100000), 317 | SibCount = rand:uniform(3), 318 | <<Hash:32/integer, _Rest/binary>> = crypto:hash(md5, BClock), 319 | {Clock, Hash, Size, SibCount} 320 | end, 321 | internal_generator(ValueFun). 322 | 323 | internal_generator(ValueFun) -> 324 | fun(I) -> 325 | Key = <<"Key", I:32/integer>>, 326 | Value = ValueFun(), 327 | {Key, Value} 328 | end. 
329 | 330 | clean_subdir(DirPath) -> 331 | case filelib:is_dir(DirPath) of 332 | true -> 333 | {ok, Files} = file:list_dir(DirPath), 334 | lists:foreach( 335 | fun(FN) -> 336 | File = filename:join(DirPath, FN), 337 | io:format("Attempting deletion ~s~n", [File]), 338 | ok = 339 | case filelib:is_dir(File) of 340 | true -> 341 | clean_subdir(File), 342 | file:del_dir(File); 343 | false -> 344 | file:delete(File) 345 | end, 346 | io:format("Success deleting ~s~n", [File]) 347 | end, 348 | Files 349 | ); 350 | false -> 351 | ok 352 | end. 353 | 354 | -ifdef(TEST). 355 | 356 | -include_lib("eunit/include/eunit.hrl"). 357 | 358 | get_loglevel() -> 359 | element(2, leveled_log:get_opts()). 360 | 361 | set_loglevel_test() -> 362 | ?assertMatch(error, get_loglevel()), 363 | ok = set_loglevel([debug]), 364 | ?assertMatch(debug, get_loglevel()), 365 | ok = set_loglevel([nonsense]), 366 | ?assertMatch(info, get_loglevel()), 367 | ok = set_loglevel([warning, error, critical]), 368 | ?assertMatch(warning, get_loglevel()), 369 | ok = set_loglevel([nonsense, critical]), 370 | ?assertMatch(critical, get_loglevel()). 371 | 372 | get_segmentid(B, K) -> 373 | Seg32 = leveled_tictac:keyto_segment32(make_binarykey(B, K)), 374 | leveled_tictac:get_segment(Seg32, ?TREE_SIZE). 375 | 376 | flipbyte_test() -> 377 | Bin0 = <<0:256/integer>>, 378 | BinFB0 = flip_byte(Bin0, 0, 32), 379 | ?assertMatch(false, BinFB0 == Bin0), 380 | Bin1 = <<4294967295:32/integer>>, 381 | BinFB1 = flip_byte(Bin1, 1, 1), 382 | ?assertMatch(false, BinFB1 == Bin1). 383 | 384 | clen_empty_subdir_test() -> 385 | FakePath = "test/foobar99", 386 | ok = clean_subdir(FakePath). 387 | 388 | -endif. 
389 | -------------------------------------------------------------------------------- /docs/DESIGN.md: -------------------------------------------------------------------------------- 1 | # TicTac Tree - design 2 | 3 | ## Objective 4 | 5 | The purpose of the KV TicTac Tree is to be able to make comparisons of groups of data partitions within and between database clusters, and prompt repair should differences be found. 6 | 7 | ### Sample Scenario 8 | 9 | Consider two different data stores. 10 | 11 | One store (Store A) stores data split across two virtual nodes (A1 and A2), and each node has the data split into three different partitions (A1.1, A1.2, A1.3, and A2.1, A2.2, A2.3). 12 | 13 | - Store A 14 | - Vnode A1 15 | - Partition A1.1 16 | - Partition A1.2 17 | - Partition A1.3 18 | - Vnode A2 19 | - Partition A2.1 20 | - Partition A2.2 21 | - Partition A3.3 22 | 23 | A second store (Store B) stores data in one virtual node (B1), but within that node data is split evenly across 4 partitions (B1.1, B1.2, B1.3 and B1.4). 24 | 25 | - Store B 26 | - Vnode B1 27 | - Partition B1.1 28 | - Partition B1.2 29 | - Partition B1.3 30 | - Partition B1.4 31 | 32 | There are a number of different relationships with regards to the data ownership that we expect to be true within this system, for example: 33 | 34 | - union(A1.1, A1.2, A1.3) == union(A2.1, A2.2, A2.3) 35 | - union(A1.1, A1.2, A1.3) == union(B1.1, B1.2, B1.3, B1.4) 36 | - union(A2.1, A2.2, A2.3) == union(B1.1, B1.2, B1.3, B1.4) 37 | - A1.1 == A2.1 38 | - A1.2 == A2.2 39 | - A1.3 == A2.3 40 | 41 | ### Constraints 42 | 43 | The objective is to have a simple and efficient way of validating all these relationships subject to the following constraints and conditions: 44 | 45 | - The AAE system should not place a dependancy on how the vnodes store their data in the partitions. 
46 | 47 | - The AAE system should confirm that not only has all data reached each location but also that all data remains in that location, in particular entropy of persisted data must also be considered. 48 | 49 | - The AAE system should be always on and available to support as many comparisons as possible, management of anti-entropy should not require scheduling around downtime of AAE. 50 | 51 | - The AAE system should allow for throttling of exchanges and repairs so as not to overwhelm the system, especially when an alternative process may currently be managing an efficient repair (e.g. hinted handoff). 52 | 53 | - Any rebuild process (where state is refreshed to reflect current on-disk status in the main store), must be safe to throttle without impacting exchange availability. 54 | 55 | - It can be generally assumed that the vnode is aware of both the before and after state of objects subject to change (to inform the AAE process), but there may be exceptional circumstances (e.g. in Riak with LWW=true on a non-2i backend), where the AAE process itself may need to determine the before state. It may be that this process is less efficient as it is generally assumed that a system that cares enough about data loss to run anti-entropy, will also care enough to read before a write. 56 | 57 | The issue of how to handle timestamped objects with automatic background expiry is important, but is not currently thought through. 58 | 59 | ## Actors 60 | 61 | It is assumed that there are actors currently managing vnodes within the stores, and mechanisms for communicating within and between the vnodes in the stores, and determining membership relationships between partitions and vnodes. 62 | 63 | For each vnode an `aae_controller` will be started, with the controller requested to handle data for each partition supported by the vnode. The `aae_controller` will start an `aae_treecache` for each of the partitions, and a single `aae_keystore` for the vnode. 
64 | 65 | ### Controller 66 | 67 | The `aae_controller` is responsible for marshalling all requests from the vnode, and for checking that the keystore, treecache and vnode partition stores remain locally synchronised. It primarily receives the follow requests: 68 | 69 | - put 70 | - Make a change to a TreeCache and update the KeyStore to represent a vnode change. The put request should inform the controller of the current clock, the previous clock, and the partition reference. 71 | - merge_root/merge_branches 72 | - Return the merged root of the tree roots for a list of partitions, or the merged branches of a list of Branch IDs. 73 | - fetch_clocks 74 | - for a given set of leaf identifiers and partitions return all the keys and version clocks for the objects in the system (from the key store). 75 | - rebuild 76 | - prompt the store to rebuild from the vnode store all state. 77 | - open/close 78 | - Open and close, using a shutdown GUID reference on close (then open) to confirm if open and close events are known to have returned the data and the AAE system to a consistent state (the same shutdown GUID should be persisted in the vnode data store at shutdown). 79 | - fold_keystore 80 | - Allow a general fold over the keystore covering all of the store, or a list of buckets, with a function to apply to each key/metadata pair in the store. 81 | 82 | 83 | ### TreeCache 84 | 85 | The `aae_treecache` is responsible for an individual partition reference. The partition reference is expected to be a combination of {n_val, partition_id} - so in a cluster any given vnode will have as many partition references as the sum of the n_vals supported by the cluster. 86 | 87 | The tree cache is an in-memory tictac tree using the `leveled_tictac` module. Changes are made as they are received (via async message). 88 | 89 | The `aae_treecache` process can also be placed in a load mode. When in load mode, deltas are queued as well as being applied to the current cache. 
When ready, `complete_load` can be called with a TicTac Tree formed from a snapshot taken as part of the same unit of work when the load was initialised. At this stage, the original tree can be destroyed, and the queue of changes can be applied to the new tree. This process can be used by the `aae_controller` to refresh the tree from the Key Store, without ever having the tree cache go inactive. 90 | 91 | 92 | ### KeyStore 93 | 94 | The `aae_keystore` is a FSM that can be in three states: 95 | 96 | - `loading` 97 | - In the `loading` state store updates are PUT into the store, but queued for a second (replacement) store. The keystore can also receive load requests, which are only added into the replacement store. When the load is complete, the queued requests are put into the replacement store and the original store may be discarded. This allows the keystore to be rebuilt. 98 | - `parallel` 99 | - In the `parallel` state, a keystore is kept in parallel to the vnode store, to resolve any fold requests passed in. A `parallel` store may transition in and out of the `loading` state (back into the `parallel` state). 100 | - `native` 101 | - In the `native` state, no parallel store is kept, but a reference is kept by the `aae_keystore` process to the vnode backend, and queries are resolved by calling the actual vnode backend. This requires the vnode backend to support the same API as the parallel `aae_keystore` (and so would currently in riak need to be the leveled backend). There is no transition in and out of `loading` from the `native` state. 102 | 103 | There are two types of parallel stores currently supported (but adding other stores should be simple): 104 | 105 | - `leveled_so` (leveled backend but with a key that is ordered first by segment ID) 106 | - `leveled_ko` (leveled backend but ordered by the actual object {Bucket, Key} pair, but with accelerated scanning over a sublist of segment IDs). 
107 | 108 | 109 | ### Exchange 110 | 111 | The `aae_exchange` is a FSM used for managing a single anti-entropy exchange to find keys to repair based on comparison between two lists - the `blue` and `pink` lists. The lists for the comparison are a list of `[{SendFun, PartitionRefList}]` tuples, where the SendFun encapsulates a mechanism for reaching a given `aae_controller`, and the PartitionRefList is a list of Partition References which are required from that controller. 112 | 113 | The lists can have multiple items (e.g. require contact with multiple controllers), and request multiple partition references from each controller - which would be normal for comparing coverage plans. The lists do not need to be of equivalent dimensions between `blue` and `pink`. 114 | 115 | The FSM process will alternate between multiple 'checking' states and the special state `waiting_all_results`, until a 'checking' state reveals a full match. The 'checking' state are: 116 | 117 | - `root_compare` - fetch the tree roots and compare. 118 | - `root_confirm` - fetch the tree roots and compare, select the intersection of branch IDs from this first pass and the last pass to use at the next stage. 119 | - `branch_compare` - fetch the tree branches which differ in the root and compare. 120 | - `branch_confirm` - fetch the tree branches which differ in the root and compare, select the intersection of segment leaf IDs from the first pass and last pass to use at the next stages. 121 | - `clock_compare` - fetch the keys and clocks associated with the segment leaf IDs and compare - passing any deltas to a RepairFun provided by the calling process to repair. 122 | 123 | The exchange is throttled in two ways. Firstly, there is a jittered pause between each state transition. Secondly, the number of IDs (branch or segment leaf IDs) that can be passed from a confirm state is limited. This will increase the number of iterations required to fill-in an entirely diverse vnode. 
The RepairFun that makes the repair is passed-in, and may apply its own throttle, but the `aae_exchange` will not explicitly throttle the repairs. 124 | 125 | 126 | ## Notes on Riak Implementation 127 | 128 | Although the AAE library is intended to be generic, it is primarily focused on being a new AAE mechanism for Riak. Some notes on how this should be implemented within Riak, and functionality that can be expected. 129 | 130 | ### Transition 131 | 132 | Transition between AAE releases is hard (as demonstrated by the difficulties of the hash algorithm change from legacy to version 0 in the existing riak_kv_index_hashtree implementation). The intention is to allow this AAE to be a plugin over and above the existing AAE implementation, making transition an administrative task: the tictac tree AAE can be run in Riak oblivious to whether existing AAE versions are running. 133 | 134 | ### Startup, Shutdown and Synchronisation 135 | 136 | The `riak_kv_vnode` is expected to be responsible for stopping and starting the `aae_controller` should this version of AAE be implemented. The `aae_controller` should only be started after the vnode backend has been started, but before the vnode is marked as ready. The trees, parallel keystore (in parallel mode) and vnode store may at this stage be out of sync, if the vnode had not previously shut down cleanly. Whilst stores are out of sync, they will still operate but return false negative results: however, false negative results will prompt incremental repair of the synchronisation issue. Incremental repair of a parallel keystore is done using the per-vnode rehash. Incremental repair of the trees is done through a rehashing of the segments undertaken as part of the `aae_controller:aae_fetchclocks/5`. 137 | 138 | If the `aae_treecache` was not shutdown correctly, then the whole cache may be incorrect (e.g. empty). This would take a long time to incrementally repair, and so this scenario is detected and flagged at startup. 
It is therefore recommended at vnode startup, that the `aae_controller:aae_rebuildtrees/5` be called with the `OnlyIfBroken = true`. This will return `skipped` if the treecache appeared to have been recovered successfully and not rebuild, but will rebuild if a potential issue with any of the tree_caches had been flagged at startup. 139 | 140 | Whilst the stores are potentially out of sync, then the controller should operate as normal - this will potentially lead to false repairs until the rebuild is complete. If to an administrator, the possibility of non-synchronisation is a known possibility, such as when a node is restarting following a hard crash - then the [participate in coverage](https://github.com/basho/riak_core/pull/917) feature can be used to remove the node's vnodes from any coverage plan based AAE exchanges. 141 | 142 | There exists the potential for further improvements of vnode store to aae coordination, should the aae store be used for additional functional reasons in the future. 143 | 144 | ### Intra-Cluster AAE 145 | 146 | The `aae_exchange` is flexible so that intra-cluster AAE can be done pairwise or between coverage offsets. If we have a ring size of 128, and a single n-val of 3, there are 384 pairwise exchanges. So an entropy_manager could be elected in the cluster which rotates around those pairwise exchanges. 147 | 148 | It would be quicker to just perform the 3 comparisons necessary to rotate around the 3 coverage plans (with the 3 different offsets), and compare those coverage plans. However, in the scenario where a single 149 | 150 | ### AAE Cluster Full-Sync 151 | 152 | .... 153 | 154 | ### MapFold Changes - Backend Independent 155 | 156 | Previously there had been some work down to add [MapFold](https://github.com/martinsumner/riak_kv/blob/mas-2.1.7-foldobjects/docs/MAPFOLD.md) as a feature to Riak. 
This is in some ways an alternative to the work done by Basho on riak_kv_sweeper - there is a generic need to have functions that fold over objects, that produce outputs that aren't required immediately. This is especially true for operational reasons e.g.: 157 | 158 | - find all sibling'd objects; 159 | - count the number of objects in a bucket; 160 | - what is the average object size in the database; 161 | - provide a histogram of last modified dates on objects in a bucket. 162 | 163 | There may also be functional reasons whereby we might want to have non-disruptive folds with bespoke functions and accumulators - especially for reporting (e.g. count all the people by age-range and gender), that currently require a secondary index and for all 2i terms to be fed back to the application for processing, with the application needing to control throttling of the query. 164 | 165 | Riak previously had Map/Reduce which could answer these problems, but Map/Reduce was designed to be fast. It was controlled in the sense it had back pressure to prevent the reading of data from overwhelming the processing of that data - but it was not controlled to prevent a Map/Reduce workload from overloading standard K/V GET/PUT activity. Also Map/Reduce required the reading of the whole object, so didn't offer any optimisation if the interesting information was on a 2i term or in object metadata. 166 | 167 | The Mapfold work provided a solution to this, but to be efficient it depended on 168 | the backend supporting secondary indexes and/or fold_heads. The Mapfold work was optimised for the leveled backend, but left other backends behind. 169 | 170 | One side effect of kv_index_tictactree is that it provides a parallel store (when leveled is not used), that can still be key-ordered. The metadata that gets put into that parallel store could be extended to include the full object head. 
So the same queries that work with a native leveled backend, will work with a parallel AAE leveled key-ordered backend. Potentially this would mean that MapFold could be supported efficiently with any backend where AAE has been enabled. 171 | 172 | ### Per-Bucket MDC Replication 173 | 174 | ... 175 | 176 | ### Bitcask and HEAD requests 177 | 178 | .... 179 | 180 | ### Bitcask and 2i Support 181 | 182 | .... 183 | 184 | ### Improved Vnode Synchronisation on Abrupt Shutdown 185 | 186 | .... 187 | 188 | ### Backup Use-case 189 | 190 | Backups in Riak are hard. Hard for good reasons: 191 | 192 | - the difficulty of co-ordinating a snapshot in a point in time across many processes on many machines; 193 | - the volume of data traditionally used by people who need a distributed database; 194 | - the inherent duplication of data in Riak; 195 | - the write amplification in some Riak backends (leveldb) increasing the cost of any rsync based mechanism for backup. 196 | 197 | Historically different approaches have been tried, and ultimately most Riak systems either end up running without historic backups (just MDC replication), or with a bespoke backup approach integrated into either the database and/or the application. 198 | 199 | One possibility is to be able to run a very small cluster with dense storage machines, in a backup configuration: e.g. node count of 1, ring size of 8, n/r/w-val of 1, vnode backend rsync friendly (leveled/bitcask) with 2i disabled. If we can now replicate from a production scale cluster to this (using rabl for real time-replication so that peak load is queued), then stopping this single node cluster and running rsync periodically could produce a more traditional backup approach without impeding on decision making wrt production database setup (e.g. ring size, n-val and write-amplification and query support in the backend). 
200 | 201 | The combination of repl replication, and AAE full-sync independent of ring-size and n-val might make such a solution possible without bespoke application effort. 202 | 203 | ### AAE for 2i Terms 204 | 205 | .... 206 | 207 | ### 2i Repair 208 | 209 | .... 210 | 211 | ### Rehash Support - Consideration for W1 Misuse 212 | 213 | .... 214 | 215 | ### Support for LWW on Bitcask 216 | 217 | .... 218 | -------------------------------------------------------------------------------- /test/property/aae_eqc.erl: -------------------------------------------------------------------------------- 1 | %%% @author Thomas Arts 2 | %%% @copyright (C) 2019, Thomas Arts 3 | %%% @doc 4 | %%% 5 | %%% @end 6 | %%% Created : 5 Feb 2019 by Thomas Arts 7 | 8 | -module(aae_eqc). 9 | 10 | -ifdef(EQC). 11 | -include_lib("eqc/include/eqc.hrl"). 12 | -include_lib("eqc/include/eqc_statem.hrl"). 13 | -include_lib("eunit/include/eunit.hrl"). 14 | 15 | -compile([export_all, nowarn_export_all]). 16 | -compile({nowarn_deprecated_function, [{erlang, now, 0}]}). 17 | 18 | -define(LOG_LEVELS, [error, critical]). 19 | -define(EXCHANGE_PAUSE_MS, 10). 20 | 21 | -define(NUMTESTS, 1000). 22 | -define(QC_OUT(P), 23 | eqc:on_output(fun(Str, Args) -> 24 | io:format(user, Str, Args) end, P)). 25 | 26 | 27 | eqc_test_() -> 28 | {timeout, 120, 29 | ?_assertEqual(true, 30 | eqc:quickcheck(eqc:testing_time(60, ?QC_OUT(prop_aae()))))}. 31 | 32 | run() -> 33 | run(?NUMTESTS). 34 | 35 | run(Count) -> 36 | eqc:quickcheck(eqc:numtests(Count, prop_aae())). 37 | 38 | check() -> 39 | eqc:check(prop_aae()). 40 | 41 | 42 | %% -- State and state functions ---------------------------------------------- 43 | initial_state() -> 44 | #{aae_controllers => 45 | [{"a", #{store => []}}, 46 | {"b", #{store => []}}], %% list of controllers, each unique map 47 | history => 48 | [] %% {Bucket, Key, VClock, LastModified} 49 | }. 
50 | 51 | %% -- Generators ------------------------------------------------------------- 52 | 53 | pos() -> 54 | ?LET(N, nat(), N+1). 55 | 56 | timestamp(_Obj) -> 57 | 1. 58 | 59 | gen_vclock() -> 60 | ?LET(Names, non_empty(sublist(names())), 61 | [ {Name, nat()} || Name <- Names]). 62 | 63 | gen_vclock(VClockGen) -> 64 | ?LET(VClock, VClockGen, 65 | ?LET({{K, C}, P}, {elements(VClock), pos()}, 66 | lists:keyreplace(K, 1, VClock, {K, C + P}))). 67 | 68 | names() -> 69 | [a, b, c, d, e, f]. 70 | 71 | %% Cannot be atoms! 72 | %% key() type specified: should be binary(). 73 | gen_bucket() -> 74 | elements([<<"bucket1">>, <<"bucket2">>, <<"bucket3">>]). 75 | 76 | gen_key() -> 77 | binary(16). 78 | 79 | gen_bkcm(S) -> 80 | ?LET({B, K}, frequency([{length(maps:get(history, S, [])), ?LAZY(elements([F || {F, _, _} <-maps:get(history, S)]))}, 81 | {10, {gen_bucket(), gen_key()}}]), 82 | case lists:keyfind({B, K}, 1, maps:get(history, S, [])) of 83 | false -> 84 | {B, K, none, gen_vclock(), gen_last_modified()}; 85 | {_, PrevClock, _LastModifed} -> 86 | {B, K, undefined, gen_vclock(PrevClock), gen_last_modified()} 87 | end). 88 | 89 | gen_last_modified() -> 90 | [{1549, choose(448000, 448100), 0}]. 91 | 92 | 93 | %% generate a new store 94 | gen_store([], Store2) -> 95 | Store2; 96 | gen_store([{{B, K}, C1, LM1} | Store1], Store2) -> 97 | case lists:keyfind({B, K}, 1, Store2) of 98 | false -> 99 | [ {{B, K}, C1, LM1} | gen_store(Store1, Store2) ]; 100 | {_, C2, _} -> 101 | [ {{B, K}, gen_vclock(elements([C1, C2])), gen_last_modified()} | 102 | gen_store(Store1, lists:keydelete({B,K}, 1, Store2))] 103 | end. 104 | 105 | 106 | %% -- Common pre-/post-conditions -------------------------------------------- 107 | command_precondition_common(_S, _Cmd) -> 108 | true. 109 | 110 | precondition_common(_S, _Call) -> 111 | true. 112 | 113 | postcondition_common(_S, _Call, _Res) -> 114 | true. 
115 | 116 | %% -- Operations ------------------------------------------------------------- 117 | 118 | object_split(Object) -> 119 | {_Size, _SiblingCount, _IndexHash, _LastMod, _UserData} = binary_to_term(Object). 120 | 121 | %% --- Operation: init --- 122 | start_pre(S) -> 123 | unstarted_controllers(S) =/= []. 124 | 125 | start_args(S) -> 126 | ?LET({Path, M}, elements(unstarted_controllers(S)), 127 | [ Path, 128 | {parallel, leveled_ko}, 129 | maps:get(store, M, []) == [], 130 | elements([{1, 1}, {0, 3600}]), %% if hours is set to 1 it means we cannot trigger a rebuild in a test 131 | [{0, 3}, {1, 3}, {2,3}], %% behaviour is not different for less 132 | {var, dir} 133 | ]). 134 | 135 | start_pre(S, [Path, _KeyStoreType, _IsEmpty, _RebuildSchedule, _PrefLists, _RootPath]) -> 136 | Controllers = maps:get(aae_controllers, S, []), 137 | case lists:keyfind(Path, 1, Controllers) of 138 | false -> 139 | %% Controller has not been started yet 140 | true; 141 | {_, M} -> 142 | %% Check whether the controller is already started 143 | not maps:is_key(aae_controller, M) 144 | end. 145 | 146 | start(Path, KeyStoreType, IsEmpty, RebuildSchedule, PrefLists, RootPath) -> 147 | case catch aae_controller:aae_start(KeyStoreType, IsEmpty, RebuildSchedule, PrefLists, 148 | filename:join(RootPath, Path), 149 | fun object_split/1, 150 | ?LOG_LEVELS) of 151 | {ok, Pid} -> Pid; 152 | Other -> Other 153 | end. 154 | 155 | start_next(S, Value, [Path, _KeyStoreType, IsEmpty, _RebuildSchedule, PrefLists, _RootPath]) -> 156 | Controllers = maps:get(aae_controllers, S), 157 | {_, Map} = lists:keyfind(Path, 1, Controllers), 158 | RebuildIsDue = (not IsEmpty andalso maps:get(store, Map, []) == []), 159 | S#{aae_controllers => 160 | lists:keyreplace(Path, 1, Controllers, {Path, Map#{aae_controller => Value, 161 | rebuild_due => RebuildIsDue, 162 | preflists => PrefLists}})}. 163 | 164 | start_post(_S, _Args, Res) -> 165 | is_pid(Res). 
166 | 167 | start_features(_S, [_Path, _KeyStoreType, IsEmpty, RebuildSchedule, _PrefLists, _RootPath], _Res) -> 168 | [ {start, {schedule, RebuildSchedule}}, {start, {is_empty, IsEmpty}} ]. 169 | 170 | 171 | %% --- Operation: stop --- 172 | stop_pre(S) -> 173 | started_controllers(S) =/= []. 174 | 175 | stop_args(S) -> 176 | ?LET({Path, M}, elements(started_controllers(S)), 177 | [Path, maps:get(aae_controller, M)]). 178 | 179 | stop_pre(S, [Path, Pid]) -> 180 | {_, M} = lists:keyfind(Path, 1, maps:get(aae_controllers, S)), 181 | Pid == maps:get(aae_controller, M). %% for shrinking 182 | 183 | stop(_, Pid) -> 184 | catch aae_controller:aae_close(Pid). 185 | 186 | stop_next(S, _Value, [Path, _Pid]) -> 187 | Controllers = maps:get(aae_controllers, S), 188 | {_, M} = lists:keyfind(Path, 1, Controllers), 189 | S#{aae_controllers => 190 | lists:keyreplace(Path, 1, Controllers, {Path, maps:without([aae_controller], M)})}. 191 | 192 | stop_post(_S, [_, _Pid], Res) -> 193 | eq(Res, ok). 194 | 195 | %% --- Operation: next_rebuild --- 196 | nextrebuild_pre(S) -> 197 | started_controllers(S) =/= []. 198 | 199 | nextrebuild_args(S) -> 200 | ?LET({Path, M}, elements(started_controllers(S)), 201 | [Path, maps:get(aae_controller, M)]). 202 | 203 | nextrebuild_pre(S, [Path, Pid]) -> 204 | Controllers = maps:get(aae_controllers, S), 205 | {_, M} = lists:keyfind(Path, 1, Controllers), 206 | Pid == maps:get(aae_controller, M). %% for shrinking 207 | 208 | %% If we expected to be due, it should be due. 209 | nextrebuild(_, Pid) -> 210 | TS = aae_controller:aae_nextrebuild(Pid), 211 | os:timestamp() > TS. 212 | 213 | nextrebuild_post(S, [Path, _Pid], Res) -> 214 | Controllers = maps:get(aae_controllers, S), 215 | {_, M} = lists:keyfind(Path, 1, Controllers), 216 | not maps:get(rebuild_due, M) orelse Res. 217 | 218 | 219 | nextrebuild_features(_S, [_, _Pid], Res) -> 220 | [ {nextrebuild, Res} ]. 
221 | 222 | 223 | %%--- Operation: put --- 224 | put_pre(S) -> 225 | started_controllers(S) =/= []. 226 | 227 | put_args(S) -> 228 | ?LET({{Path, M}, {B, K, PClock, VClock, LastMod}}, {elements(started_controllers(S)), gen_bkcm(S)}, 229 | [Path, maps:get(aae_controller, M), 230 | maps:get(preflists, M), B, K, VClock, PClock, {pos(), pos(), 0, LastMod, []}]). 231 | 232 | put_pre(_S, [_Path, _Pid, _PrefLists, _Bucket, _Key, _CurrentClock, _PrevClock, _MetaData]) -> 233 | true. 234 | 235 | put(_Path, Pid, PrefLists, Bucket, Key, CurrentClock, PrevClock, MetaData) -> 236 | PrefList = lists:nth((erlang:phash2({Bucket, Key}) rem length(PrefLists)) + 1, PrefLists), 237 | aae_controller:aae_put(Pid, PrefList, Bucket, Key, CurrentClock, PrevClock, term_to_binary(MetaData)). 238 | 239 | put_next(S, _Value, [Path, _Pid, _PrefLists, Bucket, Key, CurrentClock, _PrevClock, {_, _, _, LastMod, _}]) -> 240 | Controllers = maps:get(aae_controllers, S), 241 | {_, M} = lists:keyfind(Path, 1, Controllers), 242 | S#{aae_controllers => 243 | lists:keyreplace(Path, 1, Controllers, 244 | {Path, M#{store => 245 | [ {{B, K}, C, L} || {{B, K}, C, L} <- maps:get(store, M), {Bucket, Key} =/= {B, K}] ++ 246 | [ {{Bucket, Key}, CurrentClock, LastMod} ] 247 | }}), 248 | history => 249 | maps:get(history, S, []) ++ [{{Bucket, Key}, CurrentClock, LastMod}] 250 | }. 251 | 252 | 253 | put_post(_S, [_Path, _Pid, _PrefLists, _Bucket, _Key, _CurrentClock, _PrevClock, _MetaData], Res) -> 254 | eq(Res, ok). 255 | 256 | put_features(_S, [_Path, _Pid, _PrefLists, _Bucket, _Key, _CurrentClock, PrevClock, _MetaData], _Res) -> 257 | [ {put, PrevClock} ]. 258 | 259 | 260 | %% --- Operation: exchange --- 261 | exchange_pre(S) -> 262 | length(started_controllers(S)) >= 2. 
263 | 264 | exchange_args(S) -> 265 | Controllers = started_controllers(S), 266 | ?LET({Path1, M1}, elements(Controllers), 267 | ?LET({Path2, M2}, elements(Controllers), %% possibly minus the already selected one 268 | [ Path1, Path2, 269 | [maps:get(aae_controller, M1), maps:get(preflists, M1)], %% BlueList 270 | [maps:get(aae_controller, M2), maps:get(preflists, M2)] %% PinkList 271 | ])). 272 | 273 | exchange_pre(S, [Path1, Path2, _Blue, _Pink]) -> 274 | lists:keymember(Path1, 1, started_controllers(S)) andalso 275 | lists:keymember(Path2, 1, started_controllers(S)). 276 | 277 | exchange(_, _, [BluePid, BluePrefLists], [PinkPid, PinkPrefLists]) -> 278 | BlueList = [{testutil:exchange_sendfun(BluePid), BluePrefLists}], 279 | PinkList = [{testutil:exchange_sendfun(PinkPid), PinkPrefLists}], 280 | QuickCheck = self(), 281 | {ok, Pid, _UUID} = aae_exchange:start(full, BlueList, PinkList, 282 | fun(KeyList) -> QuickCheck ! {self(), repair, KeyList} end, %% do not repair at all 283 | fun(Result) -> QuickCheck ! {self(), reply, Result} end, 284 | none, 285 | [{transition_pause_ms, ?EXCHANGE_PAUSE_MS}, 286 | {log_levels, ?LOG_LEVELS}]), 287 | receive 288 | {Pid, reply, {root_compare, 0}} -> 289 | {root_compare, 0}; 290 | {Pid, reply, Other} -> 291 | receive 292 | {Pid, repair, KeyList} -> 293 | {repair, Other, KeyList} 294 | after 5000 -> timeout 295 | end 296 | after 5000 -> timeout 297 | end. 
298 | 299 | exchange_post(S, [Path1, Path2, _Blue, _Pink], Res) -> 300 | {_, M1} = lists:keyfind(Path1, 1, maps:get(aae_controllers, S, [])), 301 | {_, M2} = lists:keyfind(Path2, 1, maps:get(aae_controllers, S, [])), 302 | BlueStore = lists:usort(maps:get(store, M1, [])), 303 | PinkStore = lists:usort(maps:get(store, M2, [])), 304 | MatchBlueFun = 305 | fun({{B, K}, C, _L}, Acc) -> 306 | case lists:keyfind({B, K}, 1, PinkStore) of 307 | false -> 308 | [{{B, K}, {C, none}}|Acc]; 309 | {{B, K}, C, _} -> 310 | Acc; 311 | {{B, K}, NC, _} -> 312 | [{{B, K}, {C, NC}}|Acc] 313 | end 314 | end, 315 | MatchPinkFun = 316 | fun({{B, K}, C, _L}, Acc) -> 317 | case lists:keyfind({B, K}, 1, BlueStore) of 318 | false -> 319 | [{{B, K}, {none, C}}|Acc]; 320 | _ -> 321 | Acc 322 | end 323 | end, 324 | Acc0 = lists:foldl(MatchBlueFun, [], BlueStore), 325 | Expected = lists:usort(lists:foldl(MatchPinkFun, Acc0, PinkStore)), 326 | case Res of 327 | {root_compare, 0} -> 328 | eq(0, length(Expected)); 329 | {repair, {clock_compare, N}, KeyList} -> 330 | N == length(KeyList) 331 | andalso eq(lists:sort(KeyList), Expected); 332 | _ -> 333 | eq(Res, Expected) %% will print the difference 334 | end. 335 | 336 | exchange_features(_S, [_Path1, _Path2, _Blue, _Pink], Res) -> 337 | case Res of 338 | {root_compare, 0} -> 339 | root_compare; 340 | {repair, {clock_compare, N}, _KeyList} -> 341 | {clock_compare, N}; 342 | _ -> 343 | Res 344 | end. 345 | 346 | 347 | 348 | %% --- Operation: sync --- 349 | sync_pre(S) -> 350 | length(started_controllers(S)) >= 2. 351 | 352 | sync_args(S) -> 353 | Controllers = started_controllers(S), 354 | ?LET({Path1, M1}, elements(Controllers), 355 | ?LET({Path2, M2}, elements(Controllers -- [{Path1, M1}]), 356 | [ Path1, Path2, 357 | maps:get(preflists, M1), maps:get(preflists, M2), 358 | maps:get(aae_controller, M1), maps:get(aae_controller, M2), 359 | gen_store(maps:get(store, M1), maps:get(store, M2)) ])). 
%% Precondition (re-checked during shrinking): both controllers must still be
%% started in the model state.
sync_pre(S, [Path1, Path2, _, _, _, _, _Store]) ->
    lists:keymember(Path1, 1, started_controllers(S)) andalso
        lists:keymember(Path2, 1, started_controllers(S)).

%% Write every entry of the generated store to BOTH controllers, so their
%% contents become identical.  Note: put/8 here is this module's own helper
%% (the model 'put' command), not erlang:put.
sync(_Path1, _Path2, _PrefLists1, _PrefLists2, _Pid1, _Pid2, []) ->
    ok;
sync(Path1, Path2, PrefLists1, PrefLists2, Pid1, Pid2, [{{B, K}, VC, LastMod}|Store]) ->
    %% TODO: add meta data to the state and extract it again
    put(Path1, Pid1, PrefLists1, B, K, VC, undefined, {1, 1, 0, LastMod, []}),
    put(Path2, Pid2, PrefLists2, B, K, VC, undefined, {1, 1, 0, LastMod, []}),
    sync(Path1, Path2, PrefLists1, PrefLists2, Pid1, Pid2, Store).

%% State transition: both controllers' model stores become the synced Store,
%% and the written entries are appended to the global history.
sync_next(S, _Value, [Path1, Path2, _PrefLists1, _PrefLists2, _Pid1, _Pid2, Store]) ->
    Controllers = maps:get(aae_controllers, S),
    {_, M1} = lists:keyfind(Path1, 1, Controllers),
    {_, M2} = lists:keyfind(Path2, 1, Controllers),
    S#{aae_controllers =>
           lists:keyreplace(Path1, 1,
               lists:keyreplace(Path2, 1, Controllers,
                   {Path2, M2#{store => Store}}),
               {Path1, M1#{store => Store}}),
       history =>
           maps:get(history, S, []) ++ Store
      }.

%% --- ... more operations

%% -- Property ---------------------------------------------------------------
%% Main property: run generated command sequences against a clean ./aae_data
%% directory, close any controllers left running, then aggregate collected
%% features and check all postconditions held (Res == ok).
prop_aae() ->
    Dir = "./aae_data",
    eqc:dont_print_counterexample(
    ?FORALL(Cmds, commands(?MODULE),
    begin
        %% Start from a clean data directory for every run
        os:cmd("rm -rf " ++ Dir),
        {H, S, Res} = run_commands(Cmds, [{dir, Dir}]),
        %% Tidy up: close whatever controllers the run left started
        [ aae_controller:aae_close(maps:get(aae_controller, M)) || {_, M} <- started_controllers(S) ],
        CallFeatures = call_features(H),
        check_command_names(Cmds,
            measure(length, commands_length(Cmds),
                aggregate(with_title('Features'), CallFeatures,
                    aggregate_feats(all_command_names(), CallFeatures,
                        features(CallFeatures,
                            pretty_commands(?MODULE, Cmds, {H, S, Res},
                                Res == ok))))))
    end)).
%% Aggregate collected features into per-kind buckets for reporting: 'atoms'
%% collects all bare-atom features; any other Kind collects {Kind, Arg}
%% tuples under that title.  Remaining features are passed down the chain.
aggregate_feats([], _, Prop) -> Prop;
aggregate_feats([atoms | Kinds], Features, Prop) ->
    {Atoms, Rest} = lists:partition(fun is_atom/1, Features),
    aggregate(with_title(atoms), Atoms, aggregate_feats(Kinds, Rest, Prop));
aggregate_feats([Tag | Kinds], Features, Prop) ->
    {Tuples, Rest} = lists:partition(fun(X) -> is_tuple(X) andalso element(1, X) == Tag end, Features),
    aggregate(with_title(Tag), [ Arg || {_, Arg} <- Tuples ], aggregate_feats(Kinds, Rest, Prop)).

%% Convenience entry points for eqc's more_bugs loop: hunt for (up to 20)
%% distinct failing cases, default budget 10 seconds.
bugs() -> bugs(10).

bugs(N) -> bugs(N, []).

bugs(Time, Bugs) ->
    more_bugs(eqc:testing_time(Time, prop_aae()), 20, Bugs).

%%% ---- state functions

%% Controllers present in the model but without a live aae_controller pid.
unstarted_controllers(S) ->
    Controllers = maps:get(aae_controllers, S, []),
    lists:filter(fun({_, M}) -> not maps:is_key(aae_controller, M) end, Controllers).

%% Controllers in the model that currently hold a live aae_controller pid.
started_controllers(S) ->
    Controllers = maps:get(aae_controllers, S, []),
    lists:filter(fun({_, M}) -> maps:is_key(aae_controller, M) end, Controllers).

-endif.
-------------------------------------------------------------------------------- /test/end_to_end/basic_SUITE.erl: --------------------------------------------------------------------------------
%% Common Test suite covering basic controller start/compare/exchange paths.
-module(basic_SUITE).
-include_lib("common_test/include/ct.hrl").
-export([all/0, init_per_suite/1, end_per_suite/1]).
-export([
    dual_store_compare_medium_so/1,
    dual_store_compare_medium_ko/1,
    dual_store_compare_large_so/1,
    dual_store_compare_large_ko/1,
    store_notsupported/1,
    get_set_rebuild_schedule/1,
    get_set_storeheads/1,
    get_set_nextrebuild/1,
    splitfun_compare_functions/1
]).
%% Test cases run by this suite.
all() ->
    [
        dual_store_compare_medium_so,
        dual_store_compare_medium_ko,
        dual_store_compare_large_so,
        dual_store_compare_large_ko,
        store_notsupported,
        get_set_rebuild_schedule,
        get_set_storeheads,
        get_set_nextrebuild,
        splitfun_compare_functions
    ].

%% NOTE(review): the return value of testutil:init_per_suite/1 is discarded
%% and the original Config is returned - confirm testutil does not add
%% entries the test cases need.
init_per_suite(Config) ->
    testutil:init_per_suite([{suite, "basic"} | Config]),
    Config.

end_per_suite(Config) ->
    testutil:end_per_suite(Config).

%% Start a controller with rebuild schedule RS0 and check the schedule can be
%% read back and updated component-by-component.
get_set_rebuild_schedule(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    SplitF = fun(_) -> {_SomeSensibleSize = 42, 1, 0, undefined, <<>>} end,
    RS0 = {1, 300},

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            RS0,
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF
        ),

    ok = test_rebuild_schedule(Cntrl, RS0),

    aae_controller:aae_close(Cntrl),
    testutil:reset_filestructure().

%% Get/set round-trips on the rebuild schedule; pattern-match '=' asserts
%% each read-back value (a mismatch crashes the test).
test_rebuild_schedule(Cntrl, RS0) ->
    RS1 = {RS1a, RS1b} = aae_controller:aae_get_rebuild_schedule(Cntrl),
    RS1 = RS0,
    %% bump second element only
    ok = aae_controller:aae_set_rebuild_schedule(Cntrl, {RS1a, RS1b + 1}),
    {RS2a, RS2b} = aae_controller:aae_get_rebuild_schedule(Cntrl),
    RS1a = RS2a,
    RS2b = RS1b + 1,
    %% bump first element only (second reset to original)
    ok = aae_controller:aae_set_rebuild_schedule(Cntrl, {RS1a + 1, RS1b}),
    {RS3a, RS3b} = aae_controller:aae_get_rebuild_schedule(Cntrl),
    RS3a = RS1a + 1,
    RS1b = RS3b,
    ok.
%% Check that prompting a next-rebuild in ~10s moves the rebuild timestamp,
%% that the progress report agrees, and that the new time is 9-11s away.
get_set_nextrebuild(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    %% NOTE(review): this split fun returns a 4-tuple where other tests in
    %% this suite return a 5-tuple - confirm against aae_controller's
    %% expected split-fun contract.
    SplitF = fun(_) -> {42, 1, 0, null} end,

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF
        ),

    NextRebuild0 = aae_controller:aae_nextrebuild(Cntrl),
    Now = os:timestamp(),
    ok = aae_controller:aae_prompt_nextrebuild(Cntrl, 10),
    NextRebuild1 = aae_controller:aae_nextrebuild(Cntrl),
    true = (NextRebuild1 /= NextRebuild0),
    %% Progress report must reflect the same next_rebuild time
    Report = aae_controller:aae_produce_progress_report(Cntrl),
    NextRebuild1Reported = proplists:get_value(next_rebuild, Report),
    NextRebuild1Reported = NextRebuild1,
    %% now_diff is in microseconds; expect roughly 10 seconds ahead
    ApproxTenSec = timer:now_diff(NextRebuild1, Now) div 1000000,
    true = (ApproxTenSec > 9),
    true = (ApproxTenSec < 11),

    aae_controller:aae_close(Cntrl),
    testutil:reset_filestructure().

%% Key counts used by get_set_storeheads below.
-define(NKEYS, 15).
-define(NKEYS_IN_RANGE, 9).
-define(NKEYS_UPDATED, 5).
%% Updated keys report 1 sibling, untouched in-range keys still report 2.
-define(NEW_SIBLING_COUNT,
    (?NKEYS_IN_RANGE + (?NKEYS_IN_RANGE - ?NKEYS_UPDATED))
).
%% Check that swapping the object split function at runtime changes how
%% sibling counts are derived: existing entries are unchanged until a key is
%% re-put, at which point the new split function applies.
get_set_storeheads(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath = filename:join(RootPath, "vnode1/"),
    Preflist = [{2, 0}, {2, 1}],

    %% "on" reports 1 sibling per object, "off" reports 2
    StoreheadsOnSplitF = fun(_) ->
        {_SomeSensibleSize = 42, 1, 0, undefined, <<>>}
    end,
    StoreheadsOffSplitF = fun(_) ->
        {42, _DoubleSiblingCount = 2, 0, undefined, <<>>}
    end,

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            Preflist,
            VnodePath,
            StoreheadsOffSplitF,
            %% log levels
            [info, warn, error, critical]
        ),

    Bucket = <<"b1">>,
    BKVList = testutil:gen_keys([], ?NKEYS, Bucket),
    {BKVList1, _} = lists:split(?NKEYS_UPDATED, BKVList),
    ok = testutil:put_keys(Cntrl, 2, BKVList, none),
    ct:print("put ~b keys: ~p\n", [?NKEYS, BKVList]),

    StartKey = list_to_binary(string:right(integer_to_list(0), 6, $0)),
    EndKey = list_to_binary(string:right(integer_to_list(10), 6, $0)),

    %% there be 9*2 siblings in the range
    SCFolder0 = key_range_folder(Cntrl, Bucket, StartKey, EndKey),

    %% test query
    SCF0 = SCFolder0(),
    ct:print(
        "storeheads is initially off: test query should return ~b siblings:\n~b indeed\n",
        [?NKEYS_IN_RANGE * 2, element(2, SCF0)]
    ),
    ?NKEYS_IN_RANGE * 2 = element(2, SCF0),

    %% update split_function
    ok = aae_controller:aae_set_object_splitfun(Cntrl, StoreheadsOnSplitF),
    ct:print("storeheads now set to on\n"),

    %% test query: no change in output - split fun only applies on put
    SCFolder1 = key_range_folder(Cntrl, Bucket, StartKey, EndKey),
    SCF1 = SCFolder1(),
    ct:print(
        "after setting storeheads to on, expect no change in query output:\n"
        "number of siblings returned is still ~b\n",
        [element(2, SCF1)]
    ),
    ?NKEYS_IN_RANGE * 2 = element(2, SCF1),
    true = (SCF0 == SCF1),

    %% update some objects
    %% NOTE(review): the value-update expression here was mangled in the
    %% source dump ("{<>, C}", leaving V unbound-unused); reconstructed as a
    %% binary append - any changed value exercises the same path, but
    %% confirm against the upstream repository.
    BKVList1Updated =
        [
            {B, K, [{<<V/binary, "_updated">>, C}]}
         || {B, K, [{V, C}]} <- BKVList1
        ],
    ok = testutil:put_keys(Cntrl, 2, BKVList1Updated, none),
    ct:print("update ~b objects\n", [?NKEYS_UPDATED]),

    %% test query to show partial change: updated keys now count 1 sibling
    SCFolder2 = key_range_folder(Cntrl, Bucket, StartKey, EndKey),
    SCF2 = SCFolder2(),
    ct:print(
        "after updating, there should be a partial change (minus ~b siblings). Query returns ~b siblings:\n",
        [?NKEYS_UPDATED, element(2, SCF2)]
    ),
    true = (SCF0 /= SCF2),
    ?NEW_SIBLING_COUNT = element(2, SCF2),

    aae_controller:aae_close(Cntrl),
    RootPath = testutil:reset_filestructure().

%% Check that two logically identical split functions, wrapped the same way,
%% compare equal - so get-after-set round-trips through the controller.
splitfun_compare_functions(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath = filename:join(RootPath, "vnode1/"),
    Preflist = [{2, 0}],

    SplitF_1 = mock_aae_from_object_binary_for_storeheads(true),
    SplitF_2 = mock_aae_from_object_binary_for_storeheads(false),

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            Preflist,
            VnodePath,
            SplitF_1,
            %% log levels
            [info, warn, error, critical]
        ),

    %% this is essentially to test that two logically identical functions
    %% created separately, do indeed compare equal
    true =
        (aae_controller:wrapped_splitobjfun(SplitF_1) ==
            aae_controller:aae_get_object_splitfun(Cntrl)),
    ok = aae_controller:aae_set_object_splitfun(
        Cntrl, aae_controller:wrapped_splitobjfun(SplitF_2)
    ),
    true =
        (aae_controller:wrapped_splitobjfun(SplitF_2) ==
            aae_controller:aae_get_object_splitfun(Cntrl)),

    aae_controller:aae_close(Cntrl),
    RootPath = testutil:reset_filestructure().

%% Fixed last-modified timestamp used by the mock split functions below.
-define(APOINTINTIME, {1747, 917445, 410090}).
%% Mock split functions: both report size 42, one sibling, last-mod at the
%% fixed ?APOINTINTIME; they differ only in the trailing head binary.
mock_aae_from_object_binary_for_storeheads(true) ->
    fun(_ObjBin) ->
        {_Size = 42, _SibCount = 1, 0, _LastMods = [?APOINTINTIME], <<>>}
    end;
mock_aae_from_object_binary_for_storeheads(false) ->
    fun(_) ->
        {42, 1, 0, [?APOINTINTIME], term_to_binary(null)}
    end.

%% Return the async folder for a sibcount fold over [StartKey, EndKey) in
%% Bucket, accumulating {KeysSeen, TotalSibCount}.
key_range_folder(Cntrl, Bucket, StartKey, EndKey) ->
    Elements = [{sibcount, null}],
    SCFoldFun =
        fun(_FB, FK, FV, {FAccKL, FAccSc}) ->
            {sibcount, FSc} = lists:keyfind(sibcount, 1, FV),
            if
                (FK >= StartKey) and (FK < EndKey) ->
                    {[FK | FAccKL], FAccSc + FSc};
                %% was the obfuscated `el /= se ->` guard ("else" joke) -
                %% replaced with the idiomatic final branch
                true ->
                    {FAccKL, FAccSc}
            end
        end,
    SCInitAcc = {[], 0},
    {async, Folder} =
        aae_controller:aae_fold(
            Cntrl,
            {key_range, Bucket, StartKey, EndKey},
            all,
            SCFoldFun,
            SCInitAcc,
            Elements
        ),
    Folder.

%% An exchange against a peer that always replies not_supported should end
%% in the not_supported state with 0 deltas.
store_notsupported(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    SplitF = fun(_X) -> {rand:uniform(1000), 1, 0, null} end,
    RPid = self(),
    ReturnFun = fun(R) -> RPid ! {result, R} end,
    RepairFun = fun(_KL) -> null end,

    {ok, Cntrl1} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF,
            [info, warn, error, critical]
        ),

    BKVList = testutil:gen_keys([], 100),
    ok = testutil:put_keys(Cntrl1, 2, BKVList, none),

    {ok, _P1, GUID1} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 0}]}],
            [{exchange_notsupported_sendfun(), [{3, 0}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID1]),
    {ExchangeState1, 0} = testutil:start_receiver(),
    io:format("ExchangeState ~w~n", [ExchangeState1]),
    true = ExchangeState1 == not_supported,
    aae_controller:aae_close(Cntrl1),
    RootPath = testutil:reset_filestructure().

dual_store_compare_medium_so(_Config) ->
    dual_store_compare_tester(10000, leveled_so).

dual_store_compare_medium_ko(_Config) ->
    dual_store_compare_tester(10000, leveled_ko).

dual_store_compare_large_so(_Config) ->
    dual_store_compare_tester(100000, leveled_so).

dual_store_compare_large_ko(_Config) ->
    dual_store_compare_tester(100000, leveled_ko).

dual_store_compare_tester(InitialKeyCount, StoreType) ->
    % Setup two AAE controllers, each representing the same data. One store
    % will be split into three preflists, the other into two. The
    % preflists will be mapped as follows:
    % {2, 0} <-> {3, 0}
    % {2, 1} <-> {3, 1} & {3, 2}
    %
    % Think of these preflists in terms of needless partitions for test
    % purposes. Although this is a comparison between 2 'nodes', it is
    % more like a comparison between 2 clusters where n=1, there is 1
    % vnode, but data is still partitioned into either 2 or 3 partitions.
    % Don't try and make sense of this in terms of a ring - the
    % mock_vnode_coverage_fold tests have a more Riak ring-like setup.

    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    VnodePath2 = filename:join(RootPath, "vnode2/"),
    SplitF = fun(_X) -> {rand:uniform(1000), 1, 0, null} end,
    RPid = self(),
    ReturnFun = fun(R) -> RPid ! {result, R} end,
    RepairFun = fun(_KL) -> null end,

    %% Add a key filter fun that never matches
    KFF = fun({B, _K}) -> B =/= <<"SkipThisBucket">> end,

    {ok, Cntrl1} =
        aae_controller:aae_start(
            {parallel, StoreType},
            true,
            {1, 300},
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF,
            [warn, error, critical],
            [],
            KFF
        ),
    {ok, Cntrl2} =
        aae_controller:aae_start(
            {parallel, StoreType},
            true,
            {1, 300},
            [{3, 0}, {3, 1}, {3, 2}],
            VnodePath2,
            SplitF,
            [warn, error, critical],
            [],
            KFF
        ),

    initial_load(InitialKeyCount, Cntrl1, Cntrl2),

    SW1 = os:timestamp(),

    % Merged roots across all partitions of each store must agree
    ok = aae_controller:aae_mergeroot(
        Cntrl1,
        [{2, 0}, {2, 1}],
        ReturnFun
    ),
    Root1A = testutil:start_receiver(),
    ok = aae_controller:aae_mergeroot(
        Cntrl2,
        [{3, 0}, {3, 1}, {3, 2}],
        ReturnFun
    ),
    Root2A = testutil:start_receiver(),
    true = Root1A == Root2A,

    % Individual mapped partitions must also agree: {2,0} <-> {3,0}
    ok = aae_controller:aae_fetchroot(
        Cntrl1,
        [{2, 0}],
        ReturnFun
    ),
    [{{2, 0}, Root1B}] = testutil:start_receiver(),
    ok = aae_controller:aae_fetchroot(
        Cntrl2,
        [{3, 0}],
        ReturnFun
    ),
    [{{3, 0}, Root2B}] = testutil:start_receiver(),
    true = Root1B == Root2B,

    % ... and {2,1} <-> merge of {3,1} & {3,2}
    ok = aae_controller:aae_mergeroot(
        Cntrl1,
        [{2, 1}],
        ReturnFun
    ),
    Root1C = testutil:start_receiver(),
    ok = aae_controller:aae_mergeroot(
        Cntrl2,
        [{3, 1}, {3, 2}],
        ReturnFun
    ),
    Root2C = testutil:start_receiver(),
    true = Root1C == Root2C,

    %% Turn down logging in Cntrl1 and Cntrl2
    ok = aae_controller:aae_loglevel(Cntrl1, [warn, error, critical]),
    ok = aae_controller:aae_loglevel(Cntrl2, [warn, error, critical]),

    io:format(
        "Direct partition compare complete in ~w ms~n",
        [timer:now_diff(os:timestamp(), SW1) / 1000]
    ),

    % Change log levels
    ok = aae_controller:aae_loglevel(Cntrl1, [info, warn, error, critical]),
    ok = aae_controller:aae_loglevel(Cntrl2, [info, warn, error, critical]),

    % Now do a comparison based on some key range queries:
    SW2 = os:timestamp(),
    Bucket = integer_to_binary(1),
    StartKey = list_to_binary(string:right(integer_to_list(10), 6, $0)),
    EndKey = list_to_binary(string:right(integer_to_list(50), 6, $0)),
    Elements = [{sibcount, null}],
    SCFoldFun =
        fun(FB, FK, FV, {FAccKL, FAccSc}) ->
            {sibcount, FSc} = lists:keyfind(sibcount, 1, FV),
            % the fold should only be given keys inside the query range
            true = FB == Bucket,
            true = FK >= StartKey,
            true = FK < EndKey,
            {[FK | FAccKL], FAccSc + FSc}
        end,
    SCInitAcc = {[], 0},

    {async, SCFolder1} =
        aae_controller:aae_fold(
            Cntrl1,
            {key_range, Bucket, StartKey, EndKey},
            all,
            SCFoldFun,
            SCInitAcc,
            Elements
        ),
    {async, SCFolder2} =
        aae_controller:aae_fold(
            Cntrl2,
            {key_range, Bucket, StartKey, EndKey},
            all,
            SCFoldFun,
            SCInitAcc,
            Elements
        ),
    SCF1 = SCFolder1(),
    SCF2 = SCFolder2(),

    true = SCF1 == SCF2,
    true = element(2, SCF1) == 8,
    true = length(element(1, SCF1)) == 8,
    io:format(
        "Comparison through key range folder in ~w ms with results ~w~n",
        [timer:now_diff(os:timestamp(), SW2) / 1000, SCF1]
    ),

    % Confirm no differences when using different matching AAE exchanges
    SW3 = os:timestamp(),

    {ok, _P1, GUID1} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 0}]}],
            [{exchange_sendfun(Cntrl2), [{3, 0}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID1]),
    {ExchangeState1, 0} = testutil:start_receiver(),
    true = ExchangeState1 == root_compare,

    {ok, _P2, GUID2} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 1}]}],
            [{exchange_sendfun(Cntrl2), [{3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID2]),
    {ExchangeState2, 0} = testutil:start_receiver(),
    true = ExchangeState2 == root_compare,

    {ok, _P3, GUID3} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 0}, {2, 1}]}],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID3]),
    {ExchangeState3, 0} = testutil:start_receiver(),
    true = ExchangeState3 == root_compare,

    % Blue side split over two send funs
    {ok, _P4, GUID4} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID4]),
    {ExchangeState4, 0} = testutil:start_receiver(),
    true = ExchangeState4 == root_compare,

    % Put 10 new keys into Cntrl1 only, then expect clock_compare with 10
    % deltas from here on
    BKVListN = create_discrepancy(Cntrl1, InitialKeyCount),

    {ok, _P6, GUID6} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID6]),
    {ExchangeState6, 10} = testutil:start_receiver(),
    true = ExchangeState6 == clock_compare,

    % Same again, but request a missing partition, and should get same result

    {ok, _P6a, GUID6a} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}, {3, 3}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID6a]),
    {ExchangeState6a, 10} = testutil:start_receiver(),
    true = ExchangeState6a == clock_compare,

    % With scan_timeout of 0 the fetch_clocks stage cannot complete
    {ok, _P6b, GUID6b} =
        aae_exchange:start(
            full,
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}, {3, 3}]}],
            RepairFun,
            ReturnFun,
            none,
            [{scan_timeout, 0}, {max_results, 256}]
        ),
    io:format("Exchange id ~s~n", [GUID6b]),
    {timeout, 0} = testutil:start_receiver(),

    % Nothing repaired last time. The deltas are all new keys though, so
    % we can repair by adding them in to the other vnode

    RepairFun0 = testutil:repair_fun(BKVListN, Cntrl2, 3),
    {ok, _P7, GUID7} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun0,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID7]),
    {ExchangeState7, 10} = testutil:start_receiver(),
    true = ExchangeState7 == clock_compare,

    % After the repair the stores match again
    {ok, _P8, GUID8} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID8]),
    {ExchangeState8, 0} = testutil:start_receiver(),
    true = ExchangeState8 == root_compare,

    io:format(
        "Comparison through exchange complete in ~w ms~n",
        [timer:now_diff(os:timestamp(), SW3) / 1000]
    ),

    % Shutdown and tidy up
    ok = aae_controller:aae_close(Cntrl1),
    ok = aae_controller:aae_close(Cntrl2),
    RootPath = testutil:reset_filestructure().
%% Load an identical (modulo insertion order) key population into both
%% controllers: an initial put batch, some removals, then a batch of
%% replacements.
initial_load(InitialKeyCount, Cntrl1, Cntrl2) ->
    SW0 = os:timestamp(),

    BKVListXS = testutil:gen_keys([], InitialKeyCount),
    %% NOTE(review): lists:split(20, _) KEEPS the first 20 keys and discards
    %% the rest, but the comment below says the first 20 are discarded -
    %% confirm which was intended against the upstream repository.
    {BKVList, _Discard} = lists:split(20, BKVListXS),
    % The first 20 keys discarded to create an overlap between the add
    % replace list
    ok = testutil:put_keys(Cntrl1, 2, BKVList, none),
    ok = testutil:put_keys(Cntrl2, 3, lists:reverse(BKVList), none),

    %% Remove the same 10 keys from both sides
    {BKVListRem, _Ignore} = lists:split(10, BKVList),
    ok = testutil:remove_keys(Cntrl1, 2, BKVListRem),
    ok = testutil:remove_keys(Cntrl2, 3, BKVListRem),

    % Change all of the keys - cheat by using undefined rather than replace
    % properly

    BKVListR = testutil:gen_keys([], 100),
    % As 100 > 20 expect 20 of these keys to be new, so no clock will be
    % returned from fetch_clock, and 80 of these will be updates
    ok = testutil:put_keys(Cntrl1, 2, BKVListR, undefined),
    ok = testutil:put_keys(Cntrl2, 3, BKVListR, undefined),

    io:format(
        "Initial put complete in ~w ms~n",
        [timer:now_diff(os:timestamp(), SW0) / 1000]
    ).

%% Put 10 keys (beyond InitialKeyCount) into Cntrl only, logging the tree
%% segment and preflist of each so a failing exchange can be diagnosed.
%% Returns the new keys so a repair fun can push them to the other store.
create_discrepancy(Cntrl, InitialKeyCount) ->
    % Create a discrepancy and discover it through exchange
    BKVListN = testutil:gen_keys([], InitialKeyCount + 10, InitialKeyCount),
    _SL = lists:foldl(
        fun({B, K, _V}, Acc) ->
            BK = aae_util:make_binarykey(B, K),
            Seg = leveled_tictac:keyto_segment48(BK),
            Seg0 = aae_keystore:generate_treesegment(Seg),
            io:format(
                "Generate new key B ~w K ~w " ++
                    "for Segment ~w ~w ~w partition ~w ~w~n",
                [
                    B,
                    K,
                    Seg0,
                    Seg0 bsr 8,
                    Seg0 band 255,
                    testutil:calc_preflist(K, 2),
                    testutil:calc_preflist(K, 3)
                ]
            ),
            [Seg0 | Acc]
        end,
        [],
        BKVListN
    ),
    ok = testutil:put_keys(Cntrl, 2, BKVListN),
    BKVListN.
%% Delegate to the shared test utility send fun for a real controller.
exchange_sendfun(Cntrl) -> testutil:exchange_sendfun(Cntrl).

%% Send fun for a peer that does not support AAE exchanges: every message is
%% answered immediately with not_supported (self() here is the exchange
%% process invoking the fun, which is where the reply must go).
exchange_notsupported_sendfun() ->
    SendFun =
        fun(_Msg, _Preflists, Colour) ->
            RPid = self(),
            aae_exchange:reply(RPid, not_supported, Colour)
        end,
    SendFun.
-------------------------------------------------------------------------------- /test/end_to_end/mock_kv_vnode.erl: --------------------------------------------------------------------------------
%% -------- Overview ---------
%%
%% A simplified mock of riak_kv_vnode for testing

-module(mock_kv_vnode).

-behaviour(gen_server).

-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2,
    code_change/3
]).

-export([
    open/5,
    put/4,
    read_repair/4,
    push/6,
    backend_delete/4,
    exchange_message/4,
    rebuild/2,
    rehash/4,
    rebuild_complete/2,
    fold_aae/6,
    bucketlist_aae/1,
    reset_keyfilter/1,
    close/1
]).

-export([
    extractclock_from_riakhead/1,
    from_aae_binary/1,
    new_v1/2,
    workerfun/1,
    rebuild_worker/1,
    fold_worker/0
]).

%% Minimal copies of riak_object's internal records, sufficient for tests.
-record(r_content, {
    metadata,
    value :: term()
}).

-record(r_object, {
    bucket,
    key,
    contents :: [#r_content{}],
    vclock = [],
    updatemetadata = dict:store(clean, true, dict:new()),
    updatevalue :: term()
}).

%% Startup options for open/5.
-record(options, {
    aae :: parallel_so | parallel_ko | native,
    index_ns :: list(tuple()),
    root_path :: list(),
    preflist_fun = null :: preflist_fun(),
    key_filter = none :: aae_controller:key_include_fun()
}).
%% Runtime state of the mock vnode process.
-record(state, {
    root_path :: list(),
    index_ns :: list(tuple()),
    aae_controller :: pid(),
    vnode_store :: pid(),
    vnode_id :: binary(),
    aae_type :: tuple(),
    vnode_sqn = 1 :: integer(),
    preflist_fun = null :: preflist_fun(),
    aae_rebuild = false :: boolean()
}).

-include_lib("eunit/include/eunit.hrl").

-define(RIAK_TAG, o_rkv).
%% Default rebuild schedule passed to aae_controller:aae_start/9
-define(REBUILD_SCHEDULE, {1, 60}).
-define(LASTMOD_LEN, 29).
-define(V1_VERS, 1).
-define(MAGIC, 53).
-define(EMPTY_VTAG_BIN, <<"e">>).
-define(MAGIC_KEYS, [<<48, 48, 48, 52, 57, 51>>]).
%% Interval (ms) for the periodic self-poke message scheduled in init/1
-define(POKE_TIME, 1000).

-type r_object() :: #r_object{}.
-type preflist_fun() :: null | fun((term(), term()) -> non_neg_integer()).
-type fold_objects_fun() :: fun((term(), term(), term(), term()) -> term()).
-type folder() :: fun(() -> term()).

%%%============================================================================
%%% API
%%%============================================================================

-spec open(
    list(),
    atom(),
    list(tuple()),
    preflist_fun() | null,
    aae_controller:key_include_fun()
) -> {ok, pid()}.
%% @doc
%% Open a mock vnode
open(Path, AAEType, IndexNs, PreflistFun, KFF) ->
    gen_server:start(
        ?MODULE,
        [
            #options{
                aae = AAEType,
                index_ns = IndexNs,
                root_path = Path,
                preflist_fun = PreflistFun,
                key_filter = KFF
            }
        ],
        []
    ).

-spec put(pid(), r_object(), tuple(), list(pid())) -> ok.
%% @doc
%% Put a new object in the store, updating AAE - and co-ordinating
put(Vnode, Object, IndexN, OtherVnodes) ->
    gen_server:call(Vnode, {put, Object, IndexN, OtherVnodes}).

-spec read_repair(pid(), r_object(), tuple(), list(pid())) -> ok.
%% @doc
%% Fetch the version vector from this store, and push the completed object
%% to another
read_repair(Vnode, Object, IndexN, OtherVnodes) ->
    gen_server:call(Vnode, {read_repair, Object, IndexN, OtherVnodes}).

-spec push(pid(), binary(), binary(), list(tuple()), binary(), tuple()) -> ok.
%% @doc
%% Push a new object in the store, updating AAE
%% (cast - fire-and-forget, unlike put/4 which is synchronous)
push(Vnode, Bucket, Key, UpdClock, ObjectBin, IndexN) ->
    gen_server:cast(Vnode, {push, Bucket, Key, UpdClock, ObjectBin, IndexN}).

-spec backend_delete(pid(), binary(), binary(), tuple()) -> ok.
%% @doc
%% Delete an object from the backend
backend_delete(Vnode, Bucket, Key, IndexN) ->
    gen_server:call(Vnode, {delete, Bucket, Key, IndexN}).

-spec rebuild(pid(), boolean()) -> {erlang:timestamp(), boolean()}.
%% @doc
%% Prompt for the next rebuild time, using ForceRebuild=true to override that
%% time and trigger a rebuild.  As well as the next rebuild time the response
%% includes if a rebuild is currently in progress
rebuild(Vnode, ForceRebuild) ->
    gen_server:call(Vnode, {rebuild, ForceRebuild}).

-spec rebuild_complete(pid(), store | tree) -> ok.
%% @doc
%% Prompt for the rebuild of the tree
rebuild_complete(Vnode, Stage) ->
    gen_server:cast(Vnode, {rebuild_complete, Stage}).

-spec rehash(pid(), binary(), binary(), tuple()) -> ok.
%% @doc
%% Prompt a given key to be rehashed
rehash(Vnode, Bucket, Key, IndexN) ->
    gen_server:call(Vnode, {rehash, Bucket, Key, IndexN}).

-spec fold_aae(
    pid(),
    aae_keystore:range_limiter(),
    aae_keystore:segment_limiter(),
    fold_objects_fun(),
    any(),
    list(aae_keystore:value_element())
) -> {async, folder()}.
%% @doc
%% Fold over the heads in the aae store (which may be the key store when
%% running in native mode)
fold_aae(Vnode, Range, Segments, FoldObjectsFun, InitAcc, Elements) ->
    gen_server:call(
        Vnode,
        {fold_aae, Range, Segments, FoldObjectsFun, InitAcc, Elements}
    ).

%% Callback used to hand exchange results back to the caller.
-type return_fun() :: fun((any()) -> ok).

-spec exchange_message(pid(), tuple() | atom(), list(tuple()), return_fun()) ->
    ok.
%% @doc
%% Handle a message from an AAE exchange
exchange_message(Vnode, Msg, IndexNs, ReturnFun) ->
    gen_server:call(Vnode, {aae, Msg, IndexNs, ReturnFun}).

-spec reset_keyfilter(pid()) -> ok.
%% @doc
%% Reset the key filter on the underlying aae_controller (async cast)
reset_keyfilter(Pid) ->
    gen_server:cast(Pid, reset_keyfilter).

-spec bucketlist_aae(pid()) -> {async, fun(() -> list())}.
%% @doc
%% List buckets via AAE store
bucketlist_aae(Vnode) ->
    gen_server:call(Vnode, bucketlist_aae).

-spec close(pid()) -> ok.
%% @doc
%% Close the vnode, and any aae controller
close(Vnode) ->
    gen_server:call(Vnode, close).
%%%============================================================================
%%% gen_server callbacks
%%%============================================================================

%% @doc
%% Start the backend (leveled) store for the vnode, determine whether the
%% store is empty, then start an aae_controller in the configured mode
%% (native, parallel_so or parallel_ko) before scheduling the first poke.
init([Opts]) ->
    % Start the vnode backend
    % Get the shutdown GUID
    % Delete the shutdown GUID
    % Check is_empty
    % Start the aae_controller
    % Report back OK
    RP = Opts#options.root_path,
    {ok, VnSt} =
        leveled_bookie:book_start(RP, 4000, 100000000, none),
    IsEmpty = leveled_bookie:book_isempty(VnSt, ?RIAK_TAG),
    BackendOpts = aae_keystore:store_generate_backendoptions(),
    % Select the keystore type, and any backend options specific to that
    % type of store
    {KeyStoreType, UpdBackendOpts} =
        case Opts#options.aae of
            native ->
                {
                    {native, leveled_nko, VnSt},
                    BackendOpts
                };
            parallel_so ->
                {
                    {parallel, leveled_so},
                    aae_keystore:store_setbackendoption(
                        max_pencillercachesize,
                        12000,
                        BackendOpts
                    )
                };
            parallel_ko ->
                AltOpts =
                    [
                        {max_journalobjectcount, 1000},
                        {database_id, 65534},
                        {snapshot_timeout_short, 360},
                        {snapshot_timeout_long, 3600},
                        {compression_method, zstd},
                        {
                            forced_logs,
                            [b0015, b0016, b0017, b0018, p0032, sst12]
                        },
                        {log_level, warn},
                        {stats_logfrequency, 120}
                    ],

                {
                    {parallel, leveled_ko},
                    lists:foldl(
                        fun({K, S}, AccOpts) ->
                            aae_keystore:store_setbackendoption(
                                K, S, AccOpts
                            )
                        end,
                        BackendOpts,
                        AltOpts
                    )
                }
        end,
    {ok, AAECntrl} =
        aae_controller:aae_start(
            KeyStoreType,
            IsEmpty,
            ?REBUILD_SCHEDULE,
            Opts#options.index_ns,
            RP,
            fun from_aae_binary/1,
            undefined,
            UpdBackendOpts,
            Opts#options.key_filter
        ),
    erlang:send_after(?POKE_TIME, self(), poke),
    {ok, #state{
        root_path = RP,
        aae_type = KeyStoreType,
        vnode_store =
            VnSt,
        index_ns = Opts#options.index_ns,
        aae_controller = AAECntrl,
        vnode_id = list_to_binary(leveled_util:generate_uuid()),
        preflist_fun = Opts#options.preflist_fun
    }}.

%% read_repair: if the object is present locally, push the local version of
%% the object (with its local clock) to the other vnodes.
handle_call({read_repair, Object, IndexN, OtherVnodes}, _From, State) ->
    Bucket = Object#r_object.bucket,
    Key = Object#r_object.key,
    case
        leveled_bookie:book_head(
            State#state.vnode_store,
            Bucket,
            Key,
            ?RIAK_TAG
        )
    of
        not_found ->
            {reply, ok, State};
        {ok, Head} ->
            Clock = extractclock_from_riakhead(Head),
            ObjectBin = new_v1(Clock, Object#r_object.contents),
            PushFun =
                fun(VN) ->
                    push(VN, Bucket, Key, Clock, ObjectBin, IndexN)
                end,
            lists:foreach(PushFun, OtherVnodes),
            {reply, ok, State}
    end;
handle_call({put, Object, IndexN, OtherVnodes}, _From, State) ->
    % Get Bucket and Key from object
    % Do head request
    % Compare clock, update clock
    % Send update to other stores
    % Update AAE
    % Report back OK
    Bucket = Object#r_object.bucket,
    Key = Object#r_object.key,

    % Build the updated clock: add this vnode's {id, sqn} entry to any
    % existing clock (ukeysort dedupes by vnode id), or start a new clock
    {UpdClock, PrevClock} =
        case
            leveled_bookie:book_head(
                State#state.vnode_store,
                Bucket,
                Key,
                ?RIAK_TAG
            )
        of
            not_found ->
                {[{State#state.vnode_id, State#state.vnode_sqn}], none};
            {ok, Head} ->
                Clock0 =
                    extractclock_from_riakhead(Head),
                Clock1 =
                    [{State#state.vnode_id, State#state.vnode_sqn} | Clock0],
                {lists:ukeysort(1, Clock1), Clock0}
        end,
    ObjectBin = new_v1(UpdClock, Object#r_object.contents),
    VVEBin = to_aae_binary(ObjectBin),
    leveled_bookie:book_put(
        State#state.vnode_store,
        Bucket,
        Key,
        ObjectBin,
        [],
        ?RIAK_TAG
    ),

    ok = aae_controller:aae_put(
        State#state.aae_controller,
        IndexN,
        Bucket,
        Key,
        UpdClock,

        PrevClock,
        VVEBin
    ),

    % Replicate to the other vnodes (async push - no vclock increment there)
    lists:foreach(
        fun(VN) ->
            push(VN, Bucket, Key, UpdClock, ObjectBin, IndexN)
        end,
        OtherVnodes
    ),

    {reply, ok, State#state{vnode_sqn = State#state.vnode_sqn + 1}};
%% delete: remove from the backend and record a `none` clock in the aae
%% store so the deletion is reflected in the tree
handle_call({delete, Bucket, Key, IndexN}, _From, State) ->
    PrevClock =
        case
            leveled_bookie:book_head(
                State#state.vnode_store,
                Bucket,
                Key,
                ?RIAK_TAG
            )
        of
            not_found ->
                none;
            {ok, Head} ->
                extractclock_from_riakhead(Head)
        end,
    leveled_bookie:book_put(
        State#state.vnode_store,
        Bucket,
        Key,
        delete,
        [],
        ?RIAK_TAG
    ),
    ok = aae_controller:aae_put(
        State#state.aae_controller,
        IndexN,
        Bucket,
        Key,
        none,
        PrevClock,
        <<>>
    ),
    {reply, ok, State};
handle_call({rebuild, true}, _From, State) ->
    % To rebuild the store an Object SplitFun will be required if is is a
    % parallel store, which will depend on the preflist_fun.
    NRT = aae_controller:aae_nextrebuild(State#state.aae_controller),

    SplitFun =
        fun(B, K, V) ->
            PreflistFun = State#state.preflist_fun,
            IndexN = PreflistFun(B, K),
            Clock = extractclock_from_riakhead(V),
            {IndexN, Clock}
        end,
    Vnode = self(),
    ReturnFun =
        fun(ok) ->
            ok = rebuild_complete(Vnode, store)
        end,

    case
        aae_controller:aae_rebuildstore(
            State#state.aae_controller,
            SplitFun
        )
    of
        ok ->
            % This store is rebuilt already (i.e. it is native), so nothing to
            % do here other than prompt the status change
            ReturnFun(ok);
        {ok, FoldFun, FinishFun} ->
            Worker = workerfun({rebuild_worker, [ReturnFun]}),
            % Now need to get a fold query to run over the vnode store to
            % rebuild the parallel store.
            % The aae_controller has provided
            % the object fold fun which should load the parallel store, and
            % the finish fun which should tell the controller the fold is
            % complete and prompt the finishing of the rebuild activity
            {async, Runner} =
                leveled_bookie:book_headfold(
                    State#state.vnode_store,
                    ?RIAK_TAG,
                    {FoldFun, []},
                    true,
                    true,
                    false
                ),
            % dispatch the work to the worker
            Worker(Runner, FinishFun)
    end,
    {reply, {NRT, true}, State#state{aae_rebuild = true}};
handle_call({rebuild, false}, _From, State) ->
    % Check next rebuild
    % Reply with next rebuild TS - and the status to indicate an ongoing
    % rebuild
    NRT = aae_controller:aae_nextrebuild(State#state.aae_controller),
    {reply, {NRT, State#state.aae_rebuild}, State};
%% rehash: re-read the object and re-submit it to the aae store with an
%% undefined previous clock (forcing the segment to be recalculated)
handle_call({rehash, Bucket, Key, IndexN}, _From, State) ->
    case
        leveled_bookie:book_head(
            State#state.vnode_store,
            Bucket,
            Key,
            ?RIAK_TAG
        )
    of
        not_found ->
            ok = aae_controller:aae_put(
                State#state.aae_controller,
                IndexN,
                Bucket,
                Key,
                none,
                undefined,
                <<>>
            );
        {ok, Head} ->
            C0 = extractclock_from_riakhead(Head),
            ok = aae_controller:aae_put(
                State#state.aae_controller,
                IndexN,
                Bucket,
                Key,
                C0,
                undefined,
                to_aae_binary(Head)
            )
    end,
    {reply, ok, State};
%% aae: dispatch exchange messages to the relevant aae_controller API;
%% results are returned asynchronously via ReturnFun (folds run in a
%% spawned worker so the vnode is not blocked)
handle_call({aae, Msg, IndexNs, ReturnFun}, _From, State) ->
    case Msg of
        fetch_root ->
            aae_controller:aae_mergeroot(
                State#state.aae_controller,
                IndexNs,
                ReturnFun
            );
        {fetch_branches, BranchIDs} ->
            aae_controller:aae_mergebranches(
                State#state.aae_controller,
                IndexNs,
                BranchIDs,
                ReturnFun
            );
        {fetch_clocks, SegmentIDs} ->
            aae_controller:aae_fetchclocks(
                State#state.aae_controller,

                IndexNs,
                SegmentIDs,
                ReturnFun,
                State#state.preflist_fun
            );
        {fetch_clocks, SegmentIDs, MR} ->
            aae_controller:aae_fetchclocks(
                State#state.aae_controller,
                IndexNs,
                all,
                SegmentIDs,
                MR,
                ReturnFun,
                State#state.preflist_fun
            );
        {merge_tree_range, B, KR, TS, SF, MR, HM} ->
            NullExtractFun =
                fun({B0, K0}, V0) ->
                    {aae_util:make_binarykey(B0, K0), V0}
                end,
            % HM selects how the hash is derived: pre_hash uses the hash
            % stored in the aae store; {rehash, IV} recomputes from the
            % sorted clock with an initialisation vector
            {FoldFun, Elements} =
                case HM of
                    pre_hash ->
                        {
                            fun(BF, KF, EFs, TreeAcc) ->
                                {hash, CH} = lists:keyfind(hash, 1, EFs),
                                leveled_tictac:add_kv(
                                    TreeAcc,
                                    {BF, KF},
                                    {is_hash, CH},
                                    NullExtractFun
                                )
                            end,
                            [{hash, null}]
                        };
                    {rehash, IV} ->
                        {
                            fun(BF, KF, EFs, TreeAcc) ->
                                {clock, VC} = lists:keyfind(clock, 1, EFs),
                                CH = erlang:phash2({IV, lists:sort(VC)}),
                                leveled_tictac:add_kv(
                                    TreeAcc,
                                    {BF, KF},
                                    {is_hash, CH},
                                    NullExtractFun
                                )
                            end,
                            [{clock, null}]
                        }
                end,
            InitAcc = leveled_tictac:new_tree(State#state.vnode_id, TS),
            RangeLimiter = aaefold_setrangelimiter(B, KR),
            ModifiedLimiter = aaefold_setmodifiedlimiter(MR),
            {async, Folder} =
                aae_controller:aae_fold(
                    State#state.aae_controller,
                    RangeLimiter,
                    SF,
                    ModifiedLimiter,
                    false,
                    FoldFun,
                    InitAcc,
                    Elements
                ),
            Worker = workerfun({fold_worker, []}),
            Worker(Folder, ReturnFun);
        {fetch_clocks_range, B, KR, SF, MR} ->
            FoldFun =
                fun(BF, KF, EFs, KeyClockAcc) ->
                    magickey_check(KF, State#state.aae_type),
                    {clock, VV} = lists:keyfind(clock, 1, EFs),
                    [{BF, KF, VV} | KeyClockAcc]
                end,
            RangeLimiter = aaefold_setrangelimiter(B, KR),
            ModifiedLimiter = aaefold_setmodifiedlimiter(MR),
            {async, Folder} =
                aae_controller:aae_fold(
                    State#state.aae_controller,

                    RangeLimiter,
                    SF,
                    ModifiedLimiter,
                    false,
                    FoldFun,
                    [],
                    [{clock, null}]
                ),
            Worker = workerfun({fold_worker, []}),
            Worker(Folder, ReturnFun)
    end,
    {reply, ok, State};
handle_call(
    {fold_aae, Range, Segments, FoldFun, InitAcc, Elements},
    _From,
    State
) ->
    R = aae_controller:aae_fold(
        State#state.aae_controller,
        Range,
        Segments,
        FoldFun,
        InitAcc,
        Elements
    ),
    {reply, R, State};
handle_call(bucketlist_aae, _From, State) ->
    R = aae_controller:aae_bucketlist(State#state.aae_controller),
    {reply, R, State};
handle_call(close, _From, State) ->
    ok = aae_controller:aae_close(State#state.aae_controller),
    ok = leveled_bookie:book_close(State#state.vnode_store),
    {stop, normal, ok, State}.

handle_cast({push, Bucket, Key, UpdClock, ObjectBin, IndexN}, State) ->
    % As PUT, but don't increment vclock, replace regardless of current state
    PrevClock =
        case
            leveled_bookie:book_head(
                State#state.vnode_store,
                Bucket,
                Key,
                ?RIAK_TAG
            )
        of
            not_found ->
                none;
            {ok, Head} ->
                extractclock_from_riakhead(Head)
        end,
    leveled_bookie:book_put(
        State#state.vnode_store,
        Bucket,
        Key,
        ObjectBin,
        [],
        ?RIAK_TAG
    ),

    ok = aae_controller:aae_put(
        State#state.aae_controller,
        IndexN,
        Bucket,
        Key,
        UpdClock,
        PrevClock,
        to_aae_binary(ObjectBin)
    ),

    {noreply, State};
handle_cast({rebuild_complete, store}, State) ->
    % Trigger a rebuild of the tree.
    % Will require a non-null preflist_fun
    % if the store is native (as the native store will not store the IndexN,
    % and so a recalculation will be required)
    Vnode = self(),
    ReturnFun =
        fun(ok) ->
            ok = rebuild_complete(Vnode, tree)
        end,
    Worker = workerfun({rebuild_worker, [ReturnFun]}),
    case
        aae_controller:aae_rebuildtrees(
            State#state.aae_controller,
            State#state.index_ns,
            State#state.preflist_fun,
            Worker,
            false
        )
    of
        ok ->
            {noreply, State#state{aae_rebuild = true}};
        loading ->
            % NOTE(review): timer:sleep/1 inside a gen_server callback blocks
            % this vnode for 1s as a back-off before retrying - acceptable in
            % a test mock, but confirm this is intended behaviour
            gen_server:cast(self(), {rebuild_complete, store}),
            timer:sleep(1000),
            {noreply, State}
    end;
handle_cast({rebuild_complete, tree}, State) ->
    {noreply, State#state{aae_rebuild = false}};
handle_cast(reset_keyfilter, State) ->
    ok = aae_controller:aae_reset_key_filter(State#state.aae_controller),
    {noreply, State}.

%% poke/aae_pong: periodic liveness ping to the controller; the pong logs
%% the controller queue time and schedules the next poke
handle_info(poke, State) ->
    ok = aae_controller:aae_ping(
        State#state.aae_controller,
        os:timestamp(),
        self()
    ),
    {noreply, State};
handle_info({aae_pong, QueueTime}, State) ->
    io:format("Queuetime in microseconds ~w~n", [QueueTime]),
    erlang:send_after(?POKE_TIME, self(), poke),
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%============================================================================
%%% External functions
%%%============================================================================

-spec extractclock_from_riakhead(binary()) -> list(tuple()).
%% @doc
%% Extract the vector clock from a riak binary object (without doing a full
%% binary to object conversion)
%% NOTE(review): the binary pattern below appears mangled in this copy of the
%% file (the `<<...>>` contents have been lost) - restore from upstream before
%% compiling; it should bind VclockBin from the v1 object header
extractclock_from_riakhead(
    <>
) ->
    lists:usort(binary_to_term(VclockBin));
extractclock_from_riakhead(RiakHead) ->
    % A proxy_object (head response) wraps the real head binary - unwrap
    % and recurse
    {proxy_object, HeadBin, _Size, _F} = binary_to_term(RiakHead),
    extractclock_from_riakhead(HeadBin).

%% V1 Riak Object Binary Encoding
%% -type binobj_header() :: <<53:8, Version:8, VClockLen:32, VClockBin/binary,
%%                            SibCount:32>>.
%% -type binobj_flags() :: <>.
%% -type binobj_umeta_pair() :: <>.
%% -type binobj_meta() :: <>.
%% -type binobj_value() :: <>.
%% -type binobj() :: <>.
%% NOTE(review): binary construction below mangled in this copy (`<>`) -
%% should assemble the v1 header followed by the sibling binaries
new_v1(Vclock, Siblings) ->
    VclockBin = term_to_binary(Vclock),
    VclockLen = byte_size(VclockBin),
    SibCount = length(Siblings),
    SibsBin = bin_contents(Siblings),
    <>.

%% Encode a single sibling as <<ValLen, ValBin, MetaLen, MetaBin>>
%% (construction mangled in this copy)
bin_content(#r_content{metadata = Meta0, value = Val}) ->
    TypeTag = 1,
    ValBin = encode_maybe_binary(Val, TypeTag),
    ValLen = byte_size(ValBin),
    MetaBin = meta_bin(Meta0),
    MetaLen = byte_size(MetaBin),
    <>.

%% Prefix a binary value with its type tag (construction mangled in this copy)
encode_maybe_binary(Value, TypeTag) when is_binary(Value) ->
    <>.

%% Concatenate the encoded siblings (construction mangled in this copy)
bin_contents(Contents) ->
    F = fun(Content, Acc) ->
        <>
    end,
    lists:foldl(F, <<>>, Contents).

%% Encode sibling metadata: last-modified timestamp, vtag, deleted flag and
%% the remaining metadata as an external term (construction mangled in this
%% copy)
meta_bin(MetaData) ->
    {last_modified_date, {Mega, Secs, Micro}} =
        lists:keyfind(last_modified_date, 1, MetaData),
    LastModBin = <>,
    Deleted = <<0>>,
    RestBin = term_to_binary(MetaData),
    VTagBin = ?EMPTY_VTAG_BIN,
    VTagLen = byte_size(VTagBin),
    <>.

%% @doc
%% Spawn the named worker (rebuild_worker/1 or fold_worker/0) and return a
%% fun which dispatches a fold plus completion fun to it
workerfun({WorkerFun, Args}) ->
    WorkerPid = spawn(?MODULE, WorkerFun, Args),
    fun(FoldFun, FinishFun) ->
        WorkerPid ! {fold, FoldFun, FinishFun}
    end.
%% @doc
%% Single-shot worker: run the fold, hand the result to FinishFun, then
%% confirm completion via ReturnFun
rebuild_worker(ReturnFun) ->
    receive
        {fold, FoldFun, FinishFun} ->
            FinishFun(FoldFun()),
            ReturnFun(ok)
    end.

%% @doc
%% Single-shot worker: run the fold, log its duration, and return the result
fold_worker() ->
    receive
        {fold, FoldFun, ReturnFun} ->
            SW0 = os:timestamp(),
            R = FoldFun(),
            io:format(
                "FoldFun took ~w ms~n",
                [timer:now_diff(os:timestamp(), SW0) div 1000]
            ),
            ReturnFun(R)
    end.

%% @doc
%% Decode the compact AAE value binary produced by to_aae_binary/1
%% NOTE(review): the binary pattern is mangled in this copy (`<>`); it should
%% bind ObjectSize, SibCount, IndexHash, the LMD triple and MDOnly as listed
%% in the return value below
from_aae_binary(AAEBin) ->
    <> = AAEBin,
    {ObjectSize, SibCount, IndexHash, [{LMDmeg, LMDsec, LMDmcr}], MDOnly}.

%%%============================================================================
%%% Internal functions
%%%============================================================================

%% @doc
%% Convert the format of the range limiter to one compatible with the aae store
aaefold_setrangelimiter(all, all) ->
    all;
aaefold_setrangelimiter(Bucket, all) ->
    {buckets, [Bucket]};
aaefold_setrangelimiter(Bucket, {StartKey, EndKey}) ->
    {key_range, Bucket, StartKey, EndKey}.

%% @doc
%% Convert the format of the date limiter to one compatible with the aae store
aaefold_setmodifiedlimiter({LowModDate, HighModDate}) when
    is_integer(LowModDate), is_integer(HighModDate)
->
    {LowModDate, HighModDate};
aaefold_setmodifiedlimiter(_) ->
    all.

%% @doc
%% Reduce a full object binary to the compact value kept in the AAE store:
%% size, sibling count, (faked) index hash, max last-modified date and the
%% stripped metadata
%% NOTE(review): both binary patterns are mangled in this copy (`<>`) -
%% the match should destructure the v1 header, and the construction should
%% mirror the fields decoded by from_aae_binary/1
to_aae_binary(ObjectBin) ->
    ObjectSize = byte_size(ObjectBin),
    <> = ObjectBin,

    % faking here
    IndexHash = erlang:phash2([]),

    {{LMDmeg, LMDsec, LMDmcr}, MD} =
        strip_metabinary(SibCount, SibsBin, {0, 0, 0}, <<>>),

    <>.
%% @doc
%% Walk the sibling binaries accumulating the metadata sections and tracking
%% the greatest last-modified date seen across siblings
%% NOTE(review): binary patterns mangled in this copy (`<>`) - the first match
%% should skip the value and bind MetaBin/Rest; the second should bind the
%% LMD fields from the metadata header
strip_metabinary(0, <<>>, LMD, MetaBinAcc) ->
    {LMD, MetaBinAcc};
strip_metabinary(SibCount, SibBin, LMD, MetaBinAcc) ->
    <> = SibBin,
    <> = MetaBin,
    LMD0 = max({LMDmega, LMDsec, LMDmicro}, LMD),
    strip_metabinary(
        SibCount - 1,
        Rest,
        LMD0,
        <>
    ).

%% @doc
%% Log when one of the ?MAGIC_KEYS passes through a clock fold - a test aid
%% for tracing specific keys
magickey_check(Key, VnodeType) ->
    case lists:member(Key, ?MAGIC_KEYS) of
        true ->
            io:format("Magic key ~w at VnodeType ~w~n", [Key, VnodeType]);
        false ->
            ok
    end.

%%%============================================================================
%%% Test
%%%============================================================================

-ifdef(TEST).

-endif.
-------------------------------------------------------------------------------- /src/aae_treecache.erl: --------------------------------------------------------------------------------
%% -------- Overview ---------
%%
%% Caches a TicTac merkle tree for a partition, persisting it to disk on a
%% clean close and restoring it on open.

-module(aae_treecache).

-behaviour(gen_server).

-include("aae.hrl").

-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2,
    code_change/3,
    format_status/1
]).

-export([
    cache_open/3,
    cache_new/3,
    cache_alter/4,
    cache_root/1,
    cache_leaves/2,
    cache_markdirtysegments/3,
    cache_replacedirtysegments/3,
    cache_destroy/1,
    cache_startload/1,
    cache_completeload/2,
    cache_loglevel/2,
    cache_close/1,
    cache_segment_count/1
]).

% File extension while a save is in flight; renamed to ?FINAL_EXT on success
-define(PENDING_EXT, ".pnd").
-define(FINAL_EXT, ".aae").
-define(START_SQN, 1).
-define(SYNC_TIMEOUT, 30000).
-record(state, {
    % SQN used to name the next on-disk save
    save_sqn = 0 :: integer(),
    % true when the tree was recovered from a file on disk
    is_restored = false :: boolean(),
    tree :: leveled_tictac:tictactree() | undefined,
    root_path :: list() | undefined,
    partition_id :: {integer(), integer()} | integer() | undefined,
    % true while a keystore fold is rebuilding the tree
    loading = false :: boolean(),
    dirty_segments = [] :: list(),
    % GUID of the fold currently allowed to replace dirty segments
    active_fold :: string() | undefined,
    change_queue = [] :: list() | redacted,
    queued_changes = 0 :: non_neg_integer(),
    % only save on close once the tree is known to be trustworthy
    safe_save = false :: boolean()
}).

-type partition_id() :: integer() | {integer(), integer()}.

%%%============================================================================
%%% API
%%%============================================================================

-spec cache_open(
    list(), partition_id(), aae_util:log_levels()
) -> {boolean(), pid()}.
%% @doc
%% Open a tree cache, restoring from any previously saved file for this
%% partition. The boolean in the return is true when a saved tree was
%% restored (i.e. the cache is not starting empty); the pid is the cache
%% process.
cache_open(RootPath, PartitionID, LogLevels) ->
    StartOpts =
        [
            {root_path, RootPath},
            {partition_id, PartitionID},
            {log_levels, LogLevels}
        ],
    {ok, CachePid} = gen_server:start_link(?MODULE, [StartOpts], []),
    RestoredFromDisk = gen_server:call(CachePid, is_restored, infinity),
    {RestoredFromDisk, CachePid}.

-spec cache_new(
    list(), partition_id(), aae_util:log_levels()
) -> {ok, pid()}.
%% @doc
%% Open a tree cache starting from an empty tree, ignoring (and clearing)
%% anything saved on disk.
cache_new(RootPath, PartitionID, LogLevels) ->
    StartOpts =
        [
            {root_path, RootPath},
            {partition_id, PartitionID},
            {ignore_disk, true},
            {log_levels, LogLevels}
        ],
    gen_server:start_link(?MODULE, [StartOpts], []).

-spec cache_destroy(pid()) -> ok.
%% @doc
%% Stop the cache without saving the tree to disk.
cache_destroy(Cache) ->
    gen_server:cast(Cache, destroy).

-spec cache_segment_count(pid()) -> non_neg_integer().
%% @doc
%% Report how many segments are currently marked dirty (used for the
%% aae-progress-report).
cache_segment_count(Cache) ->
    gen_server:call(Cache, segment_count, ?SYNC_TIMEOUT).

-spec cache_close(pid()) -> ok.
%% @doc
%% Stop the cache, saving the tree to disk first (when safe to do so).
cache_close(Cache) ->
    gen_server:call(Cache, close, ?SYNC_TIMEOUT).

-spec cache_alter(pid(), binary(), integer() | none, integer() | none) -> ok.
%% @doc
%% Apply a change to the tree: add the current hash for a key and remove the
%% old one (either may be `none` for a pure add or a pure remove).
cache_alter(Cache, Key, CurrentHash, OldHash) ->
    gen_server:cast(Cache, {alter, Key, CurrentHash, OldHash}).

-spec cache_root(pid()) -> binary().
%% @doc
%% Return the root of the cached tree, for comparison in an exchange.
cache_root(CachePid) ->
    gen_server:call(CachePid, fetch_root, infinity).

-spec cache_leaves(pid(), list(integer())) -> list().
%% @doc
%% Return the leaves for the given branch IDs.
cache_leaves(CachePid, BranchIDs) ->
    gen_server:call(CachePid, {fetch_leaves, BranchIDs}, infinity).

-spec cache_markdirtysegments(pid(), list(integer()), string()) -> ok.
%% @doc
%% Mark segments as dirty because they are subject to a fetch_clocks fold.
%% If a segment is untouched until that fold completes, it may safely be
%% replaced with the value computed by the fold.
%%
%% The FoldGUID identifies the requesting fold and becomes the active_fold,
%% superseding any previous marking - only the most recent active fold may
%% replace dirty segments. This avoids races between overlapping markings
%% (and between updates that clear dirty segments).
cache_markdirtysegments(CachePid, SegmentIDs, FoldGUID) ->
    gen_server:cast(CachePid, {mark_dirtysegments, SegmentIDs, FoldGUID}).

-spec cache_replacedirtysegments(
    pid(),
    list({integer(), integer()}),
    string()
) -> ok.
%% @doc
%% On completion of a fold_clocks, replace those dirty segments which have
%% not been touched by other activity in the meantime.
cache_replacedirtysegments(CachePid, ReplacementSegments, FoldGUID) ->
    Msg = {replace_dirtysegments, ReplacementSegments, FoldGUID},
    gen_server:cast(CachePid, Msg).

-spec cache_startload(pid()) -> ok.
%% @doc
%% Put the cache into loading state: alongside maintaining the current tree,
%% queue every subsequent change.
%%
%% Eventually cache_completeload/2 should be called with a tree built from a
%% keystore fold snapshotted at this point, and the queued changes will be
%% replayed onto it.
cache_startload(CachePid) ->
    gen_server:cast(CachePid, start_load).

-spec cache_completeload(pid(), leveled_tictac:tictactree()) -> ok.
%% @doc
%% Install a tree produced by a fold of the KeyStore as the new cached tree
%% (queued changes are replayed onto it first).
cache_completeload(CachePid, LoadedTree) ->
    gen_server:cast(CachePid, {complete_load, LoadedTree}).

-spec cache_loglevel(pid(), aae_util:log_levels()) -> ok.
%% @doc
%% Change the log level at runtime.
cache_loglevel(CachePid, LogLevels) ->
    gen_server:cast(CachePid, {log_levels, LogLevels}).
%%%============================================================================
%%% gen_server callbacks
%%%============================================================================

%% @doc
%% Validate the start options, attempt to restore a saved tree from disk
%% (unless ignore_disk is set), and start hibernated with the resulting
%% state. Option validation is by matching - an unexpected value crashes
%% init rather than starting with a bad configuration.
init([Opts]) ->
    PartitionID =
        case aae_util:get_opt(partition_id, Opts) of
            {Idx, NVal} when is_integer(Idx), is_integer(NVal) ->
                {Idx, NVal};
            Idx when is_integer(Idx) ->
                Idx
        end,
    RootPath =
        case aae_util:get_opt(root_path, Opts) of
            RPOpt when is_list(RPOpt) ->
                aae_util:check_rootpath(RPOpt)
        end,
    IgnoreDisk =
        case aae_util:get_opt(ignore_disk, Opts, false) of
            IgnoreOpt when is_boolean(IgnoreOpt) ->
                IgnoreOpt
        end,
    case aae_util:get_opt(log_levels, Opts) of
        LLOpt when is_list(LLOpt) ->
            aae_util:set_loglevel(LLOpt);
        undefined ->
            ok
    end,
    CachePath = filename:join(RootPath, flatten_id(PartitionID)) ++ "/",
    % open_from_disk/1 must run even when its result is to be ignored, as
    % any files present on disk must still be cleared
    DiskResult = open_from_disk(CachePath),
    {StartTree, SaveSQN, IsRestored} =
        case DiskResult of
            {Tree, SQN} when Tree =/= none, not IgnoreDisk ->
                {Tree, SQN, true};
            _ ->
                {
                    leveled_tictac:new_tree(PartitionID, ?TREE_SIZE),
                    ?START_SQN,
                    false
                }
        end,
    ?STD_LOG(c0005, [IsRestored, PartitionID]),
    process_flag(trap_exit, true),
    InitialState =
        #state{
            save_sqn = SaveSQN,
            tree = StartTree,
            is_restored = IsRestored,
            root_path = CachePath,
            partition_id = PartitionID,
            % a restored (or deliberately fresh) tree is trustworthy enough
            % to be saved again on close
            safe_save = IsRestored orelse IgnoreDisk
        },
    {ok, InitialState, hibernate}.
handle_call(is_restored, _From, State) ->
    {reply, State#state.is_restored, State};
handle_call(fetch_root, _From, State = #state{tree = Tree}) when
    Tree =/= undefined
->
    {reply, leveled_tictac:fetch_root(State#state.tree), State};
handle_call({fetch_leaves, BranchIDs}, _From, State = #state{tree = Tree}) when
    Tree =/= undefined
->
    {reply, leveled_tictac:fetch_leaves(State#state.tree, BranchIDs), State};
handle_call(segment_count, _From, State = #state{dirty_segments = A}) ->
    {reply, length(A), State};
handle_call(close, _From, State) ->
    % Only persist on close when safe_save is set (tree restored from disk,
    % started fresh deliberately, or rebuilt via complete_load)
    case {State#state.safe_save, State#state.tree, State#state.root_path} of
        {true, Tree, RP} when Tree =/= undefined, RP =/= undefined ->
            save_to_disk(
                RP, State#state.save_sqn, Tree
            );
        _ ->
            ok
    end,
    {stop, normal, ok, State}.

%% alter: apply the hash change to the tree; while loading also queue the
%% change for replay, and clear the altered segment from dirty_segments
handle_cast(
    {alter, Key, CurrentHash, OldHash}, State = #state{change_queue = CQ}
) when is_list(CQ) ->
    {Tree0, Segment} =
        leveled_tictac:add_kv(
            State#state.tree,
            Key,
            {CurrentHash, OldHash},
            fun alterhash_fun/2,
            true
        ),
    State0 =
        case State#state.loading of
            true ->
                QCnt = State#state.queued_changes,
                State#state{
                    change_queue = [{Key, CurrentHash, OldHash} | CQ],
                    queued_changes = QCnt + 1
                };
            false ->
                State
        end,
    case State#state.dirty_segments of
        [] ->
            {noreply, State0#state{tree = Tree0}};
        DirtyList ->
            % The segment has been touched, so it may no longer be replaced
            % by the active fold's result
            DirtyList0 = lists:delete(Segment, DirtyList),
            {noreply, State0#state{tree = Tree0, dirty_segments = DirtyList0}}
    end;
handle_cast(start_load, State = #state{loading = Loading}) when
    Loading == false
->
    {noreply, State#state{
        loading = true,
        change_queue = [],
        queued_changes = 0,
        dirty_segments = [],
        active_fold = undefined
    }};
%% complete_load: replay the queued changes (oldest first, hence foldr over
%% the prepend-accumulated queue) onto the fold-built tree, and make it safe
%% to save
handle_cast({complete_load, Tree}, State = #state{loading = Loading}) when
    Loading == true
->
    LoadFun =
        fun({Key, CH, OH}, AccTree) ->
            leveled_tictac:add_kv(
                AccTree, Key, {CH, OH}, fun alterhash_fun/2
            )
        end,
    Tree0 = lists:foldr(LoadFun, Tree, State#state.change_queue),
    ?STD_LOG(c0008, [length(State#state.change_queue)]),
    {noreply,
        State#state{
            loading = false,
            change_queue = [],
            queued_changes = 0,
            tree = Tree0,
            safe_save = true
        },
        hibernate};
handle_cast({mark_dirtysegments, SegmentList, FoldGUID}, State) ->
    case State#state.loading of
        true ->
            % don't mess about with dirty segments, loading anyway
            {noreply, State};
        false ->
            {noreply, State#state{
                dirty_segments = SegmentList,
                active_fold = FoldGUID
            }}
    end;
%% replace_dirtysegments: only honoured when the FoldGUID matches the
%% currently active fold, and only for segments still marked dirty
handle_cast({replace_dirtysegments, SegmentMap, FoldGUID}, State) ->
    ChangeSegmentFoldFun =
        fun({SID, NewHash}, TreeAcc) ->
            case lists:member(SID, State#state.dirty_segments) of
                true ->
                    ?STD_LOG(c0006, [State#state.partition_id, SID, NewHash]),
                    leveled_tictac:alter_segment(SID, NewHash, TreeAcc);
                false ->
                    TreeAcc
            end
        end,
    case State#state.active_fold of
        FoldGUID ->
            UpdTree =
                lists:foldl(
                    ChangeSegmentFoldFun,
                    State#state.tree,
                    SegmentMap
                ),
            {noreply, State#state{tree = UpdTree}};
        _ ->
            {noreply, State}
    end;
handle_cast(destroy, State) ->
    ?STD_LOG(c0004, [State#state.partition_id]),
    {stop, normal, State};
handle_cast({log_levels, LogLevels}, State) ->
    ok = aae_util:set_loglevel(LogLevels),
    {noreply, State}.

%% NOTE(review): any unexpected message stops the cache (normal exit) rather
%% than being ignored - confirm this is intentional, as most gen_servers
%% would {noreply, State} here
handle_info(_Info, State) ->
    {stop, normal, State}.
%% @doc
%% Redact the (potentially large) change_queue from termination reports
format_status(Status) ->
    case maps:get(reason, Status, normal) of
        terminate ->
            State = maps:get(state, Status),
            maps:update(
                state,
                State#state{change_queue = redacted},
                Status
            );
        _ ->
            Status
    end.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%============================================================================
%%% Internal functions
%%%============================================================================

-spec flatten_id(partition_id()) -> list().
%% @doc
%% Flatten partition ID to make a folder name
flatten_id({Index, N}) ->
    integer_to_list(Index) ++ "_" ++ integer_to_list(N);
flatten_id(ID) ->
    integer_to_list(ID).

-spec save_to_disk(list(), integer(), leveled_tictac:tictactree()) -> ok.
%% @doc
%% Save the TreeCache to disk, with a checksum so that it can be
%% validated on read. Written first under ?PENDING_EXT then renamed to
%% the final name, so a crash mid-write never leaves a corrupt final file.
%% NOTE(review): the file payload construction is mangled in this copy
%% (`<>`) - it should be the CRC32 followed by the serialised tree
save_to_disk(RootPath, SaveSQN, TreeCache) ->
    Serialised = term_to_binary(leveled_tictac:export_tree(TreeCache)),
    CRC32 = erlang:crc32(Serialised),
    ok = filelib:ensure_dir(RootPath),
    PendingName = integer_to_list(SaveSQN) ++ ?PENDING_EXT,
    ?STD_LOG(c0003, [RootPath, PendingName]),
    ok = file:write_file(
        filename:join(RootPath, PendingName),
        <>,
        [raw]
    ),
    ok =
        file:rename(
            filename:join(RootPath, PendingName),
            form_cache_filename(RootPath, SaveSQN)
        ),
    ok.

-spec open_from_disk(list()) -> {leveled_tictac:tictactree() | none, integer()}.
%% @doc
%% Open most recently saved TicTac tree cache file on disk, deleting all
%% others both used and unused - to save an out of date tree from being used
%% following a subsequent crash
open_from_disk(RootPath) ->
    ok = filelib:ensure_dir(RootPath),
    {ok, Filenames} = file:list_dir(RootPath),
    % Collect the SQNs of final (.aae) files; delete pending (.pnd) files
    % outright; leave unrelated files alone
    FileFilterFun =
        fun(FN, FinalFiles) ->
            case filename:extension(FN) of
                ?PENDING_EXT ->
                    ?STD_LOG(c0001, [FN]),
                    ok = file:delete(filename:join(RootPath, FN)),
                    FinalFiles;
                ?FINAL_EXT ->
                    BaseFN =
                        filename:basename(filename:rootname(FN, ?FINAL_EXT)),
                    [list_to_integer(BaseFN) | FinalFiles];
                _ ->
                    FinalFiles
            end
        end,
    SQNList =
        lists:reverse(lists:sort(lists:foldl(FileFilterFun, [], Filenames))),
    case SQNList of
        [] ->
            {none, 1};
        [HeadSQN | Tail] ->
            % Delete all but the highest-SQN file, then consume (and delete)
            % that one; the returned SQN is the next to be used for saving
            DeleteFun =
                fun(SQN) ->
                    ok = file:delete(form_cache_filename(RootPath, SQN))
                end,
            lists:foreach(DeleteFun, Tail),
            FileToUse = form_cache_filename(RootPath, HeadSQN),
            case aae_util:safe_open(FileToUse) of
                {ok, STC} ->
                    ok = file:delete(FileToUse),
                    {
                        leveled_tictac:import_tree(binary_to_term(STC)),
                        HeadSQN + 1
                    };
                {error, Reason} ->
                    ?STD_LOG(c0002, [FileToUse, Reason]),
                    {none, 1}
            end
    end.

-spec form_cache_filename(list(), integer()) -> list().
%% @doc
%% Return the cache filename by combining the Root Path with the SQN
form_cache_filename(RootPath, SaveSQN) ->
    filename:join(RootPath, integer_to_list(SaveSQN) ++ ?FINAL_EXT).

-spec alterhash_fun(term(), term()) -> {binary(), {is_hash, integer()}}.
%% @doc
%% Function to calculate the hash change needed to make an alter into a
%% straight add, used as the BinExtractFun in leveled_tictac
alterhash_fun(Key, {CurrentHash, OldHash}) when
    is_binary(Key),
    is_integer(CurrentHash) orelse CurrentHash == none,
    is_integer(OldHash) orelse OldHash == none
->
    % TODO: Should move this function to leveled_tictac
    % - requires secret knowledge of implementation to perform
    % alter
    %
    % What we know about the addition of a value into a leveled_tictac tree is
    % that an addition is made by doing:
    % SegHash bxor (AltKeyHash bxor ClockHash)
    %
    % The ClockHash in this case is the output of this function. When an
    % alteration is being made the resulting Hash needs to still include the
    % AltKeyHash, so it is necessary to apply bxor AltKeyHash an odd number of
    % times. Hence an alteration or a null change must include the AltKeyHash
    % within the ClockHash
    UpdateHash =
        case {CurrentHash, OldHash} of
            {none, OldHash} when is_integer(OldHash) ->
                % Remove - treat like adding back in
                % the tictac will bxor this with the key - so don't need to
                % bxor this here again
                OldHash;
            {CurrentHash, none} when is_integer(CurrentHash) ->
                % Nothing to remove - straight add
                CurrentHash;
            {none, none} ->
                % This may be prompted in rehash.
                % In this case a neutral update is required (when bxor'd with
                % the key hash it should produce no change) - so return the
                % relevant hash of the key
                {_SegmentHash, AltKeyHash} =
                    leveled_tictac:keyto_doublesegment32(Key),
                AltKeyHash;
            {CurrentHash, OldHash} when
                is_integer(CurrentHash), is_integer(OldHash)
            ->
                % Alter - need to account for hashing with key
                % to remove the original
                {_SegmentHash, AltKeyHash} =
                    leveled_tictac:keyto_doublesegment32(Key),
                CurrentHash bxor (OldHash bxor AltKeyHash)
        end,
    {Key, {is_hash, UpdateHash}}.

%%%============================================================================
%%% Test
%%%============================================================================

-ifdef(TEST).

-include_lib("eunit/include/eunit.hrl").

%% Test helper: write two consecutive saved caches (SQNs 1 and 2) into
%% RootPath, returning the tree matching the most recent save
setup_savedcaches(RootPath) ->
    Tree0 = leveled_tictac:new_tree(test),
    Tree1 = leveled_tictac:add_kv(
        Tree0,
        {<<"K1">>},
        {<<"V1">>},
        fun({K}, {V}) -> {K, V} end
    ),
    Tree2 = leveled_tictac:add_kv(
        Tree1,
        {<<"K2">>},
        {<<"V2">>},
        fun({K}, {V}) -> {K, V} end
    ),
    ok = save_to_disk(RootPath, 1, Tree1),
    ok = save_to_disk(RootPath, 2, Tree2),
    Tree2.

clean_saveopen_test() ->
    % Check that pending files are ignored (and removed), and that the
    % highest SQN that is not pending is the one opened
    CachePath = "test/cache0/",
    aae_util:clean_subdir(CachePath),
    SavedTree = setup_savedcaches(CachePath),
    PendingFN = filename:join(CachePath, integer_to_list(3) ++ ?PENDING_EXT),
    ok = file:write_file(PendingFN, <<"delete">>),
    OtherFN = filename:join(CachePath, "alt.file"),
    ok = file:write_file(OtherFN, <<"no_delete">>),

    % Crash (via the unmatched case) rather than proceed if no tree opened
    {OpenedTree, NextSQN} =
        case open_from_disk(CachePath) of
            {T, SQN} when T =/= none ->
                {T, SQN}
        end,
    ?assertMatch(3, NextSQN),
    ?assertMatch([], leveled_tictac:find_dirtyleaves(SavedTree, OpenedTree)),
    % The opened file was deleted on open - a second open finds nothing
    ?assertMatch({none, 1}, open_from_disk(CachePath)),

    % Unrelated files survive; the pending file has been deleted
    ?assertMatch({ok, <<"no_delete">>}, file:read_file(OtherFN)),
    ?assertMatch({error, enoent}, file:read_file(PendingFN)),
    aae_util:clean_subdir(CachePath).

clear_old_cache_test() ->
    % Starting a new cache clears previously saved cache files; closing it
    % persists a single fresh file
    RootPath = "test/oldcache0/",
    PartitionID = 1,
    PartitionPath =
        filename:join(RootPath, integer_to_list(PartitionID)) ++ "/",
    aae_util:clean_subdir(PartitionPath),
    _Tree = setup_savedcaches(PartitionPath),
    {ok, BeforeFNs} = file:list_dir(PartitionPath),
    ?assertMatch(2, length(BeforeFNs)),
    {ok, Cache} = cache_new(RootPath, 1, undefined),
    {ok, DuringFNs} = file:list_dir(PartitionPath),
    ?assertMatch(0, length(DuringFNs)),
    ok = cache_close(Cache),
    {ok, AfterFNs} = file:list_dir(PartitionPath),
    ?assertMatch(1, length(AfterFNs)),
    aae_util:clean_subdir(RootPath).

dirty_saveopen_test() ->
    % A cache saved at close can be re-opened ({true, Pid}); if the saved
    % file has been removed, cache_open returns {false, Pid} - and such a
    % cache is not saved again at close until a load has completed
    RootPath = "test/dirtycache0/",
    aae_util:clean_subdir(RootPath),
    RP0 = filename:join(RootPath, integer_to_list(1)) ++ "/",
    {ok, Cpid0} = cache_new(RootPath, 1, undefined),
    Hash0 = erlang:phash2({<<"K1">>, <<"C1">>}),
    cache_alter(Cpid0, <<"K1">>, Hash0, none),
    ok = cache_close(Cpid0),
    % Close saves the cache at SQN 1
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 1))),
    {true, Cpid1} = cache_open(RootPath, 1, undefined),
    Hash1 = erlang:phash2({<<"K1">>, <<"C2">>}),
    cache_alter(Cpid1, <<"K1">>, Hash1, Hash0),
    ok = cache_close(Cpid1),
    % Re-saving advances the save SQN to 2
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 2))),
    aae_util:clean_subdir(RootPath),
    % Saved files removed - open reports false (no tree recovered)
    {false, Cpid2} = cache_open(RootPath, 1, undefined),
    Hash2 = erlang:phash2({<<"K1">>, <<"C3">>}),
    cache_alter(Cpid2, <<"K1">>, Hash2, Hash1),
    ok = cache_close(Cpid2),
    % No cache file is written at close when the open recovered nothing
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 1))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 2))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 3))),
    {false, Cpid3} = cache_open(RootPath, 1, undefined),
    Hash3 = erlang:phash2({<<"K1">>, <<"C4">>}),
    cache_alter(Cpid3, <<"K1">>, Hash3, Hash2),
    ok = cache_close(Cpid3),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 1))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 2))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 3))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 4))),
    {false, Cpid4} = cache_open(RootPath, 1, undefined),
    % Completing a load (startload -> completeload) makes the cache
    % saveable again - the save SQN restarts at 1
    cache_startload(Cpid4),
    cache_alter(Cpid4, <<"K1">>, Hash3, none),
    T0 = leveled_tictac:new_tree(raw, ?TREE_SIZE),
    cache_completeload(Cpid4, T0),
    ok = cache_close(Cpid4),
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 1))),
    {true, Cpid5} =
        cache_open(RootPath, 1, undefined),
    R0 = cache_root(Cpid5),
    % The reopened cache should differ from the empty load tree T0 in
    % exactly one branch - the one touched by the alter during the load
    [BranchID] =
        leveled_tictac:find_dirtysegments(R0, leveled_tictac:fetch_root(T0)),
    [{BranchID, Branch5}] = cache_leaves(Cpid5, [BranchID]),
    [{BranchID, Branch0}] = leveled_tictac:fetch_leaves(T0, [BranchID]),
    [SegmentID] =
        leveled_tictac:find_dirtysegments(Branch0, Branch5),
    % Each segment hash is a 32-bit integer within the leaf binary
    Pos = SegmentID * 4,
    <<_Pre:Pos/binary, HashToCheck:32/integer, _Post/binary>> = Branch5,
    {_SegmentHash, AltHash} = leveled_tictac:keyto_doublesegment32(<<"K1">>),
    % The stored segment is the clock hash bxor'd with the key hash - see
    % alterhash_fun
    ?assertMatch(Hash3, HashToCheck bxor AltHash),
    ok = cache_close(Cpid5),
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 2))),
    aae_util:clean_subdir(RootPath).

corrupt_save_test_() ->
    % Test generator - flipping every byte of the saved file can be slow,
    % so allow a 60s timeout
    {timeout, 60, fun corrupt_save_tester/0}.

corrupt_save_tester() ->
    % If any byte is corrupted on disk - then the result should be a failure
    % to open and the TreeCache reverting to empty
    RootPath = "test/cachecs/",
    aae_util:clean_subdir(RootPath),
    _Tree2 = setup_savedcaches(RootPath),
    BestFN = form_cache_filename(RootPath, 2),
    {ok, LatestCache} = file:read_file(BestFN),
    FlipByteFun =
        fun(Offset) ->
            aae_util:flip_byte(LatestCache, 1, Offset)
        end,
    BrokenCaches =
        lists:map(FlipByteFun, lists:seq(1, byte_size(LatestCache) - 1)),
    BrokenCacheCheckFun =
        fun(BrokenCache) ->
            ok = file:write_file(BestFN, BrokenCache),
            R = open_from_disk(RootPath),
            ?assertMatch({none, 1}, R)
        end,
    ok = lists:foreach(BrokenCacheCheckFun, BrokenCaches),
    aae_util:clean_subdir(RootPath).

format_status_test() ->
    % format_status should redact the change_queue when the reason is
    % terminate, but leave it intact for a normal status request
    RootPath = "test/formatstatus/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),
    {ok, C0} = cache_new(RootPath, PartitionID, undefined),
    % Dig the #state{} record out of the sys:get_status/1 response
    {status, _C0, {module, gen_server}, SItemL} = sys:get_status(C0),
    {data, [{"State", S}]} = lists:nth(3, lists:nth(5, SItemL)),
    ?assert(is_list(S#state.change_queue)),
    RedactedStatus = format_status(#{reason => terminate, state => S}),
    RST = maps:get(state, RedactedStatus),
    ?assertMatch(redacted, RST#state.change_queue),
    NormStatus = format_status(#{reason => normal, state => S}),
    NST = maps:get(state, NormStatus),
    ?assert(is_list(NST#state.change_queue)),
    ok = cache_destroy(C0).

simple_test() ->
    % Add, alter and remove keys through the cache process, then compare the
    % cache root to a tree built directly with the net result
    RootPath = "test/cache1/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),

    GenerateKeyFun = aae_util:test_key_generator(hash),

    InitialKeys = lists:map(GenerateKeyFun, lists:seq(1, 100)),
    AlternateKeys = lists:map(GenerateKeyFun, lists:seq(61, 80)),
    RemoveKeys = lists:map(GenerateKeyFun, lists:seq(81, 100)),

    {ok, AAECache0} = cache_new(RootPath, PartitionID, undefined),

    {AddFun, AlterFun, RemoveFun} = test_setup_funs(InitialKeys),

    lists:foreach(AddFun(AAECache0), InitialKeys),

    % Close and re-open so the alters are applied to a recovered cache
    ok = cache_close(AAECache0),

    {true, AAECache1} = cache_open(RootPath, PartitionID, undefined),

    lists:foreach(AlterFun(AAECache1), AlternateKeys),
    lists:foreach(RemoveFun(AAECache1), RemoveKeys),

    %% Now build the equivalent outside of the process
    %% Accounting up-front for the removals and the alterations
    KHL0 = lists:sublist(InitialKeys, 60) ++ AlternateKeys,
    DirectAddFun =
        fun({K, H}, TreeAcc) ->
            leveled_tictac:add_kv(
                TreeAcc,
                K,
                H,
                fun(Key, Value) ->
                    {Key, {is_hash, Value}}
                end
            )
        end,
    CompareTree =
        lists:foldl(
            DirectAddFun,
            leveled_tictac:new_tree(raw, ?TREE_SIZE),
            KHL0
        ),
    CompareRoot = leveled_tictac:fetch_root(CompareTree),
    Root = cache_root(AAECache1),
    ?assertMatch(Root, CompareRoot),

    ok = cache_destroy(AAECache1).

replace_test() ->
    % As simple_test, but with alters/removes queued during a load - the
    % cache is replaced by the load tree plus the queued changes
    RootPath = "test/cache1/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),
    GenerateKeyFun = aae_util:test_key_generator(hash),

    InitialKeys = lists:map(GenerateKeyFun, lists:seq(1, 100)),
    AlternateKeys = lists:map(GenerateKeyFun, lists:seq(61, 80)),
    RemoveKeys = lists:map(GenerateKeyFun, lists:seq(81, 100)),

    {ok, AAECache0} = cache_new(RootPath, PartitionID, undefined),

    {AddFun, AlterFun, RemoveFun} = test_setup_funs(InitialKeys),

    lists:foreach(AddFun(AAECache0), InitialKeys),
    ok = cache_startload(AAECache0),

    lists:foreach(AlterFun(AAECache0), AlternateKeys),
    lists:foreach(RemoveFun(AAECache0), RemoveKeys),

    %% Now build the equivalent outside of the process
    %% Accounting up-front for the removals and the alterations
    KHL0 = lists:sublist(InitialKeys, 60) ++ AlternateKeys,
    DirectAddFun =
        fun({K, H}, TreeAcc) ->
            leveled_tictac:add_kv(
                TreeAcc,
                K,
                H,
                fun(Key, Value) -> {Key, {is_hash, Value}} end
            )
        end,
    CompareTree =
        lists:foldl(
            DirectAddFun, leveled_tictac:new_tree(raw, ?TREE_SIZE), KHL0
        ),

    %% The load tree is a tree as would have been produced by a fold over a
    %% snapshot taken at the time all the initial keys added.
    %%
    %% If we now complete the load using this tree, the comparison should
    %% still match.  The cache should be replaced by one playing the stored
    %% alterations on the load tree.

    LoadTree =
        lists:foldl(
            DirectAddFun,
            leveled_tictac:new_tree(raw, ?TREE_SIZE),
            InitialKeys
        ),

    ok = cache_completeload(AAECache0, LoadTree),

    CompareRoot = leveled_tictac:fetch_root(CompareTree),
    Root = cache_root(AAECache0),
    ?assertMatch(Root, CompareRoot),

    cache_alter(AAECache0, <<"K_With0Hash">>, 0, none),
    % Key added with a Vclock that hashes to 0
    cache_alter(AAECache0, <<"K_With0Hash">>, (1 bsl 27) - 1, 0),
    % Key now has a Vclock that hashes to 2 ^ 27 -1 (the top of the hash range)
    CompareTree1 = DirectAddFun(
        {<<"K_With0Hash">>, (1 bsl 27) - 1}, CompareTree
    ),
    AlterRoot = cache_root(AAECache0),
    AlterComapreRoot = leveled_tictac:fetch_root(CompareTree1),
    % Altering a key which had a hash of 0 has the same impact as inserting from scratch
    ?assertMatch(AlterRoot, AlterComapreRoot),

    cache_alter(AAECache0, <<"K_With0Hash">>, none, (1 bsl 27) - 1),

    % Removing the key => as if it was never there
    NewRoot = cache_root(AAECache0),
    ?assertMatch(Root, NewRoot),

    % A none -> none alter (as in rehash) is neutral - the add, the neutral
    % change and the remove together leave the root unchanged
    cache_alter(AAECache0, <<"K_WithNeutralChange">>, 1, none),
    cache_alter(AAECache0, <<"K_WithNeutralChange">>, none, none),
    cache_alter(AAECache0, <<"K_WithNeutralChange">>, none, 1),

    UnchangedRoot = cache_root(AAECache0),
    ?assertMatch(Root, UnchangedRoot),

    ok = cache_destroy(AAECache0).

dirty_segment_test() ->
    % Segments marked dirty may only be replaced when the GUID matches, and
    % any alter or load between the mark and the replace voids the replace
    GetSegFun =
        fun(BinaryKey) ->
            % Derive the tree segment from the 48-bit key segment, as the
            % keystore does
            SegmentID = leveled_tictac:keyto_segment48(BinaryKey),
            aae_keystore:generate_treesegment(SegmentID)
        end,
    % Have clashes with keys of integer_to_binary/1 and integers -
    % [4241217,2576207,2363385]
    RootPath = "test/dirtysegment/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),

    {ok, AAECache0} = cache_new(RootPath, PartitionID, undefined),
    AddFun =
        fun(I) ->
            K = integer_to_binary(I),
            H = erlang:phash2(rand:uniform(100000)),
            cache_alter(AAECache0, K, H, none)
        end,

    % Populate a range that includes one of the clashing keys (2363385)
    lists:foreach(AddFun, lists:seq(2350000, 2380000)),

    K0 = integer_to_binary(2363385),
    K1 = integer_to_binary(2576207),
    K2 = integer_to_binary(4241217),
    S0 = GetSegFun(K0),
    S1 = GetSegFun(K1),
    S2 = GetSegFun(K2),
    % All three keys map to the same tree segment
    ?assertMatch(true, S0 == S1),
    ?assertMatch(true, S0 == S2),
    % The segment splits into a branch (high bits) and leaf (low byte)
    BranchID = S0 bsr 8,
    LeafID = S0 band 255,

    Leaf0 = get_leaf(AAECache0, BranchID, LeafID),

    ?assertMatch(false, Leaf0 == 0),

    H1 = erlang:phash2(rand:uniform(100000)),
    H2 = erlang:phash2(rand:uniform(100000)),
    {_HK1, TTH1} = leveled_tictac:tictac_hash(K1, {is_hash, H1}),
    {_HK2, TTH2} = leveled_tictac:tictac_hash(K2, {is_hash, H2}),

    cache_alter(AAECache0, K1, H1, none),

    % An add changes the leaf by bxor of the tictac hash
    Leaf1 = get_leaf(AAECache0, BranchID, LeafID),
    ?assertMatch(Leaf1, Leaf0 bxor TTH1),

    GUID0 = leveled_util:generate_uuid(),
    NOTGUID = "NOT GUID",

    cache_markdirtysegments(AAECache0, [S0], GUID0),
    % Replace with wrong GUID ignored
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], NOTGUID),
    ?assertMatch(Leaf1, get_leaf(AAECache0, BranchID, LeafID)),

    % Replace with right GUID succeeds
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], GUID0),
    ?assertMatch(Leaf0, get_leaf(AAECache0, BranchID, LeafID)),

    GUID1 = leveled_util:generate_uuid(),
    cache_markdirtysegments(AAECache0, [S0], GUID1),
    cache_alter(AAECache0, K2, H2, none),
    Leaf2 = get_leaf(AAECache0, BranchID, LeafID),
    ?assertMatch(Leaf2, Leaf0 bxor TTH2),
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], GUID1),
    % Replace has been ignored due to update - so still Leaf2
    ?assertMatch(Leaf2, get_leaf(AAECache0, BranchID, LeafID)),

    GUID2 = leveled_util:generate_uuid(),
    cache_markdirtysegments(AAECache0, [S0], GUID2),
    cache_startload(AAECache0),
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], GUID2),
    % Replace has been ignored due to load - so still Leaf2
    ?assertMatch(Leaf2, get_leaf(AAECache0, BranchID, LeafID)),

    ok = cache_destroy(AAECache0).

% Extract the 32-bit hash for a single leaf from the given branch of the
% cache's tree
get_leaf(AAECache0, BranchID, LeafID) ->
    [{BranchID, LeafBin}] = cache_leaves(AAECache0, [BranchID]),
    LeafStartPos = LeafID * 4,
    <<_Pre:LeafStartPos/binary, Leaf:32/integer, _Rest/binary>> = LeafBin,
    Leaf.

% Exercise callbacks not otherwise reached by the tests, for coverage
coverage_cheat_test() ->
    {ok, _State1} = code_change(null, #state{}, null),
    {stop, normal, _State2} = handle_info({'EXIT', self(), "Test"}, #state{}).

% Return {AddFun, AlterFun, RemoveFun} closures over a cache pid; the alter
% and remove funs look up the old hash from InitialKeys
test_setup_funs(InitialKeys) ->
    AddFun =
        fun(CachePid) ->
            fun({K, H}) ->
                cache_alter(CachePid, K, H, none)
            end
        end,
    AlterFun =
        fun(CachePid) ->
            fun({K, H}) ->
                {K, OH} = lists:keyfind(K, 1, InitialKeys),
                cache_alter(CachePid, K, H, OH)
            end
        end,
    RemoveFun =
        fun(CachePid) ->
            fun({K, _H}) ->
                {K, OH} = lists:keyfind(K, 1, InitialKeys),
                cache_alter(CachePid, K, none, OH)
            end
        end,
    {AddFun, AlterFun, RemoveFun}.

-endif.