├── rebar3 ├── .gitignore ├── src ├── kv_index_tictactree.app.src ├── aae_runner.erl ├── aae_util.erl └── aae_treecache.erl ├── test ├── end_to_end │ ├── testutil.hrl │ ├── fold_SUITE.erl │ ├── testutil.erl │ ├── basic_SUITE.erl │ └── mock_kv_vnode.erl ├── timeout_test.erl └── property │ └── aae_eqc.erl ├── .github └── workflows │ └── erlang.yml ├── rebar.config ├── include └── aae.hrl ├── docs ├── RIAK_2_AAE.md ├── SEGMENT_FILTERED_SST.md ├── TICTAC.md ├── GENERAL_TICTACAAE_FOR_RIAK.md ├── RIAK_3_AAE.md └── DESIGN.md ├── README.md └── LICENSE /rebar3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/martinsumner/kv_index_tictactree/HEAD/rebar3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .eunit 2 | deps 3 | *.o 4 | *.beam 5 | *.plt 6 | erl_crash.dump 7 | ebin/*.beam 8 | rel/example_project 9 | .concrete/DEV_MODE 10 | .rebar 11 | .DS_Store 12 | _build/* 13 | rebar.lock 14 | aae_data/* 15 | -------------------------------------------------------------------------------- /src/kv_index_tictactree.app.src: -------------------------------------------------------------------------------- 1 | {application, kv_index_tictactree, [ 2 | {description, "AAE helper service for KV vnode"}, 3 | {vsn, git}, 4 | {registered, []}, 5 | {applications, [ 6 | kernel, 7 | stdlib, 8 | leveled 9 | ]}, 10 | {maintainers, ["Martin Sumner"]}, 11 | {licenses, ["Apache"]}, 12 | {links, [{"Github", "https://github.com/martinsumner/kv_index_tictactree"}]}, 13 | {env, [{root_path, "test"}]} 14 | ]}. 15 | -------------------------------------------------------------------------------- /test/end_to_end/testutil.hrl: -------------------------------------------------------------------------------- 1 | 2 | -record(r_content, { 3 | metadata, 4 | value :: term() 5 | }). 
6 | 7 | -record(r_object, { 8 | bucket, 9 | key, 10 | contents :: [#r_content{}], 11 | vclock = [], 12 | updatemetadata=dict:store(clean, true, dict:new()), 13 | updatevalue :: term()}). 14 | 15 | 16 | -define(BUCKET_TYPE, <<"BucketType">>). -------------------------------------------------------------------------------- /.github/workflows/erlang.yml: -------------------------------------------------------------------------------- 1 | name: Erlang CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - develop-3.4 7 | pull_request: 8 | branches: 9 | - develop-3.4 10 | 11 | jobs: 12 | 13 | build: 14 | 15 | name: Test on ${{ matrix.os }} with OTP ${{ matrix.otp }} 16 | runs-on: ${{ matrix.os }} 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | otp: [24, 26] 22 | os: [ubuntu-latest] 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Install Erlang/OTP 27 | uses: erlef/setup-beam@v1 28 | with: 29 | otp-version: ${{ matrix.otp }} 30 | - name: Compile 31 | run: ./rebar3 compile 32 | - name: Check format 33 | run: ./rebar3 fmt --check 34 | - name: Run tests 35 | run: ./rebar3 do xref, dialyzer, eunit, ct 36 | -------------------------------------------------------------------------------- /test/timeout_test.erl: -------------------------------------------------------------------------------- 1 | -module(timeout_test). 2 | -behaviour(gen_server). 3 | -include_lib("eunit/include/eunit.hrl"). 4 | 5 | -export([start_link/0, stop/0]). 6 | -export([ 7 | init/1, 8 | handle_call/3, 9 | handle_cast/2, 10 | handle_info/2, 11 | code_change/3, 12 | terminate/2 13 | ]). 14 | 15 | start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 16 | stop() -> gen_server:call(?MODULE, stop). 17 | 18 | init([]) -> {ok, []}. 19 | 20 | handle_call({test}, _From, State) -> {reply, timer:sleep(1000), State}; 21 | handle_call(stop, _From, State) -> {stop, normal, ok, State}. 22 | 23 | handle_cast(_Request, State) -> {noreply, State}. 
24 | handle_info(_, State) -> {noreply, State}. 25 | code_change(_Old, State, _Extra) -> {ok, State}. 26 | terminate(_Reason, _State) -> ok. 27 | 28 | -ifdef(TEST). 29 | 30 | wait_on_sync_test() -> 31 | {ok, P} = start_link(), 32 | ?assertMatch( 33 | timeout, 34 | aae_controller:wait_on_sync(gen_server, call, P, {test}, 100) 35 | ), 36 | ?assertMatch( 37 | ok, 38 | aae_controller:wait_on_sync(gen_server, call, P, {test}, 2000) 39 | ), 40 | stop(). 41 | 42 | -endif. 43 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {minimum_otp_vsn, "22.3"}. 2 | 3 | {erl_opts, [warnings_as_errors]}. 4 | 5 | {cover_excl_mods, [ 6 | testutil, 7 | basic_SUITE, 8 | fold_SUITE, 9 | mockvnode_SUITE, 10 | mock_kv_vnode 11 | ]}. 12 | 13 | {erlfmt, [ 14 | write, 15 | {print_width, 80}, 16 | {files, [ 17 | "{src,include}/*.{hrl,erl,app.src}", 18 | "test/end_to_end/*.erl", 19 | "rebar.config" 20 | ]}, 21 | {exclude_files, ["src/erlfmt_parse.erl"]} 22 | ]}. 23 | 24 | {project_plugins, [ 25 | {eqwalizer_rebar3, 26 | {git_subdir, "https://github.com/OpenRiak/eqwalizer.git", 27 | {branch, "openriak-3.4"}, "eqwalizer_rebar3"}}, 28 | {erlfmt, {git, "https://github.com/OpenRiak/erlfmt.git", {branch, "main"}}} 29 | ]}. 30 | 31 | {profiles, [ 32 | {eqc, [ 33 | {deps, [meck, fqc]}, 34 | {erl_opts, [debug_info, {d, 'EQC'}]}, 35 | {extra_src_dirs, ["test/end_to_end", "test/property"]}, 36 | {plugins, [rebar_eqc]} 37 | ]}, 38 | {test, [{extra_src_dirs, ["test/end_to_end", "test/property"]}]} 39 | ]}. 40 | 41 | {xref_checks, [undefined_function_calls, undefined_functions, locals_not_used]}. 42 | 43 | {ct_opts, [{dir, ["test/end_to_end"]}]}. 
44 | 45 | {deps, [ 46 | {leveled, 47 | {git, "https://github.com/martinsumner/leveled", 48 | {branch, "develop-3.4"}}}, 49 | {eqwalizer_support, 50 | {git_subdir, "https://github.com/OpenRiak/eqwalizer.git", 51 | {branch, "openriak-3.4"}, "eqwalizer_support"}} 52 | ]}. 53 | -------------------------------------------------------------------------------- /include/aae.hrl: -------------------------------------------------------------------------------- 1 | %%%============================================================================ 2 | %%% Non-configurable defaults 3 | %%%============================================================================ 4 | 5 | -define(TREE_SIZE, large). 6 | -define(MAGIC, 53). 7 | 8 | %%%============================================================================ 9 | %%% Tags 10 | %%%============================================================================ 11 | -define(HEAD_TAG, h). 12 | -define(RIAK_TAG, o_rkv). 13 | 14 | -if(?OTP_RELEASE < 26). 15 | -type dynamic() :: any(). 16 | -endif. 17 | 18 | %%%============================================================================ 19 | %%% Helper Functions 20 | %%%============================================================================ 21 | 22 | -define(LOG_LOCATION, #{ 23 | mfa => {?MODULE, ?FUNCTION_NAME, ?FUNCTION_ARITY}, 24 | line => ?LINE, 25 | file => ?FILE 26 | }). 27 | 28 | -define(STD_LOG(LogRef, Subs), 29 | ?STD_LOG_INT( 30 | element(1, aae_util:get_log(LogRef)), 31 | LogRef, 32 | Subs, 33 | leveled_log:get_opts() 34 | ) 35 | ). 36 | 37 | -define(STD_LOG_INT(LogLevel, LogRef, Subs, LogOpts), 38 | case 39 | logger:allow(LogLevel, ?MODULE) andalso 40 | leveled_log:should_i_log(LogLevel, LogRef, LogOpts) 41 | of 42 | true -> 43 | erlang:apply( 44 | logger, 45 | macro_log, 46 | [ 47 | ?LOG_LOCATION 48 | | aae_util:log(LogLevel, LogRef, LogOpts, Subs) 49 | ] 50 | ); 51 | false -> 52 | ok 53 | end 54 | ). 
55 | -------------------------------------------------------------------------------- /src/aae_runner.erl: -------------------------------------------------------------------------------- 1 | %% -------- Overview --------- 2 | %% 3 | %% Runner used for fetch_clock queries on this AAE vnode 4 | 5 | -module(aae_runner). 6 | 7 | -behaviour(gen_server). 8 | 9 | -include("aae.hrl"). 10 | 11 | -export([ 12 | init/1, 13 | handle_call/3, 14 | handle_cast/2, 15 | handle_info/2, 16 | terminate/2, 17 | code_change/3 18 | ]). 19 | 20 | -export([ 21 | runner_start/1, 22 | runner_work/2, 23 | runner_stop/1 24 | ]). 25 | 26 | -record(state, { 27 | result_size = 0 :: integer(), 28 | query_count = 0 :: integer(), 29 | query_time = 0 :: integer(), 30 | aae_controller :: pid() | undefined 31 | }). 32 | 33 | -define(LOG_FREQUENCY, 10). 34 | 35 | -define(PROMPT_MILLISECONDS, 2000). 36 | 37 | %%%============================================================================ 38 | %%% API 39 | %%%============================================================================ 40 | 41 | -spec runner_start(aae_util:log_levels()) -> {ok, pid()}. 42 | %% @doc 43 | %% Start an AAE runner to manage folds 44 | runner_start(LogLevels) -> 45 | {ok, Pid} = gen_server:start_link(?MODULE, [LogLevels, self()], []), 46 | {ok, Pid}. 47 | 48 | -spec runner_work(pid(), aae_controller:runner_work() | queue_empty) -> ok. 49 | %% @doc 50 | %% Be cast some work 51 | runner_work(Runner, Work) -> 52 | gen_server:cast(Runner, Work). 53 | 54 | -spec runner_stop(pid()) -> ok. 55 | %% @doc 56 | %% Close the runner 57 | runner_stop(Runner) -> 58 | gen_server:call(Runner, close, 30000). 
59 | 60 | %%%============================================================================ 61 | %%% gen_server callbacks 62 | %%%============================================================================ 63 | 64 | init([LogLevels, Controller]) -> 65 | ok = aae_util:set_loglevel(LogLevels), 66 | {ok, #state{aae_controller = Controller}, ?PROMPT_MILLISECONDS}. 67 | 68 | handle_call(close, _From, State) -> 69 | {stop, normal, ok, State}. 70 | 71 | handle_cast(queue_empty, State) -> 72 | {noreply, State, ?PROMPT_MILLISECONDS}; 73 | handle_cast({work, Folder, ReturnFun, SizeFun}, State) -> 74 | SW = os:timestamp(), 75 | State0 = 76 | try Folder() of 77 | query_backlog -> 78 | ?STD_LOG(r0002, []), 79 | ReturnFun({error, query_backlog}), 80 | State; 81 | Results -> 82 | QueryTime = timer:now_diff(os:timestamp(), SW), 83 | ?STD_LOG(r0003, [QueryTime]), 84 | RS0 = State#state.result_size + SizeFun(Results), 85 | QT0 = State#state.query_time + QueryTime, 86 | QC0 = State#state.query_count + 1, 87 | {RS1, QT1, QC1} = 88 | maybe_log( 89 | RS0, 90 | QT0, 91 | QC0, 92 | ?LOG_FREQUENCY 93 | ), 94 | 95 | ReturnFun(Results), 96 | 97 | State#state{ 98 | result_size = RS1, 99 | query_time = QT1, 100 | query_count = QC1 101 | } 102 | catch 103 | Error:Pattern -> 104 | ?STD_LOG(r0005, [Error, Pattern]), 105 | ReturnFun({error, Error}), 106 | State 107 | end, 108 | {noreply, State0, 0}. 109 | 110 | handle_info(timeout, State = #state{aae_controller = C}) when C =/= undefined -> 111 | ?STD_LOG(r0004, []), 112 | ok = aae_controller:aae_runnerprompt(C), 113 | {noreply, State}. 114 | 115 | terminate(_Reason, State) -> 116 | _ = maybe_log( 117 | State#state.result_size, 118 | State#state.query_time, 119 | State#state.query_count, 120 | 1 121 | ), 122 | ok. 123 | 124 | code_change(_OldVsn, State, _Extra) -> 125 | {ok, State}. 
126 | 127 | %%%============================================================================ 128 | %%% Internal functions 129 | %%%============================================================================ 130 | 131 | maybe_log(RS_Acc, QT_Acc, QC_Acc, LogFreq) when QC_Acc < LogFreq -> 132 | {RS_Acc, QT_Acc, QC_Acc}; 133 | maybe_log(RS_Acc, QT_Acc, QC_Acc, _LogFreq) -> 134 | ?STD_LOG(r0001, [RS_Acc, QT_Acc, QC_Acc]), 135 | {0, 0, 0}. 136 | 137 | %%%============================================================================ 138 | %%% Test 139 | %%%============================================================================ 140 | 141 | -ifdef(TEST). 142 | 143 | -include_lib("eunit/include/eunit.hrl"). 144 | 145 | runner_fail_test() -> 146 | {ok, R} = runner_start(undefined), 147 | TestProcess = self(), 148 | CheckFun = 149 | fun(ReturnTuple) -> 150 | ?assertMatch(error, element(1, ReturnTuple)), 151 | TestProcess ! error 152 | end, 153 | ReturnFun = aae_controller:generate_returnfun("ABCD", CheckFun), 154 | FoldFun = fun() -> throw(noproc) end, 155 | SizeFun = fun(_Results) -> 0 end, 156 | runner_work(R, {work, FoldFun, ReturnFun, SizeFun}), 157 | error = start_receiver(), 158 | ok = runner_stop(R). 159 | 160 | start_receiver() -> 161 | receive 162 | error -> 163 | error 164 | end. 165 | 166 | coverage_cheat_test() -> 167 | {ok, _State1} = code_change(null, #state{}, null). 168 | 169 | -endif. 170 | -------------------------------------------------------------------------------- /docs/RIAK_2_AAE.md: -------------------------------------------------------------------------------- 1 | # AAE Implementation in Riak 2.2.5 2 | 3 | - Each riak vnode has a kv_index_hashtree process if AAE is enabled. 4 | 5 | - That kv_index_hashtree process keeps a single key store (that contains all the keys and hashes of all the keys and values within that vnode store) - so it is a parallel key store duplicating the data (but with just hashes and not the whole object). 
6 | 7 | - The AAE key store is ordered by Segment ID - with Segment ID being the hash of the Key representing the Key's location in the Merkle Tree. 8 | 9 | - So if the object key is {Bucket1, Key1}, the Key in the AAE store is something like (hash{Bucket1, Key1}, Bucket1, Key1) - and the value is the hash for that object. 10 | 11 | - "the hash for that object" used to mean the hash of the whole object, but now it is just the hash of the version vector. 12 | 13 | - As well as the parallel key store, the kv_index_hashtree process keeps a merkle tree for each IndexN that the vnode supports. 14 | 15 | - IndexN is a reference to a combination of a n-val and a preflist supported by the vnode. If all buckets in the store are n-val 3, then there will be 3 IndexNs per vnode (and hence 3 merkle trees). If there are some buckets with an n-val of 3 and some with an n-val of 4 - there will be 7 merkle trees. 16 | 17 | - Each merkle tree also has an associated list of "Dirty Segments" - with a dirty segment meaning a SegmentId within the tree which has had a change since it was last calculated, and so shouldn't be trusted any more. 18 | 19 | - When a vnode receives a PUT request, it passes the new key, the new value, and the IndexN to the kv_index_hashtree process after the vnode has been updated. 20 | the kv_index_hashtree process hashes the value (actually the version vector), and hashes the Key to get the Segment ID; and queues up an insert into the Key store that represents this new Key and Hash. 21 | 22 | - The hashtree process then marks the SegmentID as being dirty for the Merkle tree whose IndexN matches the update. 23 | 24 | - The Riak cluster has a kv_entropy_manager process, and this will determine what vnodes have common IndexNs with what other vnodes - and it will schedule exchanges to take place between these vnodes to compare the merkle trees for their common IndexNs. 
25 | 26 | - When an exchange request is received by the kv_index_hashtree process, it first must update the Merkle tree for that IndexN. 27 | 28 | - To do that it looks at the list of dirty segments, and for each dirty segment it fold over all keys in that segment in its local AAE keystore to get a list of [{K, H}] for that SegmentID and IndexN. 29 | 30 | - It then does a sha hash on that list - and that represents the new hash value for that segment in the tree. Once all leaves have been updated, it works up the tree recalculating branch and root hashes as necessary. 31 | 32 | - the AAE processes then exchange and compare the trees - to find a list of segment IDs that differ between the vnodes for that IndexN. Hopefully the list is small. 33 | 34 | - Now for each segmentID in this list of mismatched segments it has to fold over the key store again to find all the [{K, H}] for those SegmentIds and the relevant IndexN - and this is then compared with exchange vnodes. 35 | 36 | - If any keys are found to have different hashes, then read repair is invoked for those keys. This essential is managed just by doing a normal GET request, with a subtle difference. 37 | 38 | - If the difference in hashes is because of a real difference between the values in the vnodes, then read_repair should fix it ... however if no difference is found, it prompts a rehash at each riak_kv_vnode for that Key. 39 | 40 | - The rehash just takes the current value for the key out of the vnode backend and passes it as if it were a new PUT to the kv_index_hashtree process. 41 | 42 | - This means that if there are discrepancies between the kv_index_hashtree key store and the vnode store (normally due to uncontrolled shutdown), they don't keep prompting read repairs over and over again - the rehash should fix the kv_index_hashtree store. 
43 | 44 | - The anti-entropy process described so far fixes differences, when an update is never received by the vnode, but doesn't handle the situation where an update is received, but subsequently lost by a vnode (e.g. disk corruption). 45 | 46 | - To protect against disk-based loss, the AAE keystore is periodically (normally once per month) rebuilt from the actual vnode store. This is a carefully scheduled and throttled process, as it requires a full vnode scan to complete, with page cache side effects etc. 47 | 48 | - For the period of the rebuild, which can be many hours, AAE exchanges stop for that vnode (but continue for other vnodes). 49 | 50 | This covers off most of what happens for intra-cluster AAE. The same mechanism can also be used for inter-cluster AAE - but the two clusters have to be partitioned identically. 51 | 52 | - This is used for some Riak <-> Riak DC synchronisation (the alternative is a key-listing comparison), and also I think for the riak <-> solr synchronisation 53 | the current Riak <-> Riak one is mangled though, and has all kinds of issues (although not ones that are too hard to resolve or workaround) 54 | 55 | - There are some issues with this setup: 56 | 57 | - there is an overhead of running a parallel keystore (which is necessary due to the need to frequently find things by segment) 58 | 59 | - every distinct key updated ultimately leeds to a range query in the keystore (because of dirty segments) - this has an overhead 60 | 61 | - it is possible for the rebuild to cause a "one slow node" scenario 62 | 63 | - although it is much, much more efficient than key-listing for full-sync multi-DC replication - it requires the n-vals and ring-sizes to be consistent (which is not helpful when trying to find a way to safely change your ring size, or trying to find an efficient backup process) 64 | 65 | Those are the issues scheduled to be addressed in Riak 3.0 66 | 
-------------------------------------------------------------------------------- /docs/SEGMENT_FILTERED_SST.md: -------------------------------------------------------------------------------- 1 | # Segment filtering in LSM trees 2 | 3 | In data stores designed around Log Structured Merge Trees, data it stored in a tree of sorted files (SST files). To find a key in the tree, the SST file whose key range 4 | covers the key is checked starting at the top level, and working down until the first instance of the key is found. 5 | 6 | It is important to read performance that SST files that don't contain the key can provide a negative response in an efficient manner, so that the level can be skipped through without delay. In order to achieve this, some form of [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) is generally used. 7 | 8 | Within the [leveled LSM tree](https://github.com/martinsumner/leveled/tree/master/src), there has been attempt to align these filters with the hashing to a position in a (Tictac) Merkle tree - to allow for the same index to be used to both accelerate fetch misses, but also to skip blocks within an SST when scanning an LSM tree in key order to find a subset of keys associated with particular leaves within a Merkle Tree that represents the data in the store. 9 | 10 | There are two methods which have been investigated for implement this capability: 11 | 12 | - A simple slot-based segment-index (the actual method currently implemented in Leveled); 13 | - A potentially more efficient rice-encoded filter. 14 | 15 | The idea for both is that a single hash function is used (rather than multiple hash functions as in a bloom filter), and that hash function produces the position in the Merkle tree. The same filter can then be used to check for presence, of an individual keys, or for multiple keys located in a subset of segment IDs. 16 | 17 | ## Slot-based Segment Index 18 | 19 | Within Leveled, two bloom filters are used. 
Each file has its own multi-hash bloom filter, which is designed to be small with a relatively high false positive rate. This is used as an initial filter to prevent lookups to a file. 20 | 21 | Once this has been passed, the SST file is divided into compressed blocks of (24 to 28 keys and values), each set of five blocks is held within a slot, and the SST file maintains a mapping of key ranges to slots. A slot can contain up to 128 Keys/Values, some of which may be non-lookup keys (e.g. index entries) which don't require creation of a bloom entry as they will never be directly access outside of folds. 22 | 23 | The second level of bloom filter, the simple segment index, is kept in-memory for each slot. The segment index is built from either 2-byte positions, or 1-byte gaps. 24 | 25 | 2-byte positions are of the form `<<1:1/integer, SegID:15/integer>>`, where SegID is 15 bits of the Merkle tree segment ID for the key. 26 | 27 | 1-byte gaps are of the form `<<0:1/integer, Gap:7/integer>>`, where Gap is the count of entries in the slot between the last indexed entry and this which have no index entry. 28 | 29 | This is less efficient than a bloom filter, as for the size the false positive rate is 1:256 as opposed to 1:2180 for an optimised bloom of equivalent size. However, it yields additional information, notably the block or blocks which contain the Key, and the position of the Key within the block. 30 | 31 | ## Rice-encoded Segment Filter 32 | 33 | A Rice-encoded filter is a way of providing improved memory efficiency compared to the Slot-based Segment Index. Rice-encoding filter is a bloom filter based on a single hash function (as above), but now the filter is packed using [rice encoding](https://en.wikipedia.org/wiki/Golomb_coding), which encodes the bloom an array of deltas based on the assumption of roughly equal spacing between deltas - which should be an expected outcome of using a 'good' hash function. 
34 | 35 | So if there are 15-bits to the hash in the bloom, and 128 keys in the bloom, then it can be assumed that the deltas are around 256 numbers apart, and so an 8-bit remainder is used, and: 36 | 37 | - A delta of 255 would be represented as 0 1111 1111; 38 | - A delta of 257 would be represented as 10 0000 0001; 39 | - A delta of 1000 would be represented as 1110 1101 1000. 40 | 41 | So the approximate overall size of the filter will be around 10 bits per key, for the same false positive rate as using a 16-bit per key Segment Index. 42 | 43 | This no longer reveals the position of the entry in the slot - but, most of the value in revealing the position comes from knowing just the block identifier (as the difference in small blocks between lists:nth/2 and lists:keyfind/3 is expected to be marginal). 44 | 45 | The block knowledge problem could be resolved by bit-shifting the segment ID and adding a 3-bit position ID to the segmentID before encoding, and this would allow for either a 3-bit reduction in size, or a 8x improvement in false positive rate. 46 | 47 | 48 | ## Efficiency of Segment-filtering 49 | 50 | In the slot-based segment index there is a false-positive rate per slot of 1:256. With the same size Rice-encoded filter this can be improved to 1:2048 if the exact position is dropped from the requirement, and the same size filter is used. 51 | 52 | If there is a LSM tree containing 10M keys, than across the tree there will be approximately 90K slots, and 450K blocks. 53 | 54 | If we want to scan for 32 different segment IDs - there will be around 600 blocks which need to be opened to find all the keys within those segments. Using a slot-based segment index around 90% of the blocks will be skipped, but 100 times more blocks will be opened than is optimal. Using the 4-bits of improvement in the rice encoded example will lead to an improvement greater than one order of magnitude - with > 98% skipped, and only around 6K blocks unnecessarily being opened. 
55 | 56 | However, this assumes that the Segment IDs we're looking for are evenly distributed. In the current implementation when we look for a subset of SegmentIDs, it will look for the subset SegmentIDs that are numerically closest. 57 | 58 | In these filters we use only 15 or 19-bits of the SegmentID. If we chose those bits by performing a bsr operation on the 20-bit SegmentID, we can gain further efficiency by finding sets of SegmentIDs that overlap to the common SegmentIDs within filters. 59 | -------------------------------------------------------------------------------- /test/end_to_end/fold_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(fold_SUITE). 2 | -include_lib("common_test/include/ct.hrl"). 3 | -export([all/0, init_per_suite/1, end_per_suite/1]). 4 | -export([ 5 | aae_fold_keyorder/1, 6 | aae_fold_segmentorder/1 7 | ]). 8 | 9 | all() -> 10 | [ 11 | aae_fold_keyorder, 12 | aae_fold_segmentorder 13 | ]. 14 | 15 | init_per_suite(Config) -> 16 | testutil:init_per_suite([{suite, "fold"} | Config]), 17 | Config. 18 | 19 | end_per_suite(Config) -> 20 | testutil:end_per_suite(Config). 21 | 22 | aae_fold_keyorder(_Config) -> 23 | aae_fold_tester(leveled_ko, 50000). 24 | 25 | aae_fold_segmentorder(_Config) -> 26 | aae_fold_tester(leveled_so, 50000). 
27 | 28 | aae_fold_tester(ParallelStoreType, KeyCount) -> 29 | RootPath = testutil:reset_filestructure(), 30 | FoldPath1 = filename:join(RootPath, "folder1/"), 31 | SplitF = 32 | fun(X) -> 33 | T = binary_to_term(X), 34 | {rand:uniform(1000), 1, 0, element(1, T), element(2, T)} 35 | end, 36 | 37 | {ok, Cntrl1} = 38 | aae_controller:aae_start( 39 | {parallel, ParallelStoreType}, 40 | true, 41 | {1, 300}, 42 | [{2, 0}, {2, 1}], 43 | FoldPath1, 44 | SplitF 45 | ), 46 | 47 | BKVListXS = testutil:gen_keys([], KeyCount), 48 | 49 | {SWLowMegaS, SWLowS, _SWLowMicroS} = os:timestamp(), 50 | timer:sleep(1000), 51 | ok = testutil:put_keys(Cntrl1, 2, BKVListXS, none), 52 | timer:sleep(1000), 53 | {SWHighMegaS, SWHighS, _SWHighMicroS} = os:timestamp(), 54 | BucketList = [integer_to_binary(1), integer_to_binary(3)], 55 | FoldElements = [{clock, null}, {md, null}], 56 | FoldFun = 57 | fun(B, _K, ElementList, {B1Count, B3Count}) -> 58 | {clock, FoldClock} = lists:keyfind(clock, 1, ElementList), 59 | {md, FoldMD} = lists:keyfind(md, 1, ElementList), 60 | case binary_to_term(FoldMD) of 61 | [{clock, FoldClock}] -> 62 | case B of 63 | <<"1">> -> 64 | {B1Count + 1, B3Count}; 65 | <<"3">> -> 66 | {B1Count, B3Count + 1} 67 | end 68 | end 69 | end, 70 | InitAcc = {0, 0}, 71 | {async, Runner1} = 72 | aae_controller:aae_fold( 73 | Cntrl1, 74 | {buckets, BucketList}, 75 | all, 76 | all, 77 | false, 78 | FoldFun, 79 | InitAcc, 80 | FoldElements 81 | ), 82 | true = {KeyCount div 5, KeyCount div 5} == Runner1(), 83 | 84 | {async, Runner2} = 85 | aae_controller:aae_fold( 86 | Cntrl1, 87 | {buckets, BucketList}, 88 | all, 89 | {SWLowMegaS * 1000000 + SWLowS, SWHighMegaS * 1000000 + SWHighS}, 90 | false, 91 | FoldFun, 92 | InitAcc, 93 | FoldElements 94 | ), 95 | true = {KeyCount div 5, KeyCount div 5} == Runner2(), 96 | 97 | {async, Runner3} = 98 | aae_controller:aae_fold( 99 | Cntrl1, 100 | {buckets, BucketList}, 101 | all, 102 | {0, SWLowMegaS * 1000000 + SWLowS}, 103 | false, 104 | FoldFun, 
105 | InitAcc, 106 | FoldElements 107 | ), 108 | 109 | {0, 0} = Runner3(), 110 | 111 | {async, Runner4} = 112 | aae_controller:aae_fold( 113 | Cntrl1, 114 | {buckets, BucketList}, 115 | all, 116 | { 117 | SWHighMegaS * 1000000 + SWHighS, 118 | SWHighMegaS * 1000000 + SWHighS + 60 119 | }, 120 | false, 121 | FoldFun, 122 | InitAcc, 123 | FoldElements 124 | ), 125 | {0, 0} = Runner4(), 126 | 127 | {async, Runner5} = 128 | aae_controller:aae_fold( 129 | Cntrl1, 130 | {buckets, BucketList}, 131 | all, 132 | all, 133 | 2000, 134 | FoldFun, 135 | InitAcc, 136 | FoldElements 137 | ), 138 | case ParallelStoreType of 139 | leveled_ko -> 140 | {0, {2000, 0}} = Runner5(); 141 | leveled_so -> 142 | true = 143 | {-1, {KeyCount div 5, KeyCount div 5}} == Runner5() 144 | end, 145 | 146 | {async, Runner6} = 147 | aae_controller:aae_fold( 148 | Cntrl1, 149 | {buckets, BucketList}, 150 | all, 151 | {SWLowMegaS * 1000000 + SWLowS, SWHighMegaS * 1000000 + SWHighS}, 152 | 2000, 153 | FoldFun, 154 | InitAcc, 155 | FoldElements 156 | ), 157 | case ParallelStoreType of 158 | leveled_ko -> 159 | {0, {2000, 0}} = Runner6(); 160 | leveled_so -> 161 | true = 162 | {-1, {KeyCount div 5, KeyCount div 5}} == Runner6() 163 | end, 164 | 165 | BKVSL = lists:sublist(BKVListXS, KeyCount - 1000, 128), 166 | SegMapFun = 167 | fun({B, K, _VV}) -> 168 | BinK = aae_util:make_binarykey(B, K), 169 | Seg32 = leveled_tictac:keyto_segment32(BinK), 170 | leveled_tictac:get_segment(Seg32, small) 171 | end, 172 | SegList = lists:map(SegMapFun, BKVSL), 173 | BKVSL_ByBL = 174 | lists:filter( 175 | fun({B, _K, _V}) -> lists:member(B, BucketList) end, 176 | BKVSL 177 | ), 178 | FoldClocksElements = [{clock, null}], 179 | FoldClocksFun = 180 | fun(B, K, ElementList, Acc) -> 181 | {clock, FoldClock} = lists:keyfind(clock, 1, ElementList), 182 | [{B, K, FoldClock} | Acc] 183 | end, 184 | 185 | {async, Runner7} = 186 | aae_controller:aae_fold( 187 | Cntrl1, 188 | {buckets, BucketList}, 189 | {segments, SegList, small}, 
190 | all, 191 | false, 192 | FoldClocksFun, 193 | [], 194 | FoldClocksElements 195 | ), 196 | 197 | FetchedClocks = Runner7(), 198 | io:format( 199 | "Fetched ~w clocks with segment filter~n", 200 | [length(FetchedClocks)] 201 | ), 202 | true = 203 | [] == lists:subtract(BKVSL_ByBL, FetchedClocks), 204 | % Found all the Keys and clocks in the list 205 | true = 206 | (KeyCount div 64) > length(lists:subtract(FetchedClocks, BKVSL_ByBL)), 207 | % Didn't find "too many" others due to collisions on segment 208 | 209 | {async, Runner8} = aae_controller:aae_bucketlist(Cntrl1), 210 | ListOfBuckets = Runner8(), 211 | true = length(ListOfBuckets) == 5, 212 | true = lists:usort(ListOfBuckets) == ListOfBuckets, 213 | % There are five buckets - they are found in the expected order 214 | 215 | ok = aae_controller:aae_close(Cntrl1), 216 | RootPath = testutil:reset_filestructure(). 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KV Tictac Tree 2 | 3 | [![Build Status](https://github.com/martinsumner/kv_index_tictactree/actions/workflows/erlang.yml/badge.svg?branch=develop-3.4)](https://github.com/martinsumner/kv_index_tictactree/actions) 4 | 5 | An Active Anti-Entropy library for Key-Value stores in Erlang. 6 | 7 | This is currently a working prototype with basic testing. The target for the library is to be fully integrated with [Riak KV](https://github.com/basho/riak_kv) for Release 3.0 (Autumn 2018). 8 | 9 | The library could in theory be used by any Erlang application wanting to use Merkle trees to compare different data stores, it is designed for Riak but not coupled to Riak. It is not though a general substitute for Merkle trees when the cryptographic strength of Merkle trees is of importance (e.g. a blockchain implementation). 10 | 11 | ## Overview 12 | 13 | Library to provide an Active-Anti-Entropy (AAE) capability in a KV store. 
The AAE functionality is based on that normally provided through [Merkle Trees](https://github.com/basho/riak_core/blob/2.1.9/src/hashtree.erl), but with two changes from standard practice: 14 | 15 | - The Merkle trees are not cryptographically secure (as it is assumed that the system will use them only for comparison between trusted actors over secure channels). This relaxation of security reduces significantly the cost of maintenance, without reducing their effectiveness for comparison over private channels. To differentiate from secure Merkle trees the name TicTac Merkle trees is used. [Further details on Tictac trees can be found here](docs/TICTAC.md). 16 | 17 | - Indexing of key stores within the AAE system can be 2-dimensional, where the store supports scanning by segment within the store as well as the natural order for the store (e.g. key order). The key store used is a Log-Structured Merge tree but the bloom-style indexes that are used within the store to accelerate normal access have been dual-purposed to align with the hashes used to map to a key into the Merkle tree, and therefore to accelerate access per-segment without requiring ordering by segment. [Further details on making bloom-based indexes in LSM trees dual prupose can be found here](docs/SEGMENT_FILTERED_SST.md) 18 | 19 | The purpose of these changes, and other small improvements to standard Merkle tree anti-entropy, are to allow for: 20 | 21 | - Supporting Active Anti-Entropy without the need to maintain and synchronise additional `parallel` key stores to provide a tree-ordered view of the store. this depends on the primary store having `native` AAE support, and a `parallel` store may still be used where this support is not available. 22 | 23 | - Cached views of TicTac Merkle trees to be maintained in memory by applying deltas to the trees, so as to avoid the scanning of dirty segments at the point of exchange and allow for immediate exchanges. 
Also the cache will be maintained and kept up-to-date during rebuild activity, to prevent loss of anti-entropy validation during any background rebuild processes. 24 | 25 | - False positive avoidance by double-checking each stage of the exchange (separated by a pause), utilising the low cost of querying the tree, and avoiding the false-negative exchanges associated with timing differences between changes reaching different vnodes. 26 | 27 | - The rapid merging of TicTac Merkle trees across data partitions - so a tree for the whole store can be quickly built from cached views of partitions within the store, and be compared with a matching store that may be partitioned using a different layout. 28 | 29 | - A consistent set of features to be made available between AAE in both `parallel` and `native` key store mode - including the ability to query the AAE store to discover information which otherwise would require expensive object folds. 30 | 31 | - Fully asynchronous API to the AAE controller so that the actual partition (vnode) management process can run an AAE controller without being blocked by AAE activity. 32 | 33 | - Allow for AAE exchanges to compare Keys and Clocks for mismatched segments, not just Keys and Hashes, so repair functions can be targeted at the side of the exchange which is behind - avoiding needlessly duplicated 2-way repairs. 34 | 35 | 36 | ## Primary Actors 37 | 38 | The primary actor in the library is the controller (`aae_controller`) - which provides the API to startup and shutdown a server for which will manage TicTac tree caches (`aae_treecache`) and a parallel Key Store (`aae_keystore` - which may be empty when run in `native` mode). The `aae_controller` can be updated by the actual vnode (partition) manager, and accessed by AAE Exchanges (either directly or also via the vnode manager). 39 | 40 | The AAE exchanges (`aae_exchange`) are finite-state machines which are initialised with a Blue List and a Pink List to compare. 
In the simplest form the two lists can be a single vnode and partition identifier each - or they could be different coverage plans consisting of multiple vnodes and multiple partition identifiers by vnode. The exchanges pass through two root comparison stages (to compare the root of the trees, taking the intersection of branch mismatches from both comparisons), two branch comparison stages, and then a Key and logical identifier exchange based on the leaf segment ID differences found, and finally a repair. 41 | 42 | The AAE exchange should work the same way if two partitions are being compared, or two coverage queries across multiple partitions are being compared. 43 | 44 | [More detail on the design can be found here](docs/DESIGN.md). 45 | 46 | [Some further background information can be found here](https://github.com/martinsumner/leveled/blob/master/docs/ANTI_ENTROPY.md). 47 | 48 | ## Using the Library 49 | 50 | Following the [current tests](https://github.com/martinsumner/kv_index_tictactree/blob/master/test/end_to_end/basic_SUITE.erl) presently provides the simplest guide to using the library. There is also a [`mock_kv_vnode`](https://github.com/martinsumner/kv_index_tictactree/blob/master/test/end_to_end/mock_kv_vnode.erl) process used in these tests, which provides a sample view of how an `aae_controller` could be integrated. 51 | 52 | There are two main branches: 53 | 54 | [`develop-3.1 - default`](https://github.com/martinsumner/kv_index_tictactree/tree/develop-3.1): Target for the Riak 3.1 release with support for OTP 22 and OTP 24; 55 | 56 | [`develop-3.0`](https://github.com/martinsumner/kv_index_tictactree/tree/develop-3.0): Used in the Riak 3.0 release with support for OTP 20 and OTP 22; 57 | 58 | [`develop-2.9`](https://github.com/martinsumner/kv_index_tictactree/tree/develop-2.9): Used in the Riak 2.9 release with support for OTP R16 through to OTP 20.
59 | 60 | ### Contributing and Testing 61 | 62 | The acceptance criteria for updating kv_index_tictactree is that it passes rebar3 dialyzer, xref, eunit, and ct with 100% coverage. 63 | 64 | To have rebar3 execute the full set of tests, run: 65 | 66 | `rebar3 as test do xref, dialyzer, cover --reset, eunit --cover, ct --cover, cover --verbose` 67 | 68 | For those with a Quickcheck license, property-based tests can also be run using: 69 | 70 | `rebar3 as eqc do eunit --module=aae_eqc` 71 | 72 | 73 | ### Riak KV 74 | 75 | [This overview](docs/RIAK_2_AAE.md) details how the current (Riak KV 2.2.5) AAE implementation works. 76 | 77 | [This overview](docs/RIAK_3_AAE.md) details how the target (Riak KV 3.0) AAE implementation is expected to work utilising KV Tictac Trees. 78 | -------------------------------------------------------------------------------- /docs/TICTAC.md: -------------------------------------------------------------------------------- 1 | # Merkle Trees and Tictac Trees 2 | 3 | A [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree) is a tree of hashes arranges so that the value of the hash of any branch, is the hash of the accumulation of hashes below it. This allows for trees which represent the same data, to confirm this synchronisation by transferring only the root of the tree. Also, where there are small deltas between the trees, for the tree to be traversed to quickly identify which tree-positions those deltas are in. 4 | 5 | 6 | ## Standard Merkle Tree (Riak) 7 | 8 | In Riak KV 2.2.5, the hashtree has two levels, each 1024 hashes wide - meaning o(1m) overall segments within each tree. The position of a key in the tree is determined by taking the `erlang:phash2/1` hash of the Bucket and Key. The hash of an individual element to be added to the tree is found by taking a `erlang:phash2/1` [hash of the sorted version vector](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_object.erl#L667-L670). 
To calculate the hash of a segment, the hashtree process takes all of the Keys and Hashes in the segment and performs: 9 | 10 | `hash([{K1, H1}, {K2, H2} .... {Kn, Hn}])` 11 | 12 | This time using a sha hash from the erlang crypto library. This hashing of the list of all sub-elements is then used up to the root of the tree to calculate the parent hashes. 13 | 14 | ## Alternative Merkle Tree (Tictac) 15 | 16 | The Tictac trees still use as the value to hash the sorted list of the version vector, but now the hash for the segment ID is built up as follows: 17 | 18 | `hash({K1, VV1}) xor hash({K2, VV2}) xor .... hash({Kn, VVn})` 19 | 20 | This change weakens the cryptographic security of the Merkle tree, in that it directly exposes deltas i.e. the addition of the same Key and Version Vector will always result in the same hash delta in the tree, regardless of the starting point of the tree. Whereas, if the same change has been made to two different trees with strong Merkle trees, the delta in the tree would not be predictable. 21 | 22 | In this context, it is determined that the cryptographic strength isn't important. All actors already have access to data, have a secure communication path, and the purpose of the tree is simply to identify deltas and not to determine the integrity of a change. 23 | 24 | ### Supporting PUT 25 | 26 | The result of this change is that if we know for K1, the old version vector (VV1a) and a new version vector (VV1b), we can determine the hash change to be applied to the tree with just this knowledge: 27 | 28 | `Delta = hash({K1, VV1a}) xor hash({K1, VV1b})` 29 | 30 | This Delta can then be applied to each level of the tree up to the root, and the tree reflects the change. Whereas, with the traditional Merkle tree it is first necessary to find **all** the Keys and Hashes within the changed segment, so that a *new* hash for that segment can be calculated (rather than a *delta* applied).
31 | 32 | As well as changing the process of combining hashes, the hash algorithm is changed to (4-bytes of) md5 for both keys and version vectors (relaxing the unnecessary cryptographic strength, and making it easier to produce the Trees outside of Riak, by not depending on the erlang hash function). 33 | 34 | In the current Merkle tree implementation, every change to a tree requires a scan of a key store, but with a TicTac tree, prior knowledge of the old version vector and the current version vector is all that is required to produce the delta. 35 | 36 | Within Riak, for most PUTs the `riak_kv_vnode` will read before write, and so the old version vector is already known - so no extra read cost is required to update the tree. If the update is following the write once path, by definition (assuming developer competence) the previous version vector can be assumed to be empty. 37 | 38 | The exceptional scenario is for updates using Last Write Wins, with a backend not supporting secondary indexes (currently only Bitcask), where the old version vector will not be known by the `riak_kv_vnode` and cannot be assumed to be empty. There are four options for handling this scenario: 39 | 40 | - Do not support AAE for such buckets; 41 | - Force the riak_kv_vnode to end the read-less write optimisation if Tictac AAE is enabled; 42 | - Pass the old version vector through as undefined, and require the Tictac AAE process to use its own Keystore to discover the old version vector before updating; 43 | - Pass the old version vector through as undefined, and require the Tictac AAE process to seek all Keys and Hashes in the segment to recalculate the hash in this case. 44 | 45 | The best approach is to be determined, but it is assumed it would be better to be one that places responsibility for change on the AAE process, not the existing vnode code. 
46 | 47 | ### Coverage Implications 48 | 49 | In most cases, the cost of altering the tree on PUT is reduced dramatically by switching to Tictac trees. However, the biggest benefit is with regards to merging trees. 50 | 51 | Currently, when trying to perform a full-sync operation between two Riak clusters, this can be done either through key-listing or AAE exchange. The key-listing approach compares the two clusters one vnode at a time (over a covering set of vnodes), and in this case this is an implementation choice to throttle the process. 52 | 53 | The AAE exchange approach also runs the comparison one vnode at a time, but there is no choice in this regard - as the AAE trees are separated out on a per-vnode basis, and it is impossible to merge two Merkle trees without access to all the underlying keys and hashes within both Merkle Trees. 54 | 55 | However, to merge two Tictac trees that cover non-overlapping sets of data, for each segment the result is simply: 56 | 57 | `hash(SegA) xor hash(SegB)` 58 | 59 | So it would be possible to take just the trees from a covering set of vnodes, and without any knowledge of the underlying Keys and Hashes merge those trees (or indeed just the roots of those trees). This means that a covering set of vnodes can efficiently combine all their Tictac trees to produce a single Tictac tree to represent the whole cluster. 60 | 61 | Crucially, this allows for synchronisation between database clusters with different patterns of data partitioning. This would mean that an AAE full-sync process could be run: 62 | 63 | - To aid in migration between clusters of different ring size, working around the issues of ring re-sizing being deprecated in Riak. 64 | - As part of a backup approach, as it will be possible to AAE full-sync replicate to a cluster that not just has a different node count (e.g. a node count of 1), backends (e.g.
to one that is rsync friendly), ring-size (one that is optimal for a smaller cluster size) but also different n-vals (e.g. n-val of 1). 65 | - To make synchronisation between Riak and an alternate database management system easier, assuming that alternative database can also maintain a database-wide Tictac tree. 66 | 67 | 68 | ## Naming 69 | 70 | The name Tictac is taken from the [Tic-Tac language used by on-course bookmakers](https://en.wikipedia.org/wiki/Tic-tac), which was a non-secure but efficient way of communicating deltas in a wide market to a participant in the market. 71 | 72 | This variation in Merkle trees is not novel, in that the use of the less secure XOR operation is known to be used within the [Cassandra database](http://distributeddatastore.blogspot.co.uk/2013/07/cassandra-using-merkle-trees-to-detect.html). However, the overall pattern of anti-entropy in Cassandra is different, with [trees being built and destroyed on demand](https://wiki.apache.org/cassandra/AntiEntropy) rather than being cached and merged. 73 | -------------------------------------------------------------------------------- /test/end_to_end/testutil.erl: -------------------------------------------------------------------------------- 1 | -module(testutil). 2 | 3 | -export([ 4 | gen_keys/2, 5 | gen_keys/3, 6 | put_keys/3, 7 | put_keys/4, 8 | remove_keys/3, 9 | gen_riakobjects/3, 10 | get_modify_functions/1 11 | ]). 12 | -export([calc_preflist/2]). 13 | -export([ 14 | start_receiver/0, 15 | exchange_sendfun/1, 16 | exchange_vnodesendfun/1, 17 | repair_fun/3 18 | ]). 19 | -export([ 20 | reset_filestructure/0, 21 | reset_filestructure/2 22 | ]). 23 | 24 | -export([init_per_suite/1, end_per_suite/1]). 25 | 26 | -include("testutil.hrl"). 27 | 28 | -define(ROOT_PATH, "test/"). 
29 | 30 | init_per_suite(Config) -> 31 | LogTemplate = 32 | [ 33 | time, 34 | " [", 35 | level, 36 | "] ", 37 | {pid, [pid, "@"], []}, 38 | {mfa, [mfa, ":"], []}, 39 | " ", 40 | msg, 41 | "\n" 42 | ], 43 | LogFormatter = 44 | { 45 | logger_formatter, 46 | #{ 47 | time_designator => $\s, 48 | template => LogTemplate 49 | } 50 | }, 51 | {suite, SUITEName} = lists:keyfind(suite, 1, Config), 52 | FileName = "kvtictac_" ++ SUITEName ++ "_ct.log", 53 | LogConfig = 54 | #{ 55 | config => 56 | #{ 57 | file => FileName, 58 | max_no_files => 5 59 | } 60 | }, 61 | 62 | ok = logger:add_handler(logfile, logger_std_h, LogConfig), 63 | ok = logger:set_handler_config(logfile, formatter, LogFormatter), 64 | ok = logger:set_handler_config(logfile, level, info), 65 | 66 | ok = logger:set_handler_config(default, level, notice), 67 | ok = logger:set_handler_config(cth_log_redirect, level, notice), 68 | 69 | ok = logger:set_primary_config(level, info), 70 | 71 | Config. 72 | 73 | end_per_suite(_Config) -> 74 | ok = logger:remove_handler(logfile), 75 | ok = logger:set_primary_config(level, notice), 76 | ok = logger:set_handler_config(default, level, all), 77 | ok = logger:set_handler_config(cth_log_redirect, level, all), 78 | 79 | ok. 80 | 81 | reset_filestructure() -> 82 | reset_filestructure(0, ?ROOT_PATH). 83 | 84 | reset_filestructure(Wait, RootPath) -> 85 | io:format( 86 | "Waiting ~w ms to give a chance for all file closes " ++ 87 | "to complete~n", 88 | [Wait] 89 | ), 90 | timer:sleep(Wait), 91 | clear_all(RootPath), 92 | RootPath. 93 | 94 | clear_all(RootPath) -> 95 | ok = filelib:ensure_dir(RootPath), 96 | {ok, FNs} = file:list_dir(RootPath), 97 | FoldFun = 98 | fun(FN) -> 99 | FFP = filename:join(RootPath, FN), 100 | case filelib:is_dir(FFP) of 101 | true -> 102 | clear_all(FFP ++ "/"); 103 | false -> 104 | case filelib:is_file(FFP) of 105 | true -> 106 | file:delete(FFP); 107 | false -> 108 | ok 109 | end 110 | end 111 | end, 112 | lists:foreach(FoldFun, FNs). 
113 | 114 | gen_keys(KeyList, Count) -> 115 | gen_keys(KeyList, Count, spread_over_buckets). 116 | 117 | gen_keys(KeyList, Count, Floor) when is_integer(Floor) -> 118 | gen_keys(KeyList, Count, spread_over_buckets, Floor); 119 | gen_keys(KeyList, Count, BucketSpec) -> 120 | gen_keys(KeyList, Count, BucketSpec, 0). 121 | 122 | gen_keys(KeyList, Count, _, Floor) when Count == Floor -> 123 | KeyList; 124 | gen_keys(KeyList, Count, BucketSpec, Floor) -> 125 | Bucket = 126 | case BucketSpec of 127 | spread_over_buckets -> integer_to_binary(Count rem 5); 128 | _ -> BucketSpec 129 | end, 130 | Key = list_to_binary(string:right(integer_to_list(Count), 6, $0)), 131 | VersionVector = add_randomincrement([]), 132 | gen_keys( 133 | [{Bucket, Key, VersionVector} | KeyList], 134 | Count - 1, 135 | BucketSpec, 136 | Floor 137 | ). 138 | 139 | put_keys(Cntrl, NVal, KL) -> 140 | put_keys(Cntrl, NVal, KL, none). 141 | 142 | put_keys(_Cntrl, _Nval, [], _PrevVV) -> 143 | ok; 144 | put_keys(Cntrl, Nval, [{Bucket, Key, VersionVector} | Tail], PrevVV) -> 145 | ok = aae_controller:aae_put( 146 | Cntrl, 147 | calc_preflist(Key, Nval), 148 | Bucket, 149 | Key, 150 | VersionVector, 151 | PrevVV, 152 | term_to_binary( 153 | {[os:timestamp()], term_to_binary([{clock, VersionVector}])} 154 | ) 155 | ), 156 | put_keys(Cntrl, Nval, Tail, PrevVV). 157 | 158 | remove_keys(_Cntrl, _Nval, []) -> 159 | ok; 160 | remove_keys(Cntrl, Nval, [{Bucket, Key, _VV} | Tail]) -> 161 | ok = aae_controller:aae_put( 162 | Cntrl, 163 | calc_preflist(Key, Nval), 164 | Bucket, 165 | Key, 166 | none, 167 | undefined, 168 | <<>> 169 | ), 170 | remove_keys(Cntrl, Nval, Tail). 
171 | 172 | gen_riakobjects(0, ObjectList, _TupleBuckets) -> 173 | ObjectList; 174 | gen_riakobjects(Count, ObjectList, TupleBuckets) -> 175 | Bucket = 176 | case TupleBuckets of 177 | true -> 178 | {?BUCKET_TYPE, integer_to_binary(Count rem 5)}; 179 | false -> 180 | integer_to_binary(Count rem 5) 181 | end, 182 | Key = list_to_binary(string:right(integer_to_list(Count), 6, $0)), 183 | Value = crypto:strong_rand_bytes(512), 184 | MD = [ 185 | {last_modified_date, os:timestamp()}, 186 | {random, rand:uniform(3)} 187 | ], 188 | Obj = #r_object{ 189 | bucket = Bucket, 190 | key = Key, 191 | contents = [#r_content{metadata = MD, value = Value}] 192 | }, 193 | gen_riakobjects(Count - 1, [Obj | ObjectList], TupleBuckets). 194 | 195 | get_modify_functions(PreflistFun) -> 196 | PutFun = 197 | fun(Store1, Store2) -> 198 | OtherStores = 199 | case Store2 of 200 | none -> []; 201 | Store2 -> [Store2] 202 | end, 203 | fun(Object) -> 204 | PL = PreflistFun(null, Object#r_object.key), 205 | mock_kv_vnode:put(Store1, Object, PL, OtherStores) 206 | end 207 | end, 208 | DeleteFun = 209 | fun(Stores) -> 210 | fun(Object) -> 211 | PL = PreflistFun(null, Object#r_object.key), 212 | lists:foreach( 213 | fun(Store) -> 214 | mock_kv_vnode:backend_delete( 215 | Store, 216 | Object#r_object.bucket, 217 | Object#r_object.key, 218 | PL 219 | ) 220 | end, 221 | Stores 222 | ) 223 | end 224 | end, 225 | RehashFun = 226 | fun(Stores) -> 227 | fun(Object) -> 228 | PL = PreflistFun(null, Object#r_object.key), 229 | lists:foreach( 230 | fun(Store) -> 231 | mock_kv_vnode:rehash( 232 | Store, 233 | Object#r_object.bucket, 234 | Object#r_object.key, 235 | PL 236 | ) 237 | end, 238 | Stores 239 | ) 240 | end 241 | end, 242 | {PutFun, DeleteFun, RehashFun}. 
243 | 244 | add_randomincrement(Clock) -> 245 | RandIncr = rand:uniform(100), 246 | RandNode = 247 | lists:nth( 248 | rand:uniform(9), 249 | [ 250 | <<"a">>, 251 | <<"b">>, 252 | <<"c">>, 253 | <<"d">>, 254 | <<"e">>, 255 | <<"f">>, 256 | <<"g">>, 257 | <<"h">>, 258 | <<"i">> 259 | ] 260 | ), 261 | UpdClock = 262 | case lists:keytake(RandNode, 1, Clock) of 263 | false -> 264 | [{RandNode, RandIncr} | Clock]; 265 | {value, {RandNode, Incr0}, Rest} -> 266 | [{RandNode, Incr0 + RandIncr} | Rest] 267 | end, 268 | lists:usort(UpdClock). 269 | 270 | calc_preflist(Key, 2) -> 271 | case erlang:phash2(Key) band 3 of 272 | 0 -> 273 | {2, 0}; 274 | _ -> 275 | {2, 1} 276 | end; 277 | calc_preflist(Key, 3) -> 278 | case erlang:phash2(Key) band 3 of 279 | 0 -> 280 | {3, 0}; 281 | 1 -> 282 | {3, 1}; 283 | _ -> 284 | {3, 2} 285 | end. 286 | 287 | start_receiver() -> 288 | receive 289 | {result, Reply} -> 290 | Reply 291 | end. 292 | 293 | exchange_sendfun(Cntrl) -> 294 | SendFun = 295 | fun(Msg, Preflists, Colour) -> 296 | RPid = self(), 297 | ReturnFun = 298 | fun(R) -> 299 | aae_exchange:reply(RPid, R, Colour) 300 | end, 301 | case Msg of 302 | fetch_root -> 303 | aae_controller:aae_mergeroot( 304 | Cntrl, 305 | Preflists, 306 | ReturnFun 307 | ); 308 | {fetch_branches, BranchIDs} -> 309 | aae_controller:aae_mergebranches( 310 | Cntrl, 311 | Preflists, 312 | BranchIDs, 313 | ReturnFun 314 | ); 315 | {fetch_clocks, SegmentIDs} -> 316 | aae_controller:aae_fetchclocks( 317 | Cntrl, 318 | Preflists, 319 | SegmentIDs, 320 | ReturnFun, 321 | null 322 | ) 323 | end 324 | end, 325 | SendFun. 326 | 327 | exchange_vnodesendfun(VN) -> 328 | fun(Msg, Preflists, Colour) -> 329 | RPid = self(), 330 | ReturnFun = 331 | fun(R) -> 332 | aae_exchange:reply(RPid, R, Colour) 333 | end, 334 | mock_kv_vnode:exchange_message(VN, Msg, Preflists, ReturnFun) 335 | end. 
336 | 337 | repair_fun(SourceList, Cntrl, NVal) -> 338 | Lookup = lists:map(fun({B, K, V}) -> {{B, K}, V} end, SourceList), 339 | RepairFun = 340 | fun(BucketKeyL) -> 341 | FoldFun = 342 | fun({{B0, K0}, _VCDelta}, Acc) -> 343 | {{B0, K0}, V0} = lists:keyfind({B0, K0}, 1, Lookup), 344 | [{B0, K0, V0} | Acc] 345 | end, 346 | KVL = lists:foldl(FoldFun, [], BucketKeyL), 347 | ok = put_keys(Cntrl, NVal, KVL) 348 | end, 349 | RepairFun. 350 | -------------------------------------------------------------------------------- /docs/GENERAL_TICTACAAE_FOR_RIAK.md: -------------------------------------------------------------------------------- 1 | # Background 2 | 3 | Further helpful details on the background to this work can be found in the previous [Anti-Entropy](ANTI_ENTROPY.md) write-up. 4 | 5 | The aim is to provide a better answer to the active anti-entropy in Riak. Specifically, it would be preferable to resolve the following issues: 6 | 7 | - Rebuild times. Both the cost of rebuilds but also the cost in the failure of AAE-dependent processes during rebuilds. For example, due to the [rate-limiting of rebuilds](https://github.com/basho/riak_kv/blob/2.1.7/src/riak_kv_index_hashtree.erl#L98-L101), rebuilding a single vnode can take o(10) hours. during this rebuild time, these partitions are not subject to internal AAE, and Multi-Data Centre AAE [may be blocked altogether](https://github.com/basho/riak_repl/issues/772). 8 | 9 | - Version inconsistencies. The process of trying to make the transition from one version of AAE to another smooth, is potentially [too disruptive](https://github.com/basho/riak_kv/issues/1659), and leaves a long legacy in [future versions](https://github.com/basho/riak_kv/issues/1656). 10 | 11 | - Cost of AAE. Every AAE exchange requires in effect a [range scan](https://github.com/basho/riak_core/blob/2.1.9/src/hashtree.erl#L65-L72) in the key-store for every key updated since the last exchange for that partition. 
This contributes to a 10% performance overhead associated with running AAE. 12 | 13 | - Support for native AAE support within backends. The Leveled backend can support optimisations for by-segment scanning over its native key-store, potentially rendering the need to keep (and periodically rebuild) a dedicated key-store for AAE unnecessary. It would be beneficial to have an improved AAE that can exploit this advantage, without preventing the anti-entropy solution from being used on backends that would require a dedicated anti-entropy store. 14 | 15 | # Overview Of Needs 16 | 17 | The high level changes proposed are: 18 | 19 | - Have an AAE solution per vnode where the key-store is both optional (and so can be avoided where native support renders it unnecessary), and has swappable backends (including a pure Erlang alternative to Leveldb). 20 | 21 | - Keep the actual AAE Merkle Trees cached using TicTac trees to support updates to the tree without scanning. 22 | 23 | - Use per-partition TicTac trees so that the Merkle trees can be merged across vnodes, to make AAE backed synchronisation possible between clusters of different ring sizes. 24 | 25 | - Allow rebuilds to take place in parallel to maintaining the old store and cache of the Merkle tree - so exchanges can continue through the rebuild process. 26 | 27 | - Formalise the use of dotted version vector as the basis for the object hash to reduce the cost of object binary changes and copying. Also allow for intelligent comparisons between clusters by exchanging keys & clocks, not just keys & hashes. 28 | 29 | - Have the new AAE solution work in parallel to the legacy solution, so that migration is controlled through administration/configuration, and the legacy solution can be permanently forgotten by the cluster. 30 | 31 | - Externalise the AAE functions, so that the same functions can be used for synchronisation with different database platforms, without requiring internal changes to Riak.
32 | 33 | # AAE design 34 | 35 | ## Actors, States and Messages 36 | 37 | The primary actors: 38 | 39 | - AAEController (1 per vnode) - gen_fsm 40 | 41 | - KeyStore (1 per Controller) - gen_server 42 | 43 | - TreeCache (n per Controller) - gen_fsm 44 | 45 | - DiskLog (temporary - 1 per Controller) - gen_server 46 | 47 | ### AAEController 48 | 49 | The AAEController will have 4 states: `starting`, `replacing-store`, `replacing-tree` and `steady`. In all states except `starting` an exchange will be possible. 50 | 51 | The AAEController can receive data updates from the vnode in four forms: 52 | 53 | - {IndexN, Bucket, Key, CurrentClock, unidentified} for PUTs marshalled via the blind_put (for LWW buckets without 2i support in the backend e.g. LWW -> Bitcask), or when a rehash request has been made for a single object; 54 | 55 | - {IndexN, Bucket, Key, CurrentClock, PreviousClock} for standard object updates (PreviousClock will be none for fresh objects); 56 | 57 | - {IndexN, Bucket, Key, none, PreviousClock} for actual backend deletes (e.g. post tombstone). 58 | 59 | The AAE Controller will handle the casting or calling of these messages by casting a message to the appropriate TreeCache to prompt an update, and then adding the update to a queue to be batch written to the KeyStore. There is an additional penalty for changes where the PreviousClock is unidentified in that they will require a range scan of the KeyStore to generate the TreeCache update message. 60 | 61 | The AAE controller may also receive requests to retrieve the branch or leaf hashes for a given partition TreeCache, as well as trigger rebuilds or rehashes.
62 | 63 | ### KeyStore 64 | 65 | The KeyStore needs to support four operations: 66 | 67 | - A batch PUT of objects 68 | 69 | - An object fold bounded by a range 70 | 71 | - An is_empty check 72 | 73 | - A GET of a single object 74 | 75 | On startup the AAEController must be informed by the vnode the is_empty status of the actual vnode key store, and this should match the is_empty status of the AAE key store. If there is a mismatch then the KeyStore must be rebuilt before the AAEController can exit the `starting` state. 76 | 77 | As vnode changes are made, these changes should be reflected in batches in the KeyStore. The Key for the entry in the KeyStore should be a tuple of `{IndexN, SegmentID, Bucket, Key}` where SegmentID is the hash of the Bucket and Key used to map the identifier to a location in the merkle tree. The Value of the object should be a tuple of `{VectorClock, Hash}`. 78 | 79 | Activity in the KeyStore should be optimised for the vast majority of traffic being PUTs. Queries are only used for the KeyStore when: 80 | 81 | - Folding over all objects by IndexN and SegmentID to return Keys/Clocks for a given segment; 82 | 83 | - Folding over all objects to recalculate an AAE tree for each IndexN; 84 | 85 | - Fetching of a specific object by IndexN, SegmentID, Bucket and Key to recalculate a specific hash in the AAE tree when the update to the AAEController has a PreviousClock of `unidentified`. 86 | 87 | When a KeyStore needs to be rebuilt, a new KeyStore is started, but the old KeyStore should continue to receive updates, and be used to fulfil requests for Keys and Clocks and to read `unidentified` Clocks. Only once the new store is complete, should the old store be destroyed. 88 | 89 | A manifest file should be kept to indicate which is the current active store to be used on a restart. 
90 | 91 | If the vnode backend has native support for the queries required by the AAE KeyStore, then the KeyStore can be run in native mode - ignoring the batch puts, and re-directing the queries to the actual vnode backend. In native mode `unidentified` previous clocks cannot be supported (and should not be needed). 92 | 93 | ### TreeCache 94 | 95 | There is a TreeCache process for each IndexN managed by the AAEController. The TreeCache receives changes in the form {SegmentID, HashChange}. The HashChange is calculated by performing an XOR operation on the hash of the current clock, and the hash of the previous clock. The SegmentID is calculated from the hash of the concatenated Bucket and Key binary (`<<Bucket/binary, Key/binary>>`). 96 | 97 | The TreeCache process should respond to each update by changing the tree to reflect that update. 98 | 99 | The TreeCache can be in a `starting` state, for instance when a new cache is being built by the AAEController in the `replacing-tree` state. In the starting state the TreeCache should not be forwarded requests for AAE tree information. 100 | 101 | 102 | ### DiskLog 103 | 104 | When both replacing a store and replacing a tree, batches of updates need to be cached until the store or tree is ready to receive them. For example, rebuilding the store will start a new KeyStore backend and take a snapshot of the real vnode backend to fold and populate the store. However, the store being rebuilt cannot receive new updates during this rebuild process (without requiring all the updates from the fold to require a read before the PUT) - so the batches of new updates need to be cached in a log, to be applied only once the fold is complete. 105 | 106 | ## Startup and Shutdown 107 | 108 | On shutdown any incomplete batches should be passed to the KeyStore and the KeyStore shutdown. All functioning TreeCaches should be shutdown, and on shutdown should write a CRC-checked file containing the serialised tree.
At the point the shutdown is requested, the TreeCache may be at a more advanced state than the KeyStore, and if sync_on_write is not enabled in the vnode backend the KeyStore could be in advance of the backend. To try and protect against situations on startup where the TreeCache reflects a more advanced state than the actual vnode - the TreeCache should not be persisted until the vnode backend and the AAE KeyStore have both successfully closed. 109 | 110 | On startup, if shutdown was completed normally, the TreeCaches should be restored from disk, as well as the KeyStore. Any partially rebuilt KeyStore should be destroyed. 111 | 112 | On recovering a TreeCache from disk, the TreeCache process should delete the TreeCache from disk before receiving any update. 113 | 114 | If the shutdown was unclean, and there is a KeyStore, but no persisted TreeCache, then before completing startup the AAEController should enforce a fold over the KeyStore to rebuild the TreeCaches. 115 | 116 | If the KeyStore has missing updates due to an abrupt shutdown, this will cause (potentially false) repairs of the keys, and the repair will also trigger a rehash. The rehash should prompt a correction in the AAE KeyStore (through an `unidentified` previous clock) to bring the TreeCache and KeyStore back into line. 117 | 118 | ## Rebuilds and Rehashes 119 | 120 | If an AAE KeyStore is used in non-native mode, periodically the Keystore should be rebuilt, should there be entropy from disk in the actual KeyStore. This is achieved using the `replacing-store` state in the AAEController. 121 | 122 | When replacing a store, the previous version of the store will be kept up to date and used throughout the rebuild process, in order to prevent the blocking of exchanges. The only exception to this is when a rebuild has been prompted by a conflict of `is_empty` properties on startup - in which case the vnode startup process should be paused to allow for the rebuild to complete.
123 | 124 | To avoid the need to do reads before writes when updating the AAE KeyStore from the vnode backend fold (so as not to replace a new update with an older snapshot value from the backend) new updates must be parked in a DiskLog process whilst the fold completes. Once the fold is complete, the rebuild of store can be finished by catching up on updates from the DiskLog. 125 | 126 | At this stage the old Keystore can be deleted, and the new KeyStore be used. At this stage though, the TreeCache does not necessarily reflect the state of the new KeyStore - the `replacing-tree` state is used to resolve this. When replacing the tree, new empty TreeCaches are started and maintained in parallel to the existing TreeCaches (which continue to be used in exchanges). A fold of the KeyStore is now commenced, whilst new updates are cached in a DiskLog. Once the fold is complete, the new updates are applied and the TreeCache can be migrated from the old cache to the new cache. 127 | 128 | 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/RIAK_3_AAE.md: -------------------------------------------------------------------------------- 1 | # Proposed AAE Implementation in Riak 3.0 2 | 3 | - Each riak vnode has a kv_index_tictactree `aae_controller` process if Tictac AAE is enabled. It may also run [Riak 2 AAE](RIAK_2_AAE.md) in parallel - the two AAE implementations should not care about the existence of the other. 4 | 5 | - That `aae_controller` keeps a single `aae_keystore` process which is responsible for keeping all the keys and AAE metadata for the store. The `aae_keystore` process can be any backend that supports query by Merkle tree segment that duplicates the storage of keys; this is `parallel` mode. The `aae_keystore` could also be a reference to the vnode backend PID, if the vnode backend can itself support query by AAE segment; this is `native` mode and doesn't require the duplictaion of key/metadata storage. 
6 | 7 | - As well as the key store, the `aae_controller` process keeps a `aae_treecache` process for each combination of preflist and n_val the vnode supports (or IndexN). This process is an in-memory cache of the current state of the Tictac Merkle tree for that IndexN. 8 | 9 | - IndexN is a reference to a combination of a n-val and a preflist supported by the vnode. If all buckets in the store are n-val 3, then there will be 3 IndexNs per vnode (and hence 3 merkle trees). If there are some buckets with an n-val of 3 and some with an n-val of 4 - there will be 7 merkle trees. 10 | 11 | - There is no longer a concept of dirty segments, each tree cache represents the current live state of the vnode. 12 | 13 | ## On PUT 14 | 15 | - When a vnode receives a PUT request, it passes a change note to the `aae_controller`. The change consists of the {Bucket, Key}, the IndexN, the CurrentClock and the PreviousClock for the PUT - as well as some useful metadata about the value (e.g. sibling count, index_hash, perhaps the whole object head). 16 | 17 | - If the change is a delete the CurrentClock should be none. 18 | 19 | - If the change is a fresh put, the PreviousClock should be none (this includes any PUt on the write once path). 20 | 21 | - If the change is a LWW PUT in an non-index backend, the PreviousClock should be undefined. 22 | 23 | - If the change is a rehash, then the PreviousClock should be undefined. 24 | 25 | - The `aae_controller` receives this update as a cast. It has two tasks now: 26 | 27 | - If the `aae_keystore` is running in `parallel` mode, cast the change on to the keystore. The `aae_keystore` should then queue up the new version for the next batch of keys/metadata to be stored. In `native` mode, no keystore change is required. 28 | 29 | - Based on the IndexN, the `aae_controller` should cast the delta to the appropriate `aae_treecache`. 
This should update the Merkle tree by removing the old version from the tree (by XORing the hash of the {Key, PreviousClock} again), and adding the new version (by XORing the segment by the hash of the {Key, CurrentClock}). 30 | 31 | - The exception to this is when the PreviousClock is undefined - meaning there was no read before write. In this case, the PreviousClock needs to be filled in with a read against the `aae_keystore` before processing. 32 | 33 | - One scenario where the previous clock is `undefined` is when Last Write Wins is used with a non-indexing backend. This removes some of the efficiency advantages of Last Write Wins writes, though doesn't eliminate the latency improvement (as the AAE read does not block the update from proceeding). 34 | 35 | - The other scenario is on a `rehash` request (when a read_repair GET request has not discovered an expected anomaly between the vnode values). 36 | 37 | - The `aae_keystore` may fill-in this information two ways. It could simple read the previous object in the keystore, or it could fold over all objects in that segment and IndexN to calculate an entirely new hash value for that segment. Perhaps the latter should be a fallback for `rehash` requests (i.e. on a dice roll on a rehash request, so rehash eventually causes this) 38 | 39 | - Before any fold operation on the `aae_keystore` the batch of changes waiting to be written are flushed. 40 | 41 | ## On Exchange 42 | 43 | - Riak can then be prompted to do exchanges (maybe via an entropy manager, maybe through scheduled external requests). An exchange could be: 44 | 45 | - An exchange for a given n_val between two coverage plans (with different offsets); 46 | 47 | - An exchange between each locally stored Preflist, and another remote Preflist for that n_val. 48 | 49 | - An exchange between a randomly chosen set of pairs of common IndexNs between vnodes. 50 | 51 | - An exchange is done by starting an `aae_exchange` process. 
The `aae_exchange` is a FSM and should be initiated by the calling service via sidejob. The `aae_exchange` process takes as input: 52 | 53 | - A BlueList and a PinkList - lists of {SendFun, [IndexN]} tuples, where the SendFun is a function that can send a message to a given controller, and the list of IndexNs are the preflist/n_val pairs relative to this exchange at that destination. The SendFun in this case should use the riak_core message passing to reach the riak_kv_vnode - and the riak_kv_vnode will be extended to detect AAE commands and forward them to the `aae_controller`. 54 | 55 | - A RepairFun - that will be passed any deltas, and in the case of intra-cluster anti-entropy the RepairFun should just send a throttled stream of GET requests to invoke read_repair 56 | 57 | - A ReplyFun - to send a response back to the client (giving the state at which the FSM exited, and the number of KeyDeltas discovered. 58 | 59 | - The exchange will request all the tree roots to be fetched from the Blue List and the Pink List - merging to give a Blue root and a Pink root, and comparing those roots. This will provide a list of branch IDs that may potentially have deltas. If the list is empty, the process will reply and exit. 60 | 61 | - The exchange will then pause, and then re-request all the tree roots. This will produce a new list of BranchID deltas from the comparison of the merged roots, and this will be intersected with the first list. If the list is empty, the process will reply and exit. 62 | 63 | - The exchange will then pause, and then request all the branches that were in the intersected list of BranchIDs from the Blue and Pink lists. This will again be a repeated request, with the intersection of the SegmentIDs that differ being taken forward to the next stage, and the process will reply and exit if the list is empty. 
64 | 65 | - The number of SegmentIDs that are taken forward for the clock comparison is bounded, and the code will attempt to choose the set of SegmentIDs that are closest together as the subset to be used. Those SegmentIDs will then be forwarded in a request to `fetch_clocks`. These requests will be passed by the `aae_controller` to the `aae_keystore`, and this will fold over the store (and this will be the vnode store if the `aae_keystore` is running in native mode), looking for all Keys and Version Vectors within those segments and IndexNs. If the keystore is segment-ordered, this will be a series of range folds on the snapshot. If the keystore is key-ordered, then there will be a single fold across the whole store, but before a slot of keys is passed into the fold process it will be checked to see if it contains any key in the segment - and skipped if not. 66 | 67 | - When the Keys and Clocks are returned they are compared, and then deltas are passed to the RepairFun for read repair. 68 | 69 | 70 | ## On startup and shutdown 71 | 72 | - Before a vnode backend is shut down, a special object should be stored where the value is a Shutdown GUID. When a vnode backend is started, the object should be read, and if present should then be deleted. 73 | 74 | - When the `aae_controller` is started it is passed the IsEmpty status of the vnode backend as well as the shutdown GUID. The `aae_keystore` should likewise have the Shutdown GUID on shutdown (and erase it on startup), and on startup confirm that the IsEmpty status and Shutdown GUIDs match between the vnode and the `parallel` keystore. 75 | 76 | - If there is no match on startup, then it cannot be assumed that the two stores are consistent, and the next rebuild time should be set into the past. This should then prompt a rebuild. 
Until the rebuild is complete, the `aae_controller` should continue on a best endeavours basis, assuming that the data in the `aae_treecache` and `aae_keystore` is good enough until the rebuild completes. 77 | 78 | 79 | ## On rebuild 80 | 81 | - Rebuilds are not prompted by the `aae_controller`, they require an external actor to prompt them. 82 | 83 | - The `aae_controller` tracks the next rebuild time, the time when it should next be rebuilt. This is based on adding a rebuild schedule (a fixed time between rebuilds and a random variable time to reduce the chances of rebuild synchronisation across vnodes) to the last rebuild time. The last rebuild time is persisted to be preserved across restarts. 84 | 85 | - The next rebuild time is reset to the past on startup, if a failure to shut down cleanly and consistently either the vnode or the aae service is detected through a mismatch on the Shutdown GUID. 86 | 87 | - The vnode should check the next rebuild time after startup of the `aae_controller`, and schedule a callback to prompt the rebuild at that time. 88 | 89 | - When the rebuild time is reached, the vnode should prompt the rebuild of the store via the `aae_controller`. 90 | 91 | - the prompt should pass in a SplitObjFun which can extract/calculate the IndexN and version vector for a Riak object in the store (this is required only in `parallel` mode). 92 | 93 | - the prompt should first flush all batched updates to the `aae_keystore`, and trigger the `aae_keystore` to enter the `loading` state. 94 | 95 | - in the `loading` state a separate backend for the keystore is started. All updates received from that point are added in batches to the main keystore as normal, but also queued up for the new keystore. 96 | 97 | - a fold objects function and a finish function is returned in response to the prompt request. 
98 | 99 | - the fold fun will load all passed objects in batches as object_specs (by extracting out the vector clock etc) directly into the new load store of the `aae_keystore`. 100 | 101 | - the finish fun will prompt the worker running the fold to prompt the keystore to complete the load. This will involve loading all the queued object specs (of updates received since the fold was started) into the store, deleting the old key store, and making the newly rebuilt key store the master. 102 | 103 | - the vnode process takes these fold functions, and starts an object fold using the functions (if snapshot fold is supported in the backend, then via a riak_core_node_worker so as to avoid multiple parallel folds). 104 | 105 | - the vnode should also request a callback from the worker when the work is completed, to prompt it to prompt the `aae_controller` to now rebuild the tree caches. 106 | 107 | - If the `aae_keystore` is in `native` mode, none of the above happens, as the store is the store, and so there is no need for a rebuild. 108 | 109 | - the `aae_controller` should be prompted to rebuild_trees, and for this IndexNs (the list of IndexNs the vnode currently manages), PreflistFun (a fun to calculate the IndexN from a {B, K} pair - required only in `native` mode), and a WorkerFun (a node/vnode worker to run the next fold) is passed. 110 | 111 | - the `aae_controller` should inform the `aae_treecaches` to start loading, this requires them to queue new updates as well as making the changes in the cache. 112 | 113 | - A fold is then run over the key store (or the vnode store in the case of `native` backends), using the WorkerFun. 114 | 115 | - the fold should incrementally build a separate TicTac tree for each IndexN. 116 | 117 | - when the fold is complete, the trees are sent to the `aae_treecache` processes for loading. 
Loading discards the previous tree, takes the new tree, and loads all the changes which have been queued since the point the snapshot for the fold to build the trees was taken. 118 | 119 | - This completes the rebuild process. It is important that the folds in the rebuild process use snapshots which are co-ordinated with the formation of the load queues - so that the deltas being applied to the load queues takes the system to a consistent point. 120 | 121 | - If there is a shutdown during a rebuild process, all the partially built elements are discarded, and the rebuild will be due again at startup. 122 | 123 | - Scheduling of rebuilds is not centrally managed (so the locks required to reduce concurrency in the existing AAE process are discarded). There is instead a combination of some random factor added to the schedule time, plus the use of the core_node_worker_pool - which prevents more than M folds being on a node concurrently (where M is the size of the pool, normally 1). 124 | 125 | ## Secondary Uses 126 | 127 | - A proof of concept on coverage folds showed that there were some interesting operations that could be managed more safely and efficiently than Map Reduce, using folds over the heads of objects, using a `core_node_worker_pool` and a backend which stores heads separate to objects. In introducing an AAE store where the `aae_keystore` can store additional metadata, perhaps including the whole object head - there exists the potential to bring these features to all backends. 128 | 129 | - Another possibility is the efficient handling of `HEAD` not `GET` requests (for example where only version vector is required). This is only supported in the leveled backend at present, in other backends it can be supported by still reading the object, and just stripping to the head to avoid the network overhead. 
It may be possible for a `riak_kv_vnode` with a bitcask backend to handle `HEAD` requests in this way, unless co-ordination between backend and AAE store is confirmed (because of matching Shutdown GUIDs at startup or a rebuild since startup). In this case the `HEAD` request could instead be handled by the AAE store, avoiding the actula object read. 130 | -------------------------------------------------------------------------------- /src/aae_util.erl: -------------------------------------------------------------------------------- 1 | %% -------- Overview --------- 2 | %% 3 | %% Centralised definition of log functions. To make switching to Lager in 4 | %% the future a bit easier, and avoid repeated codes across modules 5 | 6 | -module(aae_util). 7 | 8 | -include("aae.hrl"). 9 | 10 | -export([ 11 | log/4, 12 | get_log/1, 13 | get_opt/2, 14 | get_opt/3, 15 | make_binarykey/2, 16 | safe_open/1, 17 | set_loglevel/1, 18 | maybe_include_key/2, 19 | check_rootpath/1 20 | ]). 21 | 22 | -export([ 23 | clean_subdir/1, 24 | test_key_generator/1, 25 | flip_byte/3 26 | ]). 27 | 28 | -ifdef(TEST). 29 | -export([get_segmentid/2]). 30 | -endif. 31 | 32 | -type log_level() :: debug | info | warning | error | critical. 33 | -type log_levels() :: list(log_level()) | undefined. 34 | 35 | -export_type([log_levels/0]). 36 | 37 | -define(DOMAIN, [background, tictacaae]). 
38 | 39 | %% erlfmt:ignore-begin 40 | -define(LOGBASE, 41 | #{ 42 | g0001 => 43 | {info, <<"Generic log point">>}, 44 | g0002 => 45 | {info, <<"Generic log point with term ~w">>}, 46 | d0001 => 47 | {debug, <<"Generic debug log">>}, 48 | aae01 => 49 | {warning, 50 | << 51 | "AAE Key Store rebuild required on startup due to mismatch between vnode store state ~w " 52 | "and AAE key store state of ~w maybe restart with node excluded from coverage " 53 | "queries to improve AAE operation until rebuild is complete" 54 | >> 55 | }, 56 | aae02 => 57 | {info, <<"Native KeyStore type ~w startup request">>}, 58 | aae03 => 59 | {debug, 60 | <<"Unexpected Bucket ~w Key ~w passed with IndexN ~w that does not match any of ~w">> 61 | }, 62 | aae04 => 63 | {warning, <<"Misrouted request for IndexN ~w">>}, 64 | aae06 => 65 | {info, <<"Received rebuild trees request for IndexNs ~w">>}, 66 | aae07 => 67 | {info, <<"Dispatching test fold">>}, 68 | aae08 => 69 | {info, <<"Spawned worker receiving test fold">>}, 70 | aae09 => 71 | {info, <<"Change in IndexNs detected at rebuild - new IndexN ~w">>}, 72 | aae10 => 73 | {info, <<"AAE controller started with IndexNs ~w and StoreType ~w">>}, 74 | aae11 => 75 | {info, <<"Next rebuild scheduled for ~w">>}, 76 | aae12 => 77 | {info, <<"Received rebuild store for parallel store ~w">>}, 78 | aae13 => 79 | {info, <<"Completed tree rebuild with rebuild_time_ms=~w">>}, 80 | aae14 => 81 | {debug, <<"Mismatch finding unexpected IndexN in fold of ~w">>}, 82 | aae15 => 83 | {info, <<"Ping showed time difference of ~w ms">>}, 84 | aae16 => 85 | {info, <<"Keystore ~w when tree rebuild requested">>}, 86 | aae17 => 87 | {warning, <<"Corrupted object with B=~p K=~p for ~w ~w">>}, 88 | ex001 => 89 | {info, <<"Exchange id=~s with target_count=~w expected purpose=~w">>}, 90 | ex002 => 91 | {error, <<"~w with pending_state=~w and missing_count=~w for exchange id=~s purpose=~w">>}, 92 | ex003 => 93 | {info, 94 | << 95 | "Normal exit for full exchange purpose=~w 
in_sync=~w pending_state=~w for exchange id=~s " 96 | "scope of mismatched_segments=~w root_compare_loops=~w branch_compare_loops=~w keys_passed_for_repair=~w" 97 | >> 98 | }, 99 | ex004 => 100 | {info, <<"Exchange id=~s purpose=~w led to prompting of repair_count=~w">>}, 101 | ex005 => 102 | {info, <<"Exchange id=~s throttled count=~w at state=~w">>}, 103 | ex006 => 104 | {debug, <<"State change to ~w for exchange id=~s">>}, 105 | ex007 => 106 | {debug, <<"Reply received for colour=~w in exchange id=~s">>}, 107 | ex008 => 108 | {debug, <<"Comparison between BlueList ~w and PinkList ~w">>}, 109 | ex009 => 110 | {info, 111 | << 112 | "Normal exit for full exchange purpose=~w in_sync=~w pending_state=~w for exchange id=~s " 113 | "scope of mismatched_segments=~w tree_compare_loops=~w keys_passed_for_repair=~w" 114 | >> 115 | }, 116 | ex010 => 117 | {warning, <<"Exchange not_supported in exchange id=~s for colour=~w purpose=~w">>}, 118 | ex011 => 119 | {info, <<"Filtered clocks before comparison removing blue=~w pink=~w">>}, 120 | ex012 => 121 | {info, <<"Bucket counts for blue ~0p pink ~0p">>}, 122 | ks001 => 123 | {info, <<"Key Store loading with id=~w has reached deferred count=~w">>}, 124 | ks002 => 125 | {warning, <<"No valid manifest found for AAE keystore at ~s reason ~s">>}, 126 | ks003 => 127 | {info, <<"Storing manifest with current GUID ~s">>}, 128 | ks004 => 129 | {info, <<"Key Store building with id=~w has reached loaded_count=~w">>}, 130 | ks005 => 131 | {info, <<"Clean opening of manifest with current GUID ~s">>}, 132 | ks006 => 133 | {warning, <<"Pending store is garbage and should be deleted at ~s">>}, 134 | ks007 => 135 | {info, <<"Rebuild prompt ~w with GUID ~s">>}, 136 | ks008 => 137 | {info, <<"Rebuild queue load backlog_items=~w loaded_count=~w">>}, 138 | r0001 => 139 | {info, <<"AAE fetch clock runner has seen results=~w query_time=~w for a query_count=~w queries">>}, 140 | r0002 => 141 | {info, <<"Query backlog resulted in dummy fold">>}, 142 | 
r0003 => 143 | {debug, <<"Query complete in time ~w">>}, 144 | r0004 => 145 | {debug, <<"Prompting controller">>}, 146 | r0005 => 147 | {warning, <<"Query lead to error ~w pattern ~w">>}, 148 | c0001 => 149 | {info, <<"Pending filename ~s found and will delete">>}, 150 | c0002 => 151 | {warning, <<"File ~w opened with error=~w so will be ignored">>}, 152 | c0003 => 153 | {info, <<"Saving tree cache to path ~s and filename ~s">>}, 154 | c0004 => 155 | {info, <<"Destroying tree cache for partition ~w">>}, 156 | c0005 => 157 | {info, <<"Starting cache with is_restored=~w and IndexN of ~w">>}, 158 | c0006 => 159 | {debug, <<"Altering segment for PartitionID=~w ID=~w Hash=~w">>}, 160 | c0007 => 161 | {warning, <<"Treecache exiting after trapping exit from Pid=~w">>}, 162 | c0008 => 163 | {info, <<"Complete load of tree with length of change_queue=~w">>}, 164 | c0009 => 165 | {info, <<"During cache rebuild reached length of change_queue=~w">>} 166 | 167 | }). 168 | %% erlfmt:ignore-end 169 | 170 | %%%============================================================================ 171 | %%% External functions 172 | %%%============================================================================ 173 | 174 | -spec get_log(atom()) -> {log_level(), binary()}. 175 | get_log(LogRef) -> 176 | maps:get(LogRef, ?LOGBASE). 177 | 178 | -spec log(log_level(), atom(), leveled_log:log_options(), list()) -> list(). 179 | log(LogLevel, LogRef, LogOpts, Subs) -> 180 | leveled_log:log( 181 | LogLevel, 182 | LogRef, 183 | LogOpts, 184 | Subs, 185 | ?LOGBASE, 186 | ?DOMAIN 187 | ). 188 | 189 | -spec set_loglevel(list() | undefined) -> ok. 
190 | set_loglevel(undefined) -> 191 | ok; 192 | set_loglevel(Inputs) when Inputs =/= undefined -> 193 | LogLevel = 194 | lists:foldl( 195 | fun(Check, Acc) -> 196 | case {Check, Acc} of 197 | {_, none} -> 198 | case {Check, lists:member(Check, Inputs)} of 199 | {_Check, true} -> 200 | Check; 201 | {critical, false} -> 202 | % no valid input - set to info 203 | info; 204 | {_Check, false} -> 205 | none 206 | end; 207 | {_, Level} when Level /= none -> 208 | Level 209 | end 210 | end, 211 | none, 212 | [debug, info, warning, error, critical] 213 | ), 214 | case LogLevel of 215 | LogLevel when LogLevel /= none -> 216 | leveled_log:set_loglevel(LogLevel) 217 | end. 218 | 219 | -spec check_rootpath(list()) -> string(). 220 | check_rootpath(RootPath) -> 221 | case io_lib:printable_list(RootPath) of 222 | true -> 223 | RootPath 224 | end. 225 | 226 | -spec get_opt(atom(), list()) -> any(). 227 | %% @doc 228 | %% Return an option from a KV list 229 | get_opt(Key, Opts) -> 230 | get_opt(Key, Opts, undefined). 231 | 232 | -spec get_opt(atom(), list(), any()) -> any(). 233 | %% @doc 234 | %% Return an option from a KV list, or a default if not present 235 | get_opt(Key, Opts, Default) -> 236 | case proplists:get_value(Key, Opts) of 237 | undefined -> 238 | Default; 239 | Value -> 240 | Value 241 | end. 242 | 243 | -spec make_binarykey(aae_keystore:bucket(), aae_keystore:key()) -> binary(). 244 | %% @doc 245 | %% Convert Bucket and Key into a single binary 246 | make_binarykey({Type, Bucket}, Key) when 247 | is_binary(Type), is_binary(Bucket), is_binary(Key) 248 | -> 249 | <>; 250 | make_binarykey(Bucket, Key) when is_binary(Bucket), is_binary(Key) -> 251 | <>. 252 | 253 | -spec maybe_include_key( 254 | aae_controller:key_include_fun(), 255 | {aae_keystore:bucket(), aae_keystore:key()} | reset 256 | ) -> 257 | boolean(). 
258 | maybe_include_key(none, _Input) -> 259 | true; 260 | maybe_include_key(KeyFilterFun, {Bucket, Key}) -> 261 | KeyFilterFun({Bucket, Key}); 262 | maybe_include_key(KeyFilterFun, reset) -> 263 | KeyFilterFun(reset). 264 | 265 | %%%============================================================================ 266 | %%% Internal functions 267 | %%%============================================================================ 268 | 269 | -spec safe_open(string()) -> {ok, binary()} | {error, atom()}. 270 | safe_open(FileName) -> 271 | case filelib:is_file(FileName) of 272 | true -> 273 | case file:read_file(FileName) of 274 | {ok, <<CRC32:32/integer, BinContent/binary>>} -> 275 | case erlang:crc32(BinContent) of 276 | CRC32 -> 277 | {ok, BinContent}; 278 | _ -> 279 | {error, crc_wonky} 280 | end; 281 | _ -> 282 | {error, no_crc} 283 | end; 284 | false -> 285 | {error, not_present} 286 | end. 287 | 288 | %%%============================================================================ 289 | %%% Test 290 | %%%============================================================================ 291 | 292 | flip_byte(Binary, Offset, Length) -> 293 | Byte1 = rand:uniform(Length) + Offset - 1, 294 | <<Pre:Byte1/binary, A:8/integer, Post/binary>> = Binary, 295 | case A of 296 | 0 -> 297 | <<Pre/binary, 255:8/integer, Post/binary>>; 298 | _ -> 299 | <<Pre/binary, 0:8/integer, Post/binary>> 300 | end. 301 | 302 | test_key_generator(hash) -> 303 | ValueFun = 304 | fun() -> 305 | V = rand:uniform(1000), 306 | <<Hash:32/integer, _Rest/binary>> = 307 | crypto:hash(md5, <<V:32/integer>>), 308 | Hash 309 | end, 310 | internal_generator(ValueFun); 311 | test_key_generator(v1) -> 312 | ValueFun = 313 | fun() -> 314 | Clock = [{rand:uniform(1000), rand:uniform(1000)}], 315 | BClock = term_to_binary(Clock), 316 | Size = rand:uniform(100000), 317 | SibCount = rand:uniform(3), 318 | <<Hash:32/integer, _Rest/binary>> = crypto:hash(md5, BClock), 319 | {Clock, Hash, Size, SibCount} 320 | end, 321 | internal_generator(ValueFun). 322 | 323 | internal_generator(ValueFun) -> 324 | fun(I) -> 325 | Key = <<"Key", I:32/integer>>, 326 | Value = ValueFun(), 327 | {Key, Value} 328 | end. 
329 | 330 | clean_subdir(DirPath) -> 331 | case filelib:is_dir(DirPath) of 332 | true -> 333 | {ok, Files} = file:list_dir(DirPath), 334 | lists:foreach( 335 | fun(FN) -> 336 | File = filename:join(DirPath, FN), 337 | io:format("Attempting deletion ~s~n", [File]), 338 | ok = 339 | case filelib:is_dir(File) of 340 | true -> 341 | clean_subdir(File), 342 | file:del_dir(File); 343 | false -> 344 | file:delete(File) 345 | end, 346 | io:format("Success deleting ~s~n", [File]) 347 | end, 348 | Files 349 | ); 350 | false -> 351 | ok 352 | end. 353 | 354 | -ifdef(TEST). 355 | 356 | -include_lib("eunit/include/eunit.hrl"). 357 | 358 | get_loglevel() -> 359 | element(2, leveled_log:get_opts()). 360 | 361 | set_loglevel_test() -> 362 | ?assertMatch(error, get_loglevel()), 363 | ok = set_loglevel([debug]), 364 | ?assertMatch(debug, get_loglevel()), 365 | ok = set_loglevel([nonsense]), 366 | ?assertMatch(info, get_loglevel()), 367 | ok = set_loglevel([warning, error, critical]), 368 | ?assertMatch(warning, get_loglevel()), 369 | ok = set_loglevel([nonsense, critical]), 370 | ?assertMatch(critical, get_loglevel()). 371 | 372 | get_segmentid(B, K) -> 373 | Seg32 = leveled_tictac:keyto_segment32(make_binarykey(B, K)), 374 | leveled_tictac:get_segment(Seg32, ?TREE_SIZE). 375 | 376 | flipbyte_test() -> 377 | Bin0 = <<0:256/integer>>, 378 | BinFB0 = flip_byte(Bin0, 0, 32), 379 | ?assertMatch(false, BinFB0 == Bin0), 380 | Bin1 = <<4294967295:32/integer>>, 381 | BinFB1 = flip_byte(Bin1, 1, 1), 382 | ?assertMatch(false, BinFB1 == Bin1). 383 | 384 | clen_empty_subdir_test() -> 385 | FakePath = "test/foobar99", 386 | ok = clean_subdir(FakePath). 387 | 388 | -endif. 
389 | -------------------------------------------------------------------------------- /docs/DESIGN.md: -------------------------------------------------------------------------------- 1 | # TicTac Tree - design 2 | 3 | ## Objective 4 | 5 | The purpose of the KV TicTac Tree is to be able to make comparisons of groups of data partitions within and between database clusters, and prompt repair should differences be found. 6 | 7 | ### Sample Scenario 8 | 9 | Consider two different data stores. 10 | 11 | One store (Store A) stores data split across two virtual nodes (A1 and A2), and each node has the data split into three different partitions (A1.1, A1.2, A1.3, and A2.1, A2.2, A2.3). 12 | 13 | - Store A 14 | - Vnode A1 15 | - Partition A1.1 16 | - Partition A1.2 17 | - Partition A1.3 18 | - Vnode A2 19 | - Partition A2.1 20 | - Partition A2.2 21 | - Partition A3.3 22 | 23 | A second store (Store B) stores data in one virtual node (B1), but within that node data is split evenly across 4 partitions (B1.1, B1.2, B1.3 and B1.4). 24 | 25 | - Store B 26 | - Vnode B1 27 | - Partition B1.1 28 | - Partition B1.2 29 | - Partition B1.3 30 | - Partition B1.4 31 | 32 | There are a number of different relationships with regards to the data ownership that we expect to be true within this system, for example: 33 | 34 | - union(A1.1, A1.2, A1.3) == union(A2.1, A2.2, A2.3) 35 | - union(A1.1, A1.2, A1.3) == union(B1.1, B1.2, B1.3, B1.4) 36 | - union(A2.1, A2.2, A2.3) == union(B1.1, B1.2, B1.3, B1.4) 37 | - A1.1 == A2.1 38 | - A1.2 == A2.2 39 | - A1.3 == A2.3 40 | 41 | ### Constraints 42 | 43 | The objective is to have a simple and efficient way of validating all these relationships subject to the following constraints and conditions: 44 | 45 | - The AAE system should not place a dependancy on how the vnodes store their data in the partitions. 
46 | 47 | - The AAE system should confirm that not only has all data reached each location but also that all data remains in that location, in particular entropy of persisted data must also be considered. 48 | 49 | - The AAE system should be always on and available to support as many comparisons as possible, management of anti-entropy should not require scheduling around downtime of AAE. 50 | 51 | - The AAE system should allow for throttling of exchanges and repairs so as not to overwhelm the system, especially when an alternative process may currently be managing an efficient repair (e.g. hinted handoff). 52 | 53 | - Any rebuild process (where state is refreshed to reflect current on-disk status in the main store), must be safe to throttle without impacting exchange availability. 54 | 55 | - It can be generally assumed that the vnode is aware of both the before and after state of objects subject to change (to inform the AAE process), but there may be exceptional circumstances (e.g. in Riak with LWW=true on a non-2i backend), where the AAE process itself may need to determine the before state. It may be that this process is less efficient as it is generally assumed that a system that cares enough about data loss to run anti-entropy, will also care enough to read before a write. 56 | 57 | The issue of how to handle timestamped objects with automatic background expiry is important, but is not currently thought through. 58 | 59 | ## Actors 60 | 61 | It is assumed that there are actors currently managing vnodes within the stores, and mechanisms for communicating within and between the vnodes in the stores, and determining membership relationships between partitions and vnodes. 62 | 63 | For each vnode an `aae_controller` will be started, with the controller requested to handle data for each partition supported by the vnode. The `aae_controller` will start an `aae_treecache` for each of the partitions, and a single `aae_keystore` for the vnode. 
64 | 65 | ### Controller 66 | 67 | The `aae_controller` is responsible for marshalling all requests from the vnode, and for checking that the keystore, treecache and vnode partition stores remain locally synchronised. It primarily receives the follow requests: 68 | 69 | - put 70 | - Make a change to a TreeCache and update the KeyStore to represent a vnode change. The put request should inform the controller of the current clock, the previous clock, and the partition reference. 71 | - merge_root/merge_branches 72 | - Return the merged root of the tree roots for a list of partitions, or the merged branches of a list of Branch IDs. 73 | - fetch_clocks 74 | - for a given set of leaf identifiers and partitions return all the keys and version clocks for the objects in the system (from the key store). 75 | - rebuild 76 | - prompt the store to rebuild from the vnode store all state. 77 | - open/close 78 | - Open and close, using a shutdown GUID reference on close (then open) to confirm if open and close events are known to have returned the data and the AAE system to a consistent state (the same shutdown GUID should be persisted in the vnode data store at shutdown). 79 | - fold_keystore 80 | - Allow a general fold over the keystore covering all of the store, or a list of buckets, with a function to apply to each key/metadata pair in the store. 81 | 82 | 83 | ### TreeCache 84 | 85 | The `aae_treecache` is responsible for an individual partition reference. The partition reference is expected to be a combination of {n_val, partition_id} - so in a cluster any given vnode will have as many partition references as the sum of the n_vals supported by the cluster. 86 | 87 | The tree cache is an in-memory tictac tree using the `leveled_tictac` module. Changes are made as they are received (via async message). 88 | 89 | The `aae_treecache` process can also be placed in a load mode. When in load mode, deltas are queued as well as being applied to the current cache. 
When ready, `complete_load` can be called with a TicTac Tree formed from a snapshot taken as part of the same unit of work when the load was initialised. At this stage, the original tree can be destroyed, and the queue of changes can be applied to the new tree. This process can be used by the `aae_controller` to refresh the tree from the Key Store, without ever having the tree cache go inactive. 90 | 91 | 92 | ### KeyStore 93 | 94 | The `aae_keystore` is a FSM that can be in three states: 95 | 96 | - `loading` 97 | - In the `loading` state store updates are PUT into the store, but queued for a second (replacement) store. The keystore can also receive load requests, which are only added into the replacement store. When the load is complete, the queued requests are put into the replacement store and the original store may be discarded. This allows the keystore to be rebuilt. 98 | - `parallel` 99 | - In the `parallel` state, a keystore is kept in parallel to the vnode store, to resolve any fold requests passed in. A `parallel` store may transition in and out of the `loading` state (back into the `parallel` state). 100 | - `native` 101 | - In the `native` state, no parallel store is kept, but a reference is kept by the `aae_keystore` process to the vnode backend, and queries are resolved by calling the actual vnode backend. This requires the vnode backend to support the same API as the parallel `aae_keystore` (and so would currently in riak need to be the leveled backend). There is no transition in and out of `loading` from the `native` state. 102 | 103 | There are two types of parallel stores currently supported (but adding other stores should be simple): 104 | 105 | - `leveled_so` (leveled backend but with a key that is ordered first by segment ID) 106 | - `leveled_ko` (leveled backend but ordered by the actual object {Bucket, Key} pair, but with accelerated scanning over a sublist of segment IDs). 
107 | 108 | 109 | ### Exchange 110 | 111 | The `aae_exchange` is a FSM used for managing a single anti-entropy exchange to find keys to repair based on comparison between two lists - the `blue` and `pink` lists. The lists for the comparison are a list of `[{SendFun, PartitionRefList}]` tuples, where the SendFun encapsulates a mechanism for reaching a given `aae_controller`, and the PartitionRefList is a list of Partition References which are required from that controller. 112 | 113 | The lists can have multiple items (e.g. require contact with multiple controllers), and request multiple partition references from each controller - which would be normal for comparing coverage plans. The lists do not need to be of equivalent dimensions between `blue` and `pink`. 114 | 115 | The FSM process will alternate between multiple 'checking' states and the special state `waiting_all_results`, until a 'checking' state reveals a full match. The 'checking' state are: 116 | 117 | - `root_compare` - fetch the tree roots and compare. 118 | - `root_confirm` - fetch the tree roots and compare, select the intersection of branch IDs from this first pass and the last pass to use at the next stage. 119 | - `branch_compare` - fetch the tree branches which differ in the root and compare. 120 | - `branch_confirm` - fetch the tree branches which differ in the root and compare, select the intersection of segment leaf IDs from the first pass and last pass to use at the next stages. 121 | - `clock_compare` - fetch the keys and clocks associated with the segment leaf IDs and compare - passing any deltas to a RepairFun provided by the calling process to repair. 122 | 123 | The exchange is throttled in two ways. Firstly, there is a jittered pause between each state transition. Secondly, the number of IDs (branch or segment leaf IDs) that can be passed from a confirm state is limited. This will increase the number of iterations required to fill-in an entirely diverse vnode. 
The RepairFun that makes the repair is passed-in, and may apply its own throttle, but the `aae_exchange` will not explicitly throttle the repairs. 124 | 125 | 126 | ## Notes on Riak Implementation 127 | 128 | Although the AAE library is intended to be generic, it is primarily focused on being a new AAE mechanism for Riak. Some notes on how this should be implemented within Riak, and functionality that can be expected. 129 | 130 | ### Transition 131 | 132 | Transition between AAE releases is hard (as demonstrated by the difficulties of the hash algorithm change from legacy to version 0 in the existing riak_kv_index_hashtree implementation). The intention is to allow this AAE to be a plugin over and above the existing AAE implementation, making transition an administrative task: the tictac tree AAE can be run in Riak oblivious to whether existing AAE versions are running. 133 | 134 | ### Startup, Shutdown and Synchronisation 135 | 136 | The `riak_kv_vnode` is expected to be responsible for stopping and starting the `aae_controller` should this version of AAE be implemented. The `aae_controller` should only be started after the vnode backend has been started, but before the vnode is marked as ready. The trees, parallel keystore (in parallel mode) and vnode store may at this stage be out of sync, if the vnode had not previously shut down cleanly. Whilst stores are out of sync, they will still operate but return false negative results: however, false negative results will prompt incremental repair of the synchronisation issue. Incremental repair of a parallel keystore is done using the per-vnode rehash. Incremental repair of the trees is done through a rehashing of the segments undertaken as part of the `aae_controller:aae_fetchclocks/5`. 137 | 138 | If the `aae_treecache` was not shutdown correctly, then the whole cache may be incorrect (e.g. empty). This would take a long time to incrementally repair, and so this scenario is detected and flagged at startup. 
It is therefore recommended at vnode startup, that the `aae_controller:aae_rebuildtrees/5` be called with the `OnlyIfBroken = true`. This will return `skipped` if the treecache appeared to have been recovered successfully and not rebuild, but will rebuild if a potential issue with any of the tree_caches had been flagged at startup. 139 | 140 | Whilst the stores are potentially out of sync, then the controller should operate as normal - this will potentially lead to false repairs until the rebuild is complete. If to an administrator, the possibility of non-synchronisation is a known possibility, such as when a node is restarting following a hard crash - then the [participate in coverage](https://github.com/basho/riak_core/pull/917) feature can be used to remove the node's vnodes from any coverage plan based AAE exchanges. 141 | 142 | There exists the potential for further improvements of vnode store to aae coordination, should the aae store be used for additional functional reasons in the future. 143 | 144 | ### Intra-Cluster AAE 145 | 146 | The `aae_exchange` is flexible so that intra-cluster AAE can be done pairwise or between coverage offsets. If we have a ring size of 128, and a single n-val of 3, there are 384 pairwise exchanges. So an entropy_manager could be elected in the cluster which rotates around those pairwise exchanges. 147 | 148 | It would be quicker to just perform the 3 comparisons necessary to rotate around the 3 coverage plans (with the 3 different offsets), and compare those coverage plans. However, in the scenario where a single 149 | 150 | ### AAE Cluster Full-Sync 151 | 152 | .... 153 | 154 | ### MapFold Changes - Backend Independent 155 | 156 | Previously there had been some work down to add [MapFold](https://github.com/martinsumner/riak_kv/blob/mas-2.1.7-foldobjects/docs/MAPFOLD.md) as a feature to Riak. 
This is in some ways an alternative to the work done by Basho on riak_kv_sweeper - there is a generic need to have functions that fold over objects, that produce outputs that aren't required immediately. This is especially true for operational reasons e.g.: 157 | 158 | - find all sibling'd objects; 159 | - count the number of objects in a bucket; 160 | - what is the average object size in the database; 161 | - provide a histogram of last modified dates on objects in a bucket. 162 | 163 | There may also be functional reasons whereby we might want to have non-disruptive folds with bespoke functions and accumulators - especially for reporting (e.g. count all the people by age-range and gender), that currently require a secondary index and for all 2i terms to be fed back to the application for processing, with the application needing to control throttling of the query. 164 | 165 | Riak previously had Map/Reduce which could answer these problems, but Map/Reduce was designed to be fast. It was controlled in the sense it had back pressure to prevent the reading of data from overwhelming the processing of that data - but it was not controlled to prevent a Map/Reduce workload from overloading standard K/V GET/PUT activity. Also Map/Reduce required the reading of the whole object, so didn't offer any optimisation if the interesting information was on a 2i term or in object metadata. 166 | 167 | The Mapfold work provided a solution to this, but to be efficient it depended on 168 | the backend supporting secondary indexes and/or fold_heads. The Mapfold work was optimised for the leveled backend, but left other backends behind. 169 | 170 | One side effect of kv_index_tictactree is that it provides a parallel store (when leveled is not used), that can still be key-ordered. The metadata that gets put into that parallel store could be extended to include the full object head. 
So the same queries that work with a native leveled backend, will work with a parallel AAE leveled key-ordered backend. Potentially this would mean that MapFold could be supported efficiently with any backend where AAE has been enabled. 171 | 172 | ### Per-Bucket MDC Replication 173 | 174 | ... 175 | 176 | ### Bitcask and HEAD requests 177 | 178 | .... 179 | 180 | ### Bitcask and 2i Support 181 | 182 | .... 183 | 184 | ### Improved Vnode Synchronisation on Abrupt Shutdown 185 | 186 | .... 187 | 188 | ### Backup Use-case 189 | 190 | Backups in Riak are hard. Hard for good reasons: 191 | 192 | - the difficulty of co-ordinating a snapshot in a point in time across many processes on many machines; 193 | - the volume of data traditionally used by people who need a distributed database; 194 | - the inherent duplication of data in Riak; 195 | - the write amplification in some Riak backends (leveldb) increasing the cost of any rsync based mechanism for backup. 196 | 197 | Historically different approaches have been tried, and ultimately most Riak systems either end up running without historic backups (just MDC replication), or with a bespoke backup approach integrated into either the database and/or the application. 198 | 199 | One possibility is to be able to run a very small cluster with dense storage machines, in a backup configuration: e.g. node count of 1, ring size of 8, n/r/w-val of 1, vnode backend rsync friendly (leveled/bitcask) with 2i disabled. If we can now replicate from a production scale cluster to this (using rabl for real time-replication so that peak load is queued), then stopping this single node cluster and running rsync periodically could produce a more traditional backup approach without impeding on decision making wrt production database setup (e.g. ring size, n-val and write-amplification and query support in the backend). 
200 | 201 | The combination of repl replication, and AAE full-sync independent of ring-size and n-val might make such a solution possible without bespoke application effort. 202 | 203 | ### AAE for 2i Terms 204 | 205 | .... 206 | 207 | ### 2i Repair 208 | 209 | .... 210 | 211 | ### Rehash Support - Consideration for W1 Misuse 212 | 213 | .... 214 | 215 | ### Support for LWW on Bitcask 216 | 217 | .... 218 | -------------------------------------------------------------------------------- /test/property/aae_eqc.erl: -------------------------------------------------------------------------------- 1 | %%% @author Thomas Arts 2 | %%% @copyright (C) 2019, Thomas Arts 3 | %%% @doc 4 | %%% 5 | %%% @end 6 | %%% Created : 5 Feb 2019 by Thomas Arts 7 | 8 | -module(aae_eqc). 9 | 10 | -ifdef(EQC). 11 | -include_lib("eqc/include/eqc.hrl"). 12 | -include_lib("eqc/include/eqc_statem.hrl"). 13 | -include_lib("eunit/include/eunit.hrl"). 14 | 15 | -compile([export_all, nowarn_export_all]). 16 | -compile({nowarn_deprecated_function, [{erlang, now, 0}]}). 17 | 18 | -define(LOG_LEVELS, [error, critical]). 19 | -define(EXCHANGE_PAUSE_MS, 10). 20 | 21 | -define(NUMTESTS, 1000). 22 | -define(QC_OUT(P), 23 | eqc:on_output(fun(Str, Args) -> 24 | io:format(user, Str, Args) end, P)). 25 | 26 | 27 | eqc_test_() -> 28 | {timeout, 120, 29 | ?_assertEqual(true, 30 | eqc:quickcheck(eqc:testing_time(60, ?QC_OUT(prop_aae()))))}. 31 | 32 | run() -> 33 | run(?NUMTESTS). 34 | 35 | run(Count) -> 36 | eqc:quickcheck(eqc:numtests(Count, prop_aae())). 37 | 38 | check() -> 39 | eqc:check(prop_aae()). 40 | 41 | 42 | %% -- State and state functions ---------------------------------------------- 43 | initial_state() -> 44 | #{aae_controllers => 45 | [{"a", #{store => []}}, 46 | {"b", #{store => []}}], %% list of controllers, each unique map 47 | history => 48 | [] %% {Bucket, Key, VClock, LastModified} 49 | }. 
50 | 51 | %% -- Generators ------------------------------------------------------------- 52 | 53 | pos() -> 54 | ?LET(N, nat(), N+1). 55 | 56 | timestamp(_Obj) -> 57 | 1. 58 | 59 | gen_vclock() -> 60 | ?LET(Names, non_empty(sublist(names())), 61 | [ {Name, nat()} || Name <- Names]). 62 | 63 | gen_vclock(VClockGen) -> 64 | ?LET(VClock, VClockGen, 65 | ?LET({{K, C}, P}, {elements(VClock), pos()}, 66 | lists:keyreplace(K, 1, VClock, {K, C + P}))). 67 | 68 | names() -> 69 | [a, b, c, d, e, f]. 70 | 71 | %% Cannot be atoms! 72 | %% key() type specified: should be binary(). 73 | gen_bucket() -> 74 | elements([<<"bucket1">>, <<"bucket2">>, <<"bucket3">>]). 75 | 76 | gen_key() -> 77 | binary(16). 78 | 79 | gen_bkcm(S) -> 80 | ?LET({B, K}, frequency([{length(maps:get(history, S, [])), ?LAZY(elements([F || {F, _, _} <-maps:get(history, S)]))}, 81 | {10, {gen_bucket(), gen_key()}}]), 82 | case lists:keyfind({B, K}, 1, maps:get(history, S, [])) of 83 | false -> 84 | {B, K, none, gen_vclock(), gen_last_modified()}; 85 | {_, PrevClock, _LastModifed} -> 86 | {B, K, undefined, gen_vclock(PrevClock), gen_last_modified()} 87 | end). 88 | 89 | gen_last_modified() -> 90 | [{1549, choose(448000, 448100), 0}]. 91 | 92 | 93 | %% generate a new store 94 | gen_store([], Store2) -> 95 | Store2; 96 | gen_store([{{B, K}, C1, LM1} | Store1], Store2) -> 97 | case lists:keyfind({B, K}, 1, Store2) of 98 | false -> 99 | [ {{B, K}, C1, LM1} | gen_store(Store1, Store2) ]; 100 | {_, C2, _} -> 101 | [ {{B, K}, gen_vclock(elements([C1, C2])), gen_last_modified()} | 102 | gen_store(Store1, lists:keydelete({B,K}, 1, Store2))] 103 | end. 104 | 105 | 106 | %% -- Common pre-/post-conditions -------------------------------------------- 107 | command_precondition_common(_S, _Cmd) -> 108 | true. 109 | 110 | precondition_common(_S, _Call) -> 111 | true. 112 | 113 | postcondition_common(_S, _Call, _Res) -> 114 | true. 
115 | 116 | %% -- Operations ------------------------------------------------------------- 117 | 118 | object_split(Object) -> 119 | {_Size, _SiblingCount, _IndexHash, _LastMod, _UserData} = binary_to_term(Object). 120 | 121 | %% --- Operation: init --- 122 | start_pre(S) -> 123 | unstarted_controllers(S) =/= []. 124 | 125 | start_args(S) -> 126 | ?LET({Path, M}, elements(unstarted_controllers(S)), 127 | [ Path, 128 | {parallel, leveled_ko}, 129 | maps:get(store, M, []) == [], 130 | elements([{1, 1}, {0, 3600}]), %% if hours is set to 1 it means we cannot trigger a rebuild in a test 131 | [{0, 3}, {1, 3}, {2,3}], %% behaviour is not different for less 132 | {var, dir} 133 | ]). 134 | 135 | start_pre(S, [Path, _KeyStoreType, _IsEmpty, _RebuildSchedule, _PrefLists, _RootPath]) -> 136 | Controllers = maps:get(aae_controllers, S, []), 137 | case lists:keyfind(Path, 1, Controllers) of 138 | false -> 139 | %% Controller has not been started yet 140 | true; 141 | {_, M} -> 142 | %% Check whether the controller is already started 143 | not maps:is_key(aae_controller, M) 144 | end. 145 | 146 | start(Path, KeyStoreType, IsEmpty, RebuildSchedule, PrefLists, RootPath) -> 147 | case catch aae_controller:aae_start(KeyStoreType, IsEmpty, RebuildSchedule, PrefLists, 148 | filename:join(RootPath, Path), 149 | fun object_split/1, 150 | ?LOG_LEVELS) of 151 | {ok, Pid} -> Pid; 152 | Other -> Other 153 | end. 154 | 155 | start_next(S, Value, [Path, _KeyStoreType, IsEmpty, _RebuildSchedule, PrefLists, _RootPath]) -> 156 | Controllers = maps:get(aae_controllers, S), 157 | {_, Map} = lists:keyfind(Path, 1, Controllers), 158 | RebuildIsDue = (not IsEmpty andalso maps:get(store, Map, []) == []), 159 | S#{aae_controllers => 160 | lists:keyreplace(Path, 1, Controllers, {Path, Map#{aae_controller => Value, 161 | rebuild_due => RebuildIsDue, 162 | preflists => PrefLists}})}. 163 | 164 | start_post(_S, _Args, Res) -> 165 | is_pid(Res). 
166 | 167 | start_features(_S, [_Path, _KeyStoreType, IsEmpty, RebuildSchedule, _PrefLists, _RootPath], _Res) -> 168 | [ {start, {schedule, RebuildSchedule}}, {start, {is_empty, IsEmpty}} ]. 169 | 170 | 171 | %% --- Operation: stop --- 172 | stop_pre(S) -> 173 | started_controllers(S) =/= []. 174 | 175 | stop_args(S) -> 176 | ?LET({Path, M}, elements(started_controllers(S)), 177 | [Path, maps:get(aae_controller, M)]). 178 | 179 | stop_pre(S, [Path, Pid]) -> 180 | {_, M} = lists:keyfind(Path, 1, maps:get(aae_controllers, S)), 181 | Pid == maps:get(aae_controller, M). %% for shrinking 182 | 183 | stop(_, Pid) -> 184 | catch aae_controller:aae_close(Pid). 185 | 186 | stop_next(S, _Value, [Path, _Pid]) -> 187 | Controllers = maps:get(aae_controllers, S), 188 | {_, M} = lists:keyfind(Path, 1, Controllers), 189 | S#{aae_controllers => 190 | lists:keyreplace(Path, 1, Controllers, {Path, maps:without([aae_controller], M)})}. 191 | 192 | stop_post(_S, [_, _Pid], Res) -> 193 | eq(Res, ok). 194 | 195 | %% --- Operation: next_rebuild --- 196 | nextrebuild_pre(S) -> 197 | started_controllers(S) =/= []. 198 | 199 | nextrebuild_args(S) -> 200 | ?LET({Path, M}, elements(started_controllers(S)), 201 | [Path, maps:get(aae_controller, M)]). 202 | 203 | nextrebuild_pre(S, [Path, Pid]) -> 204 | Controllers = maps:get(aae_controllers, S), 205 | {_, M} = lists:keyfind(Path, 1, Controllers), 206 | Pid == maps:get(aae_controller, M). %% for shrinking 207 | 208 | %% If we expected to be due, it should be due. 209 | nextrebuild(_, Pid) -> 210 | TS = aae_controller:aae_nextrebuild(Pid), 211 | os:timestamp() > TS. 212 | 213 | nextrebuild_post(S, [Path, _Pid], Res) -> 214 | Controllers = maps:get(aae_controllers, S), 215 | {_, M} = lists:keyfind(Path, 1, Controllers), 216 | not maps:get(rebuild_due, M) orelse Res. 217 | 218 | 219 | nextrebuild_features(_S, [_, _Pid], Res) -> 220 | [ {nextrebuild, Res} ]. 
221 | 222 | 223 | %%--- Operation: put --- 224 | put_pre(S) -> 225 | started_controllers(S) =/= []. 226 | 227 | put_args(S) -> 228 | ?LET({{Path, M}, {B, K, PClock, VClock, LastMod}}, {elements(started_controllers(S)), gen_bkcm(S)}, 229 | [Path, maps:get(aae_controller, M), 230 | maps:get(preflists, M), B, K, VClock, PClock, {pos(), pos(), 0, LastMod, []}]). 231 | 232 | put_pre(_S, [_Path, _Pid, _PrefLists, _Bucket, _Key, _CurrentClock, _PrevClock, _MetaData]) -> 233 | true. 234 | 235 | put(_Path, Pid, PrefLists, Bucket, Key, CurrentClock, PrevClock, MetaData) -> 236 | PrefList = lists:nth((erlang:phash2({Bucket, Key}) rem length(PrefLists)) + 1, PrefLists), 237 | aae_controller:aae_put(Pid, PrefList, Bucket, Key, CurrentClock, PrevClock, term_to_binary(MetaData)). 238 | 239 | put_next(S, _Value, [Path, _Pid, _PrefLists, Bucket, Key, CurrentClock, _PrevClock, {_, _, _, LastMod, _}]) -> 240 | Controllers = maps:get(aae_controllers, S), 241 | {_, M} = lists:keyfind(Path, 1, Controllers), 242 | S#{aae_controllers => 243 | lists:keyreplace(Path, 1, Controllers, 244 | {Path, M#{store => 245 | [ {{B, K}, C, L} || {{B, K}, C, L} <- maps:get(store, M), {Bucket, Key} =/= {B, K}] ++ 246 | [ {{Bucket, Key}, CurrentClock, LastMod} ] 247 | }}), 248 | history => 249 | maps:get(history, S, []) ++ [{{Bucket, Key}, CurrentClock, LastMod}] 250 | }. 251 | 252 | 253 | put_post(_S, [_Path, _Pid, _PrefLists, _Bucket, _Key, _CurrentClock, _PrevClock, _MetaData], Res) -> 254 | eq(Res, ok). 255 | 256 | put_features(_S, [_Path, _Pid, _PrefLists, _Bucket, _Key, _CurrentClock, PrevClock, _MetaData], _Res) -> 257 | [ {put, PrevClock} ]. 258 | 259 | 260 | %% --- Operation: exchange --- 261 | exchange_pre(S) -> 262 | length(started_controllers(S)) >= 2. 
263 | 264 | exchange_args(S) -> 265 | Controllers = started_controllers(S), 266 | ?LET({Path1, M1}, elements(Controllers), 267 | ?LET({Path2, M2}, elements(Controllers), %% possibly minus the already selected one 268 | [ Path1, Path2, 269 | [maps:get(aae_controller, M1), maps:get(preflists, M1)], %% BlueList 270 | [maps:get(aae_controller, M2), maps:get(preflists, M2)] %% PinkList 271 | ])). 272 | 273 | exchange_pre(S, [Path1, Path2, _Blue, _Pink]) -> 274 | lists:keymember(Path1, 1, started_controllers(S)) andalso 275 | lists:keymember(Path2, 1, started_controllers(S)). 276 | 277 | exchange(_, _, [BluePid, BluePrefLists], [PinkPid, PinkPrefLists]) -> 278 | BlueList = [{testutil:exchange_sendfun(BluePid), BluePrefLists}], 279 | PinkList = [{testutil:exchange_sendfun(PinkPid), PinkPrefLists}], 280 | QuickCheck = self(), 281 | {ok, Pid, _UUID} = aae_exchange:start(full, BlueList, PinkList, 282 | fun(KeyList) -> QuickCheck ! {self(), repair, KeyList} end, %% do not repair at all 283 | fun(Result) -> QuickCheck ! {self(), reply, Result} end, 284 | none, 285 | [{transition_pause_ms, ?EXCHANGE_PAUSE_MS}, 286 | {log_levels, ?LOG_LEVELS}]), 287 | receive 288 | {Pid, reply, {root_compare, 0}} -> 289 | {root_compare, 0}; 290 | {Pid, reply, Other} -> 291 | receive 292 | {Pid, repair, KeyList} -> 293 | {repair, Other, KeyList} 294 | after 5000 -> timeout 295 | end 296 | after 5000 -> timeout 297 | end. 
298 | 299 | exchange_post(S, [Path1, Path2, _Blue, _Pink], Res) -> 300 | {_, M1} = lists:keyfind(Path1, 1, maps:get(aae_controllers, S, [])), 301 | {_, M2} = lists:keyfind(Path2, 1, maps:get(aae_controllers, S, [])), 302 | BlueStore = lists:usort(maps:get(store, M1, [])), 303 | PinkStore = lists:usort(maps:get(store, M2, [])), 304 | MatchBlueFun = 305 | fun({{B, K}, C, _L}, Acc) -> 306 | case lists:keyfind({B, K}, 1, PinkStore) of 307 | false -> 308 | [{{B, K}, {C, none}}|Acc]; 309 | {{B, K}, C, _} -> 310 | Acc; 311 | {{B, K}, NC, _} -> 312 | [{{B, K}, {C, NC}}|Acc] 313 | end 314 | end, 315 | MatchPinkFun = 316 | fun({{B, K}, C, _L}, Acc) -> 317 | case lists:keyfind({B, K}, 1, BlueStore) of 318 | false -> 319 | [{{B, K}, {none, C}}|Acc]; 320 | _ -> 321 | Acc 322 | end 323 | end, 324 | Acc0 = lists:foldl(MatchBlueFun, [], BlueStore), 325 | Expected = lists:usort(lists:foldl(MatchPinkFun, Acc0, PinkStore)), 326 | case Res of 327 | {root_compare, 0} -> 328 | eq(0, length(Expected)); 329 | {repair, {clock_compare, N}, KeyList} -> 330 | N == length(KeyList) 331 | andalso eq(lists:sort(KeyList), Expected); 332 | _ -> 333 | eq(Res, Expected) %% will print the difference 334 | end. 335 | 336 | exchange_features(_S, [_Path1, _Path2, _Blue, _Pink], Res) -> 337 | case Res of 338 | {root_compare, 0} -> 339 | root_compare; 340 | {repair, {clock_compare, N}, _KeyList} -> 341 | {clock_compare, N}; 342 | _ -> 343 | Res 344 | end. 345 | 346 | 347 | 348 | %% --- Operation: sync --- 349 | sync_pre(S) -> 350 | length(started_controllers(S)) >= 2. 351 | 352 | sync_args(S) -> 353 | Controllers = started_controllers(S), 354 | ?LET({Path1, M1}, elements(Controllers), 355 | ?LET({Path2, M2}, elements(Controllers -- [{Path1, M1}]), 356 | [ Path1, Path2, 357 | maps:get(preflists, M1), maps:get(preflists, M2), 358 | maps:get(aae_controller, M1), maps:get(aae_controller, M2), 359 | gen_store(maps:get(store, M1), maps:get(store, M2)) ])). 
%% Precondition (re-checked during shrinking): both controllers must still be
%% started in the model state.
sync_pre(S, [Path1, Path2, _, _, _, _, _Store]) ->
    lists:keymember(Path1, 1, started_controllers(S)) andalso
        lists:keymember(Path2, 1, started_controllers(S)).

%% Write every entry of the generated store to BOTH controllers, so their
%% contents become identical.  Note: put/8 here is this module's own helper
%% (the model 'put' command), not erlang:put.
sync(_Path1, _Path2, _PrefLists1, _PrefLists2, _Pid1, _Pid2, []) ->
    ok;
sync(Path1, Path2, PrefLists1, PrefLists2, Pid1, Pid2, [{{B, K}, VC, LastMod}|Store]) ->
    %% TODO: add meta data to the state and extract it again
    put(Path1, Pid1, PrefLists1, B, K, VC, undefined, {1, 1, 0, LastMod, []}),
    put(Path2, Pid2, PrefLists2, B, K, VC, undefined, {1, 1, 0, LastMod, []}),
    sync(Path1, Path2, PrefLists1, PrefLists2, Pid1, Pid2, Store).

%% State transition: both controllers' model stores become the synced Store,
%% and the written entries are appended to the global history.
sync_next(S, _Value, [Path1, Path2, _PrefLists1, _PrefLists2, _Pid1, _Pid2, Store]) ->
    Controllers = maps:get(aae_controllers, S),
    {_, M1} = lists:keyfind(Path1, 1, Controllers),
    {_, M2} = lists:keyfind(Path2, 1, Controllers),
    S#{aae_controllers =>
           lists:keyreplace(Path1, 1,
               lists:keyreplace(Path2, 1, Controllers,
                   {Path2, M2#{store => Store}}),
               {Path1, M1#{store => Store}}),
       history =>
           maps:get(history, S, []) ++ Store
      }.

%% --- ... more operations

%% -- Property ---------------------------------------------------------------
%% Main property: run generated command sequences against a clean ./aae_data
%% directory, close any controllers left running, then aggregate collected
%% features and check all postconditions held (Res == ok).
prop_aae() ->
    Dir = "./aae_data",
    eqc:dont_print_counterexample(
    ?FORALL(Cmds, commands(?MODULE),
    begin
        %% Start from a clean data directory for every run
        os:cmd("rm -rf " ++ Dir),
        {H, S, Res} = run_commands(Cmds, [{dir, Dir}]),
        %% Tidy up: close whatever controllers the run left started
        [ aae_controller:aae_close(maps:get(aae_controller, M)) || {_, M} <- started_controllers(S) ],
        CallFeatures = call_features(H),
        check_command_names(Cmds,
            measure(length, commands_length(Cmds),
                aggregate(with_title('Features'), CallFeatures,
                    aggregate_feats(all_command_names(), CallFeatures,
                        features(CallFeatures,
                            pretty_commands(?MODULE, Cmds, {H, S, Res},
                                Res == ok))))))
    end)).
%% Aggregate collected features into per-kind buckets for reporting: 'atoms'
%% collects all bare-atom features; any other Kind collects {Kind, Arg}
%% tuples under that title.  Remaining features are passed down the chain.
aggregate_feats([], _, Prop) -> Prop;
aggregate_feats([atoms | Kinds], Features, Prop) ->
    {Atoms, Rest} = lists:partition(fun is_atom/1, Features),
    aggregate(with_title(atoms), Atoms, aggregate_feats(Kinds, Rest, Prop));
aggregate_feats([Tag | Kinds], Features, Prop) ->
    {Tuples, Rest} = lists:partition(fun(X) -> is_tuple(X) andalso element(1, X) == Tag end, Features),
    aggregate(with_title(Tag), [ Arg || {_, Arg} <- Tuples ], aggregate_feats(Kinds, Rest, Prop)).

%% Convenience entry points for eqc's more_bugs loop: hunt for (up to 20)
%% distinct failing cases, default budget 10 seconds.
bugs() -> bugs(10).

bugs(N) -> bugs(N, []).

bugs(Time, Bugs) ->
    more_bugs(eqc:testing_time(Time, prop_aae()), 20, Bugs).

%%% ---- state functions

%% Controllers present in the model but without a live aae_controller pid.
unstarted_controllers(S) ->
    Controllers = maps:get(aae_controllers, S, []),
    lists:filter(fun({_, M}) -> not maps:is_key(aae_controller, M) end, Controllers).

%% Controllers in the model that currently hold a live aae_controller pid.
started_controllers(S) ->
    Controllers = maps:get(aae_controllers, S, []),
    lists:filter(fun({_, M}) -> maps:is_key(aae_controller, M) end, Controllers).

-endif.
-------------------------------------------------------------------------------- /test/end_to_end/basic_SUITE.erl: --------------------------------------------------------------------------------
%% Common Test suite covering basic controller start/compare/exchange paths.
-module(basic_SUITE).
-include_lib("common_test/include/ct.hrl").
-export([all/0, init_per_suite/1, end_per_suite/1]).
-export([
    dual_store_compare_medium_so/1,
    dual_store_compare_medium_ko/1,
    dual_store_compare_large_so/1,
    dual_store_compare_large_ko/1,
    store_notsupported/1,
    get_set_rebuild_schedule/1,
    get_set_storeheads/1,
    get_set_nextrebuild/1,
    splitfun_compare_functions/1
]).
%% Test cases run by this suite.
all() ->
    [
        dual_store_compare_medium_so,
        dual_store_compare_medium_ko,
        dual_store_compare_large_so,
        dual_store_compare_large_ko,
        store_notsupported,
        get_set_rebuild_schedule,
        get_set_storeheads,
        get_set_nextrebuild,
        splitfun_compare_functions
    ].

%% NOTE(review): the return value of testutil:init_per_suite/1 is discarded
%% and the original Config is returned - confirm testutil does not add
%% entries the test cases need.
init_per_suite(Config) ->
    testutil:init_per_suite([{suite, "basic"} | Config]),
    Config.

end_per_suite(Config) ->
    testutil:end_per_suite(Config).

%% Start a controller with rebuild schedule RS0 and check the schedule can be
%% read back and updated component-by-component.
get_set_rebuild_schedule(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    SplitF = fun(_) -> {_SomeSensibleSize = 42, 1, 0, undefined, <<>>} end,
    RS0 = {1, 300},

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            RS0,
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF
        ),

    ok = test_rebuild_schedule(Cntrl, RS0),

    aae_controller:aae_close(Cntrl),
    testutil:reset_filestructure().

%% Get/set round-trips on the rebuild schedule; pattern-match '=' asserts
%% each read-back value (a mismatch crashes the test).
test_rebuild_schedule(Cntrl, RS0) ->
    RS1 = {RS1a, RS1b} = aae_controller:aae_get_rebuild_schedule(Cntrl),
    RS1 = RS0,
    %% bump second element only
    ok = aae_controller:aae_set_rebuild_schedule(Cntrl, {RS1a, RS1b + 1}),
    {RS2a, RS2b} = aae_controller:aae_get_rebuild_schedule(Cntrl),
    RS1a = RS2a,
    RS2b = RS1b + 1,
    %% bump first element only (second reset to original)
    ok = aae_controller:aae_set_rebuild_schedule(Cntrl, {RS1a + 1, RS1b}),
    {RS3a, RS3b} = aae_controller:aae_get_rebuild_schedule(Cntrl),
    RS3a = RS1a + 1,
    RS1b = RS3b,
    ok.
%% Check that prompting a next-rebuild in ~10s moves the rebuild timestamp,
%% that the progress report agrees, and that the new time is 9-11s away.
get_set_nextrebuild(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    %% NOTE(review): this split fun returns a 4-tuple where other tests in
    %% this suite return a 5-tuple - confirm against aae_controller's
    %% expected split-fun contract.
    SplitF = fun(_) -> {42, 1, 0, null} end,

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF
        ),

    NextRebuild0 = aae_controller:aae_nextrebuild(Cntrl),
    Now = os:timestamp(),
    ok = aae_controller:aae_prompt_nextrebuild(Cntrl, 10),
    NextRebuild1 = aae_controller:aae_nextrebuild(Cntrl),
    true = (NextRebuild1 /= NextRebuild0),
    %% Progress report must reflect the same next_rebuild time
    Report = aae_controller:aae_produce_progress_report(Cntrl),
    NextRebuild1Reported = proplists:get_value(next_rebuild, Report),
    NextRebuild1Reported = NextRebuild1,
    %% now_diff is in microseconds; expect roughly 10 seconds ahead
    ApproxTenSec = timer:now_diff(NextRebuild1, Now) div 1000000,
    true = (ApproxTenSec > 9),
    true = (ApproxTenSec < 11),

    aae_controller:aae_close(Cntrl),
    testutil:reset_filestructure().

%% Key counts used by get_set_storeheads below.
-define(NKEYS, 15).
-define(NKEYS_IN_RANGE, 9).
-define(NKEYS_UPDATED, 5).
%% Updated keys report 1 sibling, untouched in-range keys still report 2.
-define(NEW_SIBLING_COUNT,
    (?NKEYS_IN_RANGE + (?NKEYS_IN_RANGE - ?NKEYS_UPDATED))
).
%% Check that swapping the object split function at runtime changes how
%% sibling counts are derived: existing entries are unchanged until a key is
%% re-put, at which point the new split function applies.
get_set_storeheads(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath = filename:join(RootPath, "vnode1/"),
    Preflist = [{2, 0}, {2, 1}],

    %% "on" reports 1 sibling per object, "off" reports 2
    StoreheadsOnSplitF = fun(_) ->
        {_SomeSensibleSize = 42, 1, 0, undefined, <<>>}
    end,
    StoreheadsOffSplitF = fun(_) ->
        {42, _DoubleSiblingCount = 2, 0, undefined, <<>>}
    end,

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            Preflist,
            VnodePath,
            StoreheadsOffSplitF,
            %% log levels
            [info, warn, error, critical]
        ),

    Bucket = <<"b1">>,
    BKVList = testutil:gen_keys([], ?NKEYS, Bucket),
    {BKVList1, _} = lists:split(?NKEYS_UPDATED, BKVList),
    ok = testutil:put_keys(Cntrl, 2, BKVList, none),
    ct:print("put ~b keys: ~p\n", [?NKEYS, BKVList]),

    StartKey = list_to_binary(string:right(integer_to_list(0), 6, $0)),
    EndKey = list_to_binary(string:right(integer_to_list(10), 6, $0)),

    %% there be 9*2 siblings in the range
    SCFolder0 = key_range_folder(Cntrl, Bucket, StartKey, EndKey),

    %% test query
    SCF0 = SCFolder0(),
    ct:print(
        "storeheads is initially off: test query should return ~b siblings:\n~b indeed\n",
        [?NKEYS_IN_RANGE * 2, element(2, SCF0)]
    ),
    ?NKEYS_IN_RANGE * 2 = element(2, SCF0),

    %% update split_function
    ok = aae_controller:aae_set_object_splitfun(Cntrl, StoreheadsOnSplitF),
    ct:print("storeheads now set to on\n"),

    %% test query: no change in output - split fun only applies on put
    SCFolder1 = key_range_folder(Cntrl, Bucket, StartKey, EndKey),
    SCF1 = SCFolder1(),
    ct:print(
        "after setting storeheads to on, expect no change in query output:\n"
        "number of siblings returned is still ~b\n",
        [element(2, SCF1)]
    ),
    ?NKEYS_IN_RANGE * 2 = element(2, SCF1),
    true = (SCF0 == SCF1),

    %% update some objects
    %% NOTE(review): the value-update expression here was mangled in the
    %% source dump ("{<>, C}", leaving V unbound-unused); reconstructed as a
    %% binary append - any changed value exercises the same path, but
    %% confirm against the upstream repository.
    BKVList1Updated =
        [
            {B, K, [{<<V/binary, "_updated">>, C}]}
         || {B, K, [{V, C}]} <- BKVList1
        ],
    ok = testutil:put_keys(Cntrl, 2, BKVList1Updated, none),
    ct:print("update ~b objects\n", [?NKEYS_UPDATED]),

    %% test query to show partial change: updated keys now count 1 sibling
    SCFolder2 = key_range_folder(Cntrl, Bucket, StartKey, EndKey),
    SCF2 = SCFolder2(),
    ct:print(
        "after updating, there should be a partial change (minus ~b siblings). Query returns ~b siblings:\n",
        [?NKEYS_UPDATED, element(2, SCF2)]
    ),
    true = (SCF0 /= SCF2),
    ?NEW_SIBLING_COUNT = element(2, SCF2),

    aae_controller:aae_close(Cntrl),
    RootPath = testutil:reset_filestructure().

%% Check that two logically identical split functions, wrapped the same way,
%% compare equal - so get-after-set round-trips through the controller.
splitfun_compare_functions(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath = filename:join(RootPath, "vnode1/"),
    Preflist = [{2, 0}],

    SplitF_1 = mock_aae_from_object_binary_for_storeheads(true),
    SplitF_2 = mock_aae_from_object_binary_for_storeheads(false),

    {ok, Cntrl} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            Preflist,
            VnodePath,
            SplitF_1,
            %% log levels
            [info, warn, error, critical]
        ),

    %% this is essentially to test that two logically identical functions
    %% created separately, do indeed compare equal
    true =
        (aae_controller:wrapped_splitobjfun(SplitF_1) ==
            aae_controller:aae_get_object_splitfun(Cntrl)),
    ok = aae_controller:aae_set_object_splitfun(
        Cntrl, aae_controller:wrapped_splitobjfun(SplitF_2)
    ),
    true =
        (aae_controller:wrapped_splitobjfun(SplitF_2) ==
            aae_controller:aae_get_object_splitfun(Cntrl)),

    aae_controller:aae_close(Cntrl),
    RootPath = testutil:reset_filestructure().

%% Fixed last-modified timestamp used by the mock split functions below.
-define(APOINTINTIME, {1747, 917445, 410090}).
%% Mock split functions: both report size 42, one sibling, last-mod at the
%% fixed ?APOINTINTIME; they differ only in the trailing head binary.
mock_aae_from_object_binary_for_storeheads(true) ->
    fun(_ObjBin) ->
        {_Size = 42, _SibCount = 1, 0, _LastMods = [?APOINTINTIME], <<>>}
    end;
mock_aae_from_object_binary_for_storeheads(false) ->
    fun(_) ->
        {42, 1, 0, [?APOINTINTIME], term_to_binary(null)}
    end.

%% Return the async folder for a sibcount fold over [StartKey, EndKey) in
%% Bucket, accumulating {KeysSeen, TotalSibCount}.
key_range_folder(Cntrl, Bucket, StartKey, EndKey) ->
    Elements = [{sibcount, null}],
    SCFoldFun =
        fun(_FB, FK, FV, {FAccKL, FAccSc}) ->
            {sibcount, FSc} = lists:keyfind(sibcount, 1, FV),
            if
                (FK >= StartKey) and (FK < EndKey) ->
                    {[FK | FAccKL], FAccSc + FSc};
                %% was the obfuscated `el /= se ->` guard ("else" joke) -
                %% replaced with the idiomatic final branch
                true ->
                    {FAccKL, FAccSc}
            end
        end,
    SCInitAcc = {[], 0},
    {async, Folder} =
        aae_controller:aae_fold(
            Cntrl,
            {key_range, Bucket, StartKey, EndKey},
            all,
            SCFoldFun,
            SCInitAcc,
            Elements
        ),
    Folder.

%% An exchange against a peer that always replies not_supported should end
%% in the not_supported state with 0 deltas.
store_notsupported(_Config) ->
    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    SplitF = fun(_X) -> {rand:uniform(1000), 1, 0, null} end,
    RPid = self(),
    ReturnFun = fun(R) -> RPid ! {result, R} end,
    RepairFun = fun(_KL) -> null end,

    {ok, Cntrl1} =
        aae_controller:aae_start(
            {parallel, leveled_ko},
            true,
            {1, 300},
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF,
            [info, warn, error, critical]
        ),

    BKVList = testutil:gen_keys([], 100),
    ok = testutil:put_keys(Cntrl1, 2, BKVList, none),

    {ok, _P1, GUID1} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 0}]}],
            [{exchange_notsupported_sendfun(), [{3, 0}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID1]),
    {ExchangeState1, 0} = testutil:start_receiver(),
    io:format("ExchangeState ~w~n", [ExchangeState1]),
    true = ExchangeState1 == not_supported,
    aae_controller:aae_close(Cntrl1),
    RootPath = testutil:reset_filestructure().

dual_store_compare_medium_so(_Config) ->
    dual_store_compare_tester(10000, leveled_so).

dual_store_compare_medium_ko(_Config) ->
    dual_store_compare_tester(10000, leveled_ko).

dual_store_compare_large_so(_Config) ->
    dual_store_compare_tester(100000, leveled_so).

dual_store_compare_large_ko(_Config) ->
    dual_store_compare_tester(100000, leveled_ko).

dual_store_compare_tester(InitialKeyCount, StoreType) ->
    % Setup two AAE controllers, each representing the same data. One store
    % will be split into three preflists, the other into two. The
    % preflists will be mapped as follows:
    % {2, 0} <-> {3, 0}
    % {2, 1} <-> {3, 1} & {3, 2}
    %
    % Think of these preflists in terms of needless partitions for test
    % purposes. Although this is a comparison between 2 'nodes', it is
    % more like a comparison between 2 clusters where n=1, there is 1
    % vnode, but data is still partitioned into either 2 or 3 partitions.
    % Don't try and make sense of this in terms of a ring - the
    % mock_vnode_coverage_fold tests have a more Riak ring-like setup.

    RootPath = testutil:reset_filestructure(),
    VnodePath1 = filename:join(RootPath, "vnode1/"),
    VnodePath2 = filename:join(RootPath, "vnode2/"),
    SplitF = fun(_X) -> {rand:uniform(1000), 1, 0, null} end,
    RPid = self(),
    ReturnFun = fun(R) -> RPid ! {result, R} end,
    RepairFun = fun(_KL) -> null end,

    %% Add a key filter fun that never matches
    KFF = fun({B, _K}) -> B =/= <<"SkipThisBucket">> end,

    {ok, Cntrl1} =
        aae_controller:aae_start(
            {parallel, StoreType},
            true,
            {1, 300},
            [{2, 0}, {2, 1}],
            VnodePath1,
            SplitF,
            [warn, error, critical],
            [],
            KFF
        ),
    {ok, Cntrl2} =
        aae_controller:aae_start(
            {parallel, StoreType},
            true,
            {1, 300},
            [{3, 0}, {3, 1}, {3, 2}],
            VnodePath2,
            SplitF,
            [warn, error, critical],
            [],
            KFF
        ),

    initial_load(InitialKeyCount, Cntrl1, Cntrl2),

    SW1 = os:timestamp(),

    % Merged roots across all partitions of each store must agree
    ok = aae_controller:aae_mergeroot(
        Cntrl1,
        [{2, 0}, {2, 1}],
        ReturnFun
    ),
    Root1A = testutil:start_receiver(),
    ok = aae_controller:aae_mergeroot(
        Cntrl2,
        [{3, 0}, {3, 1}, {3, 2}],
        ReturnFun
    ),
    Root2A = testutil:start_receiver(),
    true = Root1A == Root2A,

    % Individual mapped partitions must also agree: {2,0} <-> {3,0}
    ok = aae_controller:aae_fetchroot(
        Cntrl1,
        [{2, 0}],
        ReturnFun
    ),
    [{{2, 0}, Root1B}] = testutil:start_receiver(),
    ok = aae_controller:aae_fetchroot(
        Cntrl2,
        [{3, 0}],
        ReturnFun
    ),
    [{{3, 0}, Root2B}] = testutil:start_receiver(),
    true = Root1B == Root2B,

    % ... and {2,1} <-> merge of {3,1} & {3,2}
    ok = aae_controller:aae_mergeroot(
        Cntrl1,
        [{2, 1}],
        ReturnFun
    ),
    Root1C = testutil:start_receiver(),
    ok = aae_controller:aae_mergeroot(
        Cntrl2,
        [{3, 1}, {3, 2}],
        ReturnFun
    ),
    Root2C = testutil:start_receiver(),
    true = Root1C == Root2C,

    %% Turn down logging in Cntrl1 and Cntrl2
    ok = aae_controller:aae_loglevel(Cntrl1, [warn, error, critical]),
    ok = aae_controller:aae_loglevel(Cntrl2, [warn, error, critical]),

    io:format(
        "Direct partition compare complete in ~w ms~n",
        [timer:now_diff(os:timestamp(), SW1) / 1000]
    ),

    % Change log levels
    ok = aae_controller:aae_loglevel(Cntrl1, [info, warn, error, critical]),
    ok = aae_controller:aae_loglevel(Cntrl2, [info, warn, error, critical]),

    % Now do a comparison based on some key range queries:
    SW2 = os:timestamp(),
    Bucket = integer_to_binary(1),
    StartKey = list_to_binary(string:right(integer_to_list(10), 6, $0)),
    EndKey = list_to_binary(string:right(integer_to_list(50), 6, $0)),
    Elements = [{sibcount, null}],
    SCFoldFun =
        fun(FB, FK, FV, {FAccKL, FAccSc}) ->
            {sibcount, FSc} = lists:keyfind(sibcount, 1, FV),
            % the fold should only be given keys inside the query range
            true = FB == Bucket,
            true = FK >= StartKey,
            true = FK < EndKey,
            {[FK | FAccKL], FAccSc + FSc}
        end,
    SCInitAcc = {[], 0},

    {async, SCFolder1} =
        aae_controller:aae_fold(
            Cntrl1,
            {key_range, Bucket, StartKey, EndKey},
            all,
            SCFoldFun,
            SCInitAcc,
            Elements
        ),
    {async, SCFolder2} =
        aae_controller:aae_fold(
            Cntrl2,
            {key_range, Bucket, StartKey, EndKey},
            all,
            SCFoldFun,
            SCInitAcc,
            Elements
        ),
    SCF1 = SCFolder1(),
    SCF2 = SCFolder2(),

    true = SCF1 == SCF2,
    true = element(2, SCF1) == 8,
    true = length(element(1, SCF1)) == 8,
    io:format(
        "Comparison through key range folder in ~w ms with results ~w~n",
        [timer:now_diff(os:timestamp(), SW2) / 1000, SCF1]
    ),

    % Confirm no differences when using different matching AAE exchanges
    SW3 = os:timestamp(),

    {ok, _P1, GUID1} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 0}]}],
            [{exchange_sendfun(Cntrl2), [{3, 0}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID1]),
    {ExchangeState1, 0} = testutil:start_receiver(),
    true = ExchangeState1 == root_compare,

    {ok, _P2, GUID2} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 1}]}],
            [{exchange_sendfun(Cntrl2), [{3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID2]),
    {ExchangeState2, 0} = testutil:start_receiver(),
    true = ExchangeState2 == root_compare,

    {ok, _P3, GUID3} =
        aae_exchange:start(
            [{exchange_sendfun(Cntrl1), [{2, 0}, {2, 1}]}],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID3]),
    {ExchangeState3, 0} = testutil:start_receiver(),
    true = ExchangeState3 == root_compare,

    % Blue side split over two send funs
    {ok, _P4, GUID4} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID4]),
    {ExchangeState4, 0} = testutil:start_receiver(),
    true = ExchangeState4 == root_compare,

    % Put 10 new keys into Cntrl1 only, then expect clock_compare with 10
    % deltas from here on
    BKVListN = create_discrepancy(Cntrl1, InitialKeyCount),

    {ok, _P6, GUID6} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID6]),
    {ExchangeState6, 10} = testutil:start_receiver(),
    true = ExchangeState6 == clock_compare,

    % Same again, but request a missing partition, and should get same result

    {ok, _P6a, GUID6a} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}, {3, 3}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID6a]),
    {ExchangeState6a, 10} = testutil:start_receiver(),
    true = ExchangeState6a == clock_compare,

    % With scan_timeout of 0 the fetch_clocks stage cannot complete
    {ok, _P6b, GUID6b} =
        aae_exchange:start(
            full,
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}, {3, 3}]}],
            RepairFun,
            ReturnFun,
            none,
            [{scan_timeout, 0}, {max_results, 256}]
        ),
    io:format("Exchange id ~s~n", [GUID6b]),
    {timeout, 0} = testutil:start_receiver(),

    % Nothing repaired last time. The deltas are all new keys though, so
    % we can repair by adding them in to the other vnode

    RepairFun0 = testutil:repair_fun(BKVListN, Cntrl2, 3),
    {ok, _P7, GUID7} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun0,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID7]),
    {ExchangeState7, 10} = testutil:start_receiver(),
    true = ExchangeState7 == clock_compare,

    % After the repair the stores match again
    {ok, _P8, GUID8} =
        aae_exchange:start(
            [
                {exchange_sendfun(Cntrl1), [{2, 0}]},
                {exchange_sendfun(Cntrl1), [{2, 1}]}
            ],
            [{exchange_sendfun(Cntrl2), [{3, 0}, {3, 1}, {3, 2}]}],
            RepairFun,
            ReturnFun
        ),
    io:format("Exchange id ~s~n", [GUID8]),
    {ExchangeState8, 0} = testutil:start_receiver(),
    true = ExchangeState8 == root_compare,

    io:format(
        "Comparison through exchange complete in ~w ms~n",
        [timer:now_diff(os:timestamp(), SW3) / 1000]
    ),

    % Shutdown and tidy up
    ok = aae_controller:aae_close(Cntrl1),
    ok = aae_controller:aae_close(Cntrl2),
    RootPath = testutil:reset_filestructure().
%% Load an identical (modulo insertion order) key population into both
%% controllers: an initial put batch, some removals, then a batch of
%% replacements.
initial_load(InitialKeyCount, Cntrl1, Cntrl2) ->
    SW0 = os:timestamp(),

    BKVListXS = testutil:gen_keys([], InitialKeyCount),
    %% NOTE(review): lists:split(20, _) KEEPS the first 20 keys and discards
    %% the rest, but the comment below says the first 20 are discarded -
    %% confirm which was intended against the upstream repository.
    {BKVList, _Discard} = lists:split(20, BKVListXS),
    % The first 20 keys discarded to create an overlap between the add
    % replace list
    ok = testutil:put_keys(Cntrl1, 2, BKVList, none),
    ok = testutil:put_keys(Cntrl2, 3, lists:reverse(BKVList), none),

    %% Remove the same 10 keys from both sides
    {BKVListRem, _Ignore} = lists:split(10, BKVList),
    ok = testutil:remove_keys(Cntrl1, 2, BKVListRem),
    ok = testutil:remove_keys(Cntrl2, 3, BKVListRem),

    % Change all of the keys - cheat by using undefined rather than replace
    % properly

    BKVListR = testutil:gen_keys([], 100),
    % As 100 > 20 expect 20 of these keys to be new, so no clock will be
    % returned from fetch_clock, and 80 of these will be updates
    ok = testutil:put_keys(Cntrl1, 2, BKVListR, undefined),
    ok = testutil:put_keys(Cntrl2, 3, BKVListR, undefined),

    io:format(
        "Initial put complete in ~w ms~n",
        [timer:now_diff(os:timestamp(), SW0) / 1000]
    ).

%% Put 10 keys (beyond InitialKeyCount) into Cntrl only, logging the tree
%% segment and preflist of each so a failing exchange can be diagnosed.
%% Returns the new keys so a repair fun can push them to the other store.
create_discrepancy(Cntrl, InitialKeyCount) ->
    % Create a discrepancy and discover it through exchange
    BKVListN = testutil:gen_keys([], InitialKeyCount + 10, InitialKeyCount),
    _SL = lists:foldl(
        fun({B, K, _V}, Acc) ->
            BK = aae_util:make_binarykey(B, K),
            Seg = leveled_tictac:keyto_segment48(BK),
            Seg0 = aae_keystore:generate_treesegment(Seg),
            io:format(
                "Generate new key B ~w K ~w " ++
                    "for Segment ~w ~w ~w partition ~w ~w~n",
                [
                    B,
                    K,
                    Seg0,
                    Seg0 bsr 8,
                    Seg0 band 255,
                    testutil:calc_preflist(K, 2),
                    testutil:calc_preflist(K, 3)
                ]
            ),
            [Seg0 | Acc]
        end,
        [],
        BKVListN
    ),
    ok = testutil:put_keys(Cntrl, 2, BKVListN),
    BKVListN.
%% Delegate to the shared test utility send fun for a real controller.
exchange_sendfun(Cntrl) -> testutil:exchange_sendfun(Cntrl).

%% Send fun for a peer that does not support AAE exchanges: every message is
%% answered immediately with not_supported (self() here is the exchange
%% process invoking the fun, which is where the reply must go).
exchange_notsupported_sendfun() ->
    SendFun =
        fun(_Msg, _Preflists, Colour) ->
            RPid = self(),
            aae_exchange:reply(RPid, not_supported, Colour)
        end,
    SendFun.
-------------------------------------------------------------------------------- /test/end_to_end/mock_kv_vnode.erl: --------------------------------------------------------------------------------
%% -------- Overview ---------
%%
%% A simplified mock of riak_kv_vnode for testing

-module(mock_kv_vnode).

-behaviour(gen_server).

-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2,
    code_change/3
]).

-export([
    open/5,
    put/4,
    read_repair/4,
    push/6,
    backend_delete/4,
    exchange_message/4,
    rebuild/2,
    rehash/4,
    rebuild_complete/2,
    fold_aae/6,
    bucketlist_aae/1,
    reset_keyfilter/1,
    close/1
]).

-export([
    extractclock_from_riakhead/1,
    from_aae_binary/1,
    new_v1/2,
    workerfun/1,
    rebuild_worker/1,
    fold_worker/0
]).

%% Minimal copies of riak_object's internal records, sufficient for tests.
-record(r_content, {
    metadata,
    value :: term()
}).

-record(r_object, {
    bucket,
    key,
    contents :: [#r_content{}],
    vclock = [],
    updatemetadata = dict:store(clean, true, dict:new()),
    updatevalue :: term()
}).

%% Startup options for open/5.
-record(options, {
    aae :: parallel_so | parallel_ko | native,
    index_ns :: list(tuple()),
    root_path :: list(),
    preflist_fun = null :: preflist_fun(),
    key_filter = none :: aae_controller:key_include_fun()
}).
%% Runtime state of the mock vnode process.
-record(state, {
    root_path :: list(),
    index_ns :: list(tuple()),
    aae_controller :: pid(),
    vnode_store :: pid(),
    vnode_id :: binary(),
    aae_type :: tuple(),
    vnode_sqn = 1 :: integer(),
    preflist_fun = null :: preflist_fun(),
    aae_rebuild = false :: boolean()
}).

-include_lib("eunit/include/eunit.hrl").

-define(RIAK_TAG, o_rkv).
%% Default rebuild schedule passed to aae_controller:aae_start/9
-define(REBUILD_SCHEDULE, {1, 60}).
-define(LASTMOD_LEN, 29).
-define(V1_VERS, 1).
-define(MAGIC, 53).
-define(EMPTY_VTAG_BIN, <<"e">>).
-define(MAGIC_KEYS, [<<48, 48, 48, 52, 57, 51>>]).
%% Interval (ms) for the periodic self-poke message scheduled in init/1
-define(POKE_TIME, 1000).

-type r_object() :: #r_object{}.
-type preflist_fun() :: null | fun((term(), term()) -> non_neg_integer()).
-type fold_objects_fun() :: fun((term(), term(), term(), term()) -> term()).
-type folder() :: fun(() -> term()).

%%%============================================================================
%%% API
%%%============================================================================

-spec open(
    list(),
    atom(),
    list(tuple()),
    preflist_fun() | null,
    aae_controller:key_include_fun()
) -> {ok, pid()}.
%% @doc
%% Open a mock vnode
open(Path, AAEType, IndexNs, PreflistFun, KFF) ->
    gen_server:start(
        ?MODULE,
        [
            #options{
                aae = AAEType,
                index_ns = IndexNs,
                root_path = Path,
                preflist_fun = PreflistFun,
                key_filter = KFF
            }
        ],
        []
    ).

-spec put(pid(), r_object(), tuple(), list(pid())) -> ok.
%% @doc
%% Put a new object in the store, updating AAE - and co-ordinating
put(Vnode, Object, IndexN, OtherVnodes) ->
    gen_server:call(Vnode, {put, Object, IndexN, OtherVnodes}).

-spec read_repair(pid(), r_object(), tuple(), list(pid())) -> ok.
%% @doc
%% Fetch the version vector from this store, and push the completed object
%% to another
read_repair(Vnode, Object, IndexN, OtherVnodes) ->
    gen_server:call(Vnode, {read_repair, Object, IndexN, OtherVnodes}).

-spec push(pid(), binary(), binary(), list(tuple()), binary(), tuple()) -> ok.
%% @doc
%% Push a new object in the store, updating AAE
%% (cast - fire-and-forget, unlike put/4 which is synchronous)
push(Vnode, Bucket, Key, UpdClock, ObjectBin, IndexN) ->
    gen_server:cast(Vnode, {push, Bucket, Key, UpdClock, ObjectBin, IndexN}).

-spec backend_delete(pid(), binary(), binary(), tuple()) -> ok.
%% @doc
%% Delete an object from the backend
backend_delete(Vnode, Bucket, Key, IndexN) ->
    gen_server:call(Vnode, {delete, Bucket, Key, IndexN}).

-spec rebuild(pid(), boolean()) -> {erlang:timestamp(), boolean()}.
%% @doc
%% Prompt for the next rebuild time, using ForceRebuild=true to override that
%% time and trigger a rebuild.  As well as the next rebuild time the response
%% includes if a rebuild is currently in progress
rebuild(Vnode, ForceRebuild) ->
    gen_server:call(Vnode, {rebuild, ForceRebuild}).

-spec rebuild_complete(pid(), store | tree) -> ok.
%% @doc
%% Prompt for the rebuild of the tree
rebuild_complete(Vnode, Stage) ->
    gen_server:cast(Vnode, {rebuild_complete, Stage}).

-spec rehash(pid(), binary(), binary(), tuple()) -> ok.
%% @doc
%% Prompt a given key to be rehashed
rehash(Vnode, Bucket, Key, IndexN) ->
    gen_server:call(Vnode, {rehash, Bucket, Key, IndexN}).

-spec fold_aae(
    pid(),
    aae_keystore:range_limiter(),
    aae_keystore:segment_limiter(),
    fold_objects_fun(),
    any(),
    list(aae_keystore:value_element())
) -> {async, folder()}.
%% @doc
%% Fold over the heads in the aae store (which may be the key store when
%% running in native mode)
fold_aae(Vnode, Range, Segments, FoldObjectsFun, InitAcc, Elements) ->
    gen_server:call(
        Vnode,
        {fold_aae, Range, Segments, FoldObjectsFun, InitAcc, Elements}
    ).

%% Callback used to hand exchange results back to the caller.
-type return_fun() :: fun((any()) -> ok).

-spec exchange_message(pid(), tuple() | atom(), list(tuple()), return_fun()) ->
    ok.
%% @doc
%% Handle a message from an AAE exchange
exchange_message(Vnode, Msg, IndexNs, ReturnFun) ->
    gen_server:call(Vnode, {aae, Msg, IndexNs, ReturnFun}).

-spec reset_keyfilter(pid()) -> ok.
%% @doc
%% Reset the key filter on the underlying aae_controller (async cast)
reset_keyfilter(Pid) ->
    gen_server:cast(Pid, reset_keyfilter).

-spec bucketlist_aae(pid()) -> {async, fun(() -> list())}.
%% @doc
%% List buckets via AAE store
bucketlist_aae(Vnode) ->
    gen_server:call(Vnode, bucketlist_aae).

-spec close(pid()) -> ok.
%% @doc
%% Close the vnode, and any aae controller
close(Vnode) ->
    gen_server:call(Vnode, close).
%%%============================================================================
%%% gen_server callbacks
%%%============================================================================

%% @doc
%% Start the backend (leveled) store for the vnode, determine whether the
%% store is empty, then start an aae_controller in the configured mode
%% (native, parallel_so or parallel_ko) before scheduling the first poke.
init([Opts]) ->
    % Start the vnode backend
    % Get the shutdown GUID
    % Delete the shutdown GUID
    % Check is_empty
    % Start the aae_controller
    % Report back OK
    RP = Opts#options.root_path,
    {ok, VnSt} =
        leveled_bookie:book_start(RP, 4000, 100000000, none),
    IsEmpty = leveled_bookie:book_isempty(VnSt, ?RIAK_TAG),
    BackendOpts = aae_keystore:store_generate_backendoptions(),
    % Select the keystore type, and any backend options specific to that
    % type of store
    {KeyStoreType, UpdBackendOpts} =
        case Opts#options.aae of
            native ->
                {
                    {native, leveled_nko, VnSt},
                    BackendOpts
                };
            parallel_so ->
                {
                    {parallel, leveled_so},
                    aae_keystore:store_setbackendoption(
                        max_pencillercachesize,
                        12000,
                        BackendOpts
                    )
                };
            parallel_ko ->
                AltOpts =
                    [
                        {max_journalobjectcount, 1000},
                        {database_id, 65534},
                        {snapshot_timeout_short, 360},
                        {snapshot_timeout_long, 3600},
                        {compression_method, zstd},
                        {
                            forced_logs,
                            [b0015, b0016, b0017, b0018, p0032, sst12]
                        },
                        {log_level, warn},
                        {stats_logfrequency, 120}
                    ],

                {
                    {parallel, leveled_ko},
                    lists:foldl(
                        fun({K, S}, AccOpts) ->
                            aae_keystore:store_setbackendoption(
                                K, S, AccOpts
                            )
                        end,
                        BackendOpts,
                        AltOpts
                    )
                }
        end,
    {ok, AAECntrl} =
        aae_controller:aae_start(
            KeyStoreType,
            IsEmpty,
            ?REBUILD_SCHEDULE,
            Opts#options.index_ns,
            RP,
            fun from_aae_binary/1,
            undefined,
            UpdBackendOpts,
            Opts#options.key_filter
        ),
    erlang:send_after(?POKE_TIME, self(), poke),
    {ok, #state{
        root_path = RP,
        aae_type = KeyStoreType,
        vnode_store =
            VnSt,
        index_ns = Opts#options.index_ns,
        aae_controller = AAECntrl,
        vnode_id = list_to_binary(leveled_util:generate_uuid()),
        preflist_fun = Opts#options.preflist_fun
    }}.

%% read_repair: if the object is present locally, push the local version of
%% the object (with its local clock) to the other vnodes.
handle_call({read_repair, Object, IndexN, OtherVnodes}, _From, State) ->
    Bucket = Object#r_object.bucket,
    Key = Object#r_object.key,
    case
        leveled_bookie:book_head(
            State#state.vnode_store,
            Bucket,
            Key,
            ?RIAK_TAG
        )
    of
        not_found ->
            {reply, ok, State};
        {ok, Head} ->
            Clock = extractclock_from_riakhead(Head),
            ObjectBin = new_v1(Clock, Object#r_object.contents),
            PushFun =
                fun(VN) ->
                    push(VN, Bucket, Key, Clock, ObjectBin, IndexN)
                end,
            lists:foreach(PushFun, OtherVnodes),
            {reply, ok, State}
    end;
handle_call({put, Object, IndexN, OtherVnodes}, _From, State) ->
    % Get Bucket and Key from object
    % Do head request
    % Compare clock, update clock
    % Send update to other stores
    % Update AAE
    % Report back OK
    Bucket = Object#r_object.bucket,
    Key = Object#r_object.key,

    % Build the updated clock: add this vnode's {id, sqn} entry to any
    % existing clock (ukeysort dedupes by vnode id), or start a new clock
    {UpdClock, PrevClock} =
        case
            leveled_bookie:book_head(
                State#state.vnode_store,
                Bucket,
                Key,
                ?RIAK_TAG
            )
        of
            not_found ->
                {[{State#state.vnode_id, State#state.vnode_sqn}], none};
            {ok, Head} ->
                Clock0 =
                    extractclock_from_riakhead(Head),
                Clock1 =
                    [{State#state.vnode_id, State#state.vnode_sqn} | Clock0],
                {lists:ukeysort(1, Clock1), Clock0}
        end,
    ObjectBin = new_v1(UpdClock, Object#r_object.contents),
    VVEBin = to_aae_binary(ObjectBin),
    leveled_bookie:book_put(
        State#state.vnode_store,
        Bucket,
        Key,
        ObjectBin,
        [],
        ?RIAK_TAG
    ),

    ok = aae_controller:aae_put(
        State#state.aae_controller,
        IndexN,
        Bucket,
        Key,
        UpdClock,

        PrevClock,
        VVEBin
    ),

    % Replicate to the other vnodes (async push - no vclock increment there)
    lists:foreach(
        fun(VN) ->
            push(VN, Bucket, Key, UpdClock, ObjectBin, IndexN)
        end,
        OtherVnodes
    ),

    {reply, ok, State#state{vnode_sqn = State#state.vnode_sqn + 1}};
%% delete: remove from the backend and record a `none` clock in the aae
%% store so the deletion is reflected in the tree
handle_call({delete, Bucket, Key, IndexN}, _From, State) ->
    PrevClock =
        case
            leveled_bookie:book_head(
                State#state.vnode_store,
                Bucket,
                Key,
                ?RIAK_TAG
            )
        of
            not_found ->
                none;
            {ok, Head} ->
                extractclock_from_riakhead(Head)
        end,
    leveled_bookie:book_put(
        State#state.vnode_store,
        Bucket,
        Key,
        delete,
        [],
        ?RIAK_TAG
    ),
    ok = aae_controller:aae_put(
        State#state.aae_controller,
        IndexN,
        Bucket,
        Key,
        none,
        PrevClock,
        <<>>
    ),
    {reply, ok, State};
handle_call({rebuild, true}, _From, State) ->
    % To rebuild the store an Object SplitFun will be required if is is a
    % parallel store, which will depend on the preflist_fun.
    NRT = aae_controller:aae_nextrebuild(State#state.aae_controller),

    SplitFun =
        fun(B, K, V) ->
            PreflistFun = State#state.preflist_fun,
            IndexN = PreflistFun(B, K),
            Clock = extractclock_from_riakhead(V),
            {IndexN, Clock}
        end,
    Vnode = self(),
    ReturnFun =
        fun(ok) ->
            ok = rebuild_complete(Vnode, store)
        end,

    case
        aae_controller:aae_rebuildstore(
            State#state.aae_controller,
            SplitFun
        )
    of
        ok ->
            % This store is rebuilt already (i.e. it is native), so nothing to
            % do here other than prompt the status change
            ReturnFun(ok);
        {ok, FoldFun, FinishFun} ->
            Worker = workerfun({rebuild_worker, [ReturnFun]}),
            % Now need to get a fold query to run over the vnode store to
            % rebuild the parallel store.
            % The aae_controller has provided
            % the object fold fun which should load the parallel store, and
            % the finish fun which should tell the controller the fold is
            % complete and prompt the finishing of the rebuild activity
            {async, Runner} =
                leveled_bookie:book_headfold(
                    State#state.vnode_store,
                    ?RIAK_TAG,
                    {FoldFun, []},
                    true,
                    true,
                    false
                ),
            % dispatch the work to the worker
            Worker(Runner, FinishFun)
    end,
    {reply, {NRT, true}, State#state{aae_rebuild = true}};
handle_call({rebuild, false}, _From, State) ->
    % Check next rebuild
    % Reply with next rebuild TS - and the status to indicate an ongoing
    % rebuild
    NRT = aae_controller:aae_nextrebuild(State#state.aae_controller),
    {reply, {NRT, State#state.aae_rebuild}, State};
%% rehash: re-read the object and re-submit it to the aae store with an
%% undefined previous clock (forcing the segment to be recalculated)
handle_call({rehash, Bucket, Key, IndexN}, _From, State) ->
    case
        leveled_bookie:book_head(
            State#state.vnode_store,
            Bucket,
            Key,
            ?RIAK_TAG
        )
    of
        not_found ->
            ok = aae_controller:aae_put(
                State#state.aae_controller,
                IndexN,
                Bucket,
                Key,
                none,
                undefined,
                <<>>
            );
        {ok, Head} ->
            C0 = extractclock_from_riakhead(Head),
            ok = aae_controller:aae_put(
                State#state.aae_controller,
                IndexN,
                Bucket,
                Key,
                C0,
                undefined,
                to_aae_binary(Head)
            )
    end,
    {reply, ok, State};
%% aae: dispatch exchange messages to the relevant aae_controller API;
%% results are returned asynchronously via ReturnFun (folds run in a
%% spawned worker so the vnode is not blocked)
handle_call({aae, Msg, IndexNs, ReturnFun}, _From, State) ->
    case Msg of
        fetch_root ->
            aae_controller:aae_mergeroot(
                State#state.aae_controller,
                IndexNs,
                ReturnFun
            );
        {fetch_branches, BranchIDs} ->
            aae_controller:aae_mergebranches(
                State#state.aae_controller,
                IndexNs,
                BranchIDs,
                ReturnFun
            );
        {fetch_clocks, SegmentIDs} ->
            aae_controller:aae_fetchclocks(
                State#state.aae_controller,

                IndexNs,
                SegmentIDs,
                ReturnFun,
                State#state.preflist_fun
            );
        {fetch_clocks, SegmentIDs, MR} ->
            aae_controller:aae_fetchclocks(
                State#state.aae_controller,
                IndexNs,
                all,
                SegmentIDs,
                MR,
                ReturnFun,
                State#state.preflist_fun
            );
        {merge_tree_range, B, KR, TS, SF, MR, HM} ->
            NullExtractFun =
                fun({B0, K0}, V0) ->
                    {aae_util:make_binarykey(B0, K0), V0}
                end,
            % HM selects how the hash is derived: pre_hash uses the hash
            % stored in the aae store; {rehash, IV} recomputes from the
            % sorted clock with an initialisation vector
            {FoldFun, Elements} =
                case HM of
                    pre_hash ->
                        {
                            fun(BF, KF, EFs, TreeAcc) ->
                                {hash, CH} = lists:keyfind(hash, 1, EFs),
                                leveled_tictac:add_kv(
                                    TreeAcc,
                                    {BF, KF},
                                    {is_hash, CH},
                                    NullExtractFun
                                )
                            end,
                            [{hash, null}]
                        };
                    {rehash, IV} ->
                        {
                            fun(BF, KF, EFs, TreeAcc) ->
                                {clock, VC} = lists:keyfind(clock, 1, EFs),
                                CH = erlang:phash2({IV, lists:sort(VC)}),
                                leveled_tictac:add_kv(
                                    TreeAcc,
                                    {BF, KF},
                                    {is_hash, CH},
                                    NullExtractFun
                                )
                            end,
                            [{clock, null}]
                        }
                end,
            InitAcc = leveled_tictac:new_tree(State#state.vnode_id, TS),
            RangeLimiter = aaefold_setrangelimiter(B, KR),
            ModifiedLimiter = aaefold_setmodifiedlimiter(MR),
            {async, Folder} =
                aae_controller:aae_fold(
                    State#state.aae_controller,
                    RangeLimiter,
                    SF,
                    ModifiedLimiter,
                    false,
                    FoldFun,
                    InitAcc,
                    Elements
                ),
            Worker = workerfun({fold_worker, []}),
            Worker(Folder, ReturnFun);
        {fetch_clocks_range, B, KR, SF, MR} ->
            FoldFun =
                fun(BF, KF, EFs, KeyClockAcc) ->
                    magickey_check(KF, State#state.aae_type),
                    {clock, VV} = lists:keyfind(clock, 1, EFs),
                    [{BF, KF, VV} | KeyClockAcc]
                end,
            RangeLimiter = aaefold_setrangelimiter(B, KR),
            ModifiedLimiter = aaefold_setmodifiedlimiter(MR),
            {async, Folder} =
                aae_controller:aae_fold(
                    State#state.aae_controller,

                    RangeLimiter,
                    SF,
                    ModifiedLimiter,
                    false,
                    FoldFun,
                    [],
                    [{clock, null}]
                ),
            Worker = workerfun({fold_worker, []}),
            Worker(Folder, ReturnFun)
    end,
    {reply, ok, State};
handle_call(
    {fold_aae, Range, Segments, FoldFun, InitAcc, Elements},
    _From,
    State
) ->
    R = aae_controller:aae_fold(
        State#state.aae_controller,
        Range,
        Segments,
        FoldFun,
        InitAcc,
        Elements
    ),
    {reply, R, State};
handle_call(bucketlist_aae, _From, State) ->
    R = aae_controller:aae_bucketlist(State#state.aae_controller),
    {reply, R, State};
handle_call(close, _From, State) ->
    ok = aae_controller:aae_close(State#state.aae_controller),
    ok = leveled_bookie:book_close(State#state.vnode_store),
    {stop, normal, ok, State}.

handle_cast({push, Bucket, Key, UpdClock, ObjectBin, IndexN}, State) ->
    % As PUT, but don't increment vclock, replace regardless of current state
    PrevClock =
        case
            leveled_bookie:book_head(
                State#state.vnode_store,
                Bucket,
                Key,
                ?RIAK_TAG
            )
        of
            not_found ->
                none;
            {ok, Head} ->
                extractclock_from_riakhead(Head)
        end,
    leveled_bookie:book_put(
        State#state.vnode_store,
        Bucket,
        Key,
        ObjectBin,
        [],
        ?RIAK_TAG
    ),

    ok = aae_controller:aae_put(
        State#state.aae_controller,
        IndexN,
        Bucket,
        Key,
        UpdClock,
        PrevClock,
        to_aae_binary(ObjectBin)
    ),

    {noreply, State};
handle_cast({rebuild_complete, store}, State) ->
    % Trigger a rebuild of the tree.
    % Will require a non-null preflist_fun
    % if the store is native (as the native store will not store the IndexN,
    % and so a recalculation will be required)
    Vnode = self(),
    ReturnFun =
        fun(ok) ->
            ok = rebuild_complete(Vnode, tree)
        end,
    Worker = workerfun({rebuild_worker, [ReturnFun]}),
    case
        aae_controller:aae_rebuildtrees(
            State#state.aae_controller,
            State#state.index_ns,
            State#state.preflist_fun,
            Worker,
            false
        )
    of
        ok ->
            {noreply, State#state{aae_rebuild = true}};
        loading ->
            % NOTE(review): timer:sleep/1 inside a gen_server callback blocks
            % this vnode for 1s as a back-off before retrying - acceptable in
            % a test mock, but confirm this is intended behaviour
            gen_server:cast(self(), {rebuild_complete, store}),
            timer:sleep(1000),
            {noreply, State}
    end;
handle_cast({rebuild_complete, tree}, State) ->
    {noreply, State#state{aae_rebuild = false}};
handle_cast(reset_keyfilter, State) ->
    ok = aae_controller:aae_reset_key_filter(State#state.aae_controller),
    {noreply, State}.

%% poke/aae_pong: periodic liveness ping to the controller; the pong logs
%% the controller queue time and schedules the next poke
handle_info(poke, State) ->
    ok = aae_controller:aae_ping(
        State#state.aae_controller,
        os:timestamp(),
        self()
    ),
    {noreply, State};
handle_info({aae_pong, QueueTime}, State) ->
    io:format("Queuetime in microseconds ~w~n", [QueueTime]),
    erlang:send_after(?POKE_TIME, self(), poke),
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%============================================================================
%%% External functions
%%%============================================================================

-spec extractclock_from_riakhead(binary()) -> list(tuple()).
%% @doc
%% Extract the vector clock from a riak binary object (without doing a full
%% binary to object conversion)
%% NOTE(review): the binary pattern below appears mangled in this copy of the
%% file (the `<<...>>` contents have been lost) - restore from upstream before
%% compiling; it should bind VclockBin from the v1 object header
extractclock_from_riakhead(
    <>
) ->
    lists:usort(binary_to_term(VclockBin));
extractclock_from_riakhead(RiakHead) ->
    % A proxy_object (head response) wraps the real head binary - unwrap
    % and recurse
    {proxy_object, HeadBin, _Size, _F} = binary_to_term(RiakHead),
    extractclock_from_riakhead(HeadBin).

%% V1 Riak Object Binary Encoding
%% -type binobj_header() :: <<53:8, Version:8, VClockLen:32, VClockBin/binary,
%%                            SibCount:32>>.
%% -type binobj_flags() :: <>.
%% -type binobj_umeta_pair() :: <>.
%% -type binobj_meta() :: <>.
%% -type binobj_value() :: <>.
%% -type binobj() :: <>.
%% NOTE(review): binary construction below mangled in this copy (`<>`) -
%% should assemble the v1 header followed by the sibling binaries
new_v1(Vclock, Siblings) ->
    VclockBin = term_to_binary(Vclock),
    VclockLen = byte_size(VclockBin),
    SibCount = length(Siblings),
    SibsBin = bin_contents(Siblings),
    <>.

%% Encode a single sibling as <<ValLen, ValBin, MetaLen, MetaBin>>
%% (construction mangled in this copy)
bin_content(#r_content{metadata = Meta0, value = Val}) ->
    TypeTag = 1,
    ValBin = encode_maybe_binary(Val, TypeTag),
    ValLen = byte_size(ValBin),
    MetaBin = meta_bin(Meta0),
    MetaLen = byte_size(MetaBin),
    <>.

%% Prefix a binary value with its type tag (construction mangled in this copy)
encode_maybe_binary(Value, TypeTag) when is_binary(Value) ->
    <>.

%% Concatenate the encoded siblings (construction mangled in this copy)
bin_contents(Contents) ->
    F = fun(Content, Acc) ->
        <>
    end,
    lists:foldl(F, <<>>, Contents).

%% Encode sibling metadata: last-modified timestamp, vtag, deleted flag and
%% the remaining metadata as an external term (construction mangled in this
%% copy)
meta_bin(MetaData) ->
    {last_modified_date, {Mega, Secs, Micro}} =
        lists:keyfind(last_modified_date, 1, MetaData),
    LastModBin = <>,
    Deleted = <<0>>,
    RestBin = term_to_binary(MetaData),
    VTagBin = ?EMPTY_VTAG_BIN,
    VTagLen = byte_size(VTagBin),
    <>.

%% @doc
%% Spawn the named worker (rebuild_worker/1 or fold_worker/0) and return a
%% fun which dispatches a fold plus completion fun to it
workerfun({WorkerFun, Args}) ->
    WorkerPid = spawn(?MODULE, WorkerFun, Args),
    fun(FoldFun, FinishFun) ->
        WorkerPid ! {fold, FoldFun, FinishFun}
    end.
%% @doc
%% Single-shot worker: run the fold, hand the result to FinishFun, then
%% confirm completion via ReturnFun
rebuild_worker(ReturnFun) ->
    receive
        {fold, FoldFun, FinishFun} ->
            FinishFun(FoldFun()),
            ReturnFun(ok)
    end.

%% @doc
%% Single-shot worker: run the fold, log its duration, and return the result
fold_worker() ->
    receive
        {fold, FoldFun, ReturnFun} ->
            SW0 = os:timestamp(),
            R = FoldFun(),
            io:format(
                "FoldFun took ~w ms~n",
                [timer:now_diff(os:timestamp(), SW0) div 1000]
            ),
            ReturnFun(R)
    end.

%% @doc
%% Decode the compact AAE value binary produced by to_aae_binary/1
%% NOTE(review): the binary pattern is mangled in this copy (`<>`); it should
%% bind ObjectSize, SibCount, IndexHash, the LMD triple and MDOnly as listed
%% in the return value below
from_aae_binary(AAEBin) ->
    <> = AAEBin,
    {ObjectSize, SibCount, IndexHash, [{LMDmeg, LMDsec, LMDmcr}], MDOnly}.

%%%============================================================================
%%% Internal functions
%%%============================================================================

%% @doc
%% Convert the format of the range limiter to one compatible with the aae store
aaefold_setrangelimiter(all, all) ->
    all;
aaefold_setrangelimiter(Bucket, all) ->
    {buckets, [Bucket]};
aaefold_setrangelimiter(Bucket, {StartKey, EndKey}) ->
    {key_range, Bucket, StartKey, EndKey}.

%% @doc
%% Convert the format of the date limiter to one compatible with the aae store
aaefold_setmodifiedlimiter({LowModDate, HighModDate}) when
    is_integer(LowModDate), is_integer(HighModDate)
->
    {LowModDate, HighModDate};
aaefold_setmodifiedlimiter(_) ->
    all.

%% @doc
%% Reduce a full object binary to the compact value kept in the AAE store:
%% size, sibling count, (faked) index hash, max last-modified date and the
%% stripped metadata
%% NOTE(review): both binary patterns are mangled in this copy (`<>`) -
%% the match should destructure the v1 header, and the construction should
%% mirror the fields decoded by from_aae_binary/1
to_aae_binary(ObjectBin) ->
    ObjectSize = byte_size(ObjectBin),
    <> = ObjectBin,

    % faking here
    IndexHash = erlang:phash2([]),

    {{LMDmeg, LMDsec, LMDmcr}, MD} =
        strip_metabinary(SibCount, SibsBin, {0, 0, 0}, <<>>),

    <>.
%% @doc
%% Walk the sibling binaries accumulating the metadata sections and tracking
%% the greatest last-modified date seen across siblings
%% NOTE(review): binary patterns mangled in this copy (`<>`) - the first match
%% should skip the value and bind MetaBin/Rest; the second should bind the
%% LMD fields from the metadata header
strip_metabinary(0, <<>>, LMD, MetaBinAcc) ->
    {LMD, MetaBinAcc};
strip_metabinary(SibCount, SibBin, LMD, MetaBinAcc) ->
    <> = SibBin,
    <> = MetaBin,
    LMD0 = max({LMDmega, LMDsec, LMDmicro}, LMD),
    strip_metabinary(
        SibCount - 1,
        Rest,
        LMD0,
        <>
    ).

%% @doc
%% Log when one of the ?MAGIC_KEYS passes through a clock fold - a test aid
%% for tracing specific keys
magickey_check(Key, VnodeType) ->
    case lists:member(Key, ?MAGIC_KEYS) of
        true ->
            io:format("Magic key ~w at VnodeType ~w~n", [Key, VnodeType]);
        false ->
            ok
    end.

%%%============================================================================
%%% Test
%%%============================================================================

-ifdef(TEST).

-endif.
-------------------------------------------------------------------------------- /src/aae_treecache.erl: --------------------------------------------------------------------------------
%% -------- Overview ---------
%%
%% Caches a TicTac merkle tree for a partition, persisting it to disk on a
%% clean close and restoring it on open.

-module(aae_treecache).

-behaviour(gen_server).

-include("aae.hrl").

-export([
    init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2,
    code_change/3,
    format_status/1
]).

-export([
    cache_open/3,
    cache_new/3,
    cache_alter/4,
    cache_root/1,
    cache_leaves/2,
    cache_markdirtysegments/3,
    cache_replacedirtysegments/3,
    cache_destroy/1,
    cache_startload/1,
    cache_completeload/2,
    cache_loglevel/2,
    cache_close/1,
    cache_segment_count/1
]).

% File extension while a save is in flight; renamed to ?FINAL_EXT on success
-define(PENDING_EXT, ".pnd").
-define(FINAL_EXT, ".aae").
-define(START_SQN, 1).
-define(SYNC_TIMEOUT, 30000).
-record(state, {
    % SQN used to name the next on-disk save
    save_sqn = 0 :: integer(),
    % true when the tree was recovered from a file on disk
    is_restored = false :: boolean(),
    tree :: leveled_tictac:tictactree() | undefined,
    root_path :: list() | undefined,
    partition_id :: {integer(), integer()} | integer() | undefined,
    % true while a keystore fold is rebuilding the tree
    loading = false :: boolean(),
    dirty_segments = [] :: list(),
    % GUID of the fold currently allowed to replace dirty segments
    active_fold :: string() | undefined,
    change_queue = [] :: list() | redacted,
    queued_changes = 0 :: non_neg_integer(),
    % only save on close once the tree is known to be trustworthy
    safe_save = false :: boolean()
}).

-type partition_id() :: integer() | {integer(), integer()}.

%%%============================================================================
%%% API
%%%============================================================================

-spec cache_open(
    list(), partition_id(), aae_util:log_levels()
) -> {boolean(), pid()}.
%% @doc
%% Open a tree cache, restoring from any previously saved file for this
%% partition. The boolean in the return is true when a saved tree was
%% restored (i.e. the cache is not starting empty); the pid is the cache
%% process.
cache_open(RootPath, PartitionID, LogLevels) ->
    StartOpts =
        [
            {root_path, RootPath},
            {partition_id, PartitionID},
            {log_levels, LogLevels}
        ],
    {ok, CachePid} = gen_server:start_link(?MODULE, [StartOpts], []),
    RestoredFromDisk = gen_server:call(CachePid, is_restored, infinity),
    {RestoredFromDisk, CachePid}.

-spec cache_new(
    list(), partition_id(), aae_util:log_levels()
) -> {ok, pid()}.
%% @doc
%% Open a tree cache starting from an empty tree, ignoring (and clearing)
%% anything saved on disk.
cache_new(RootPath, PartitionID, LogLevels) ->
    StartOpts =
        [
            {root_path, RootPath},
            {partition_id, PartitionID},
            {ignore_disk, true},
            {log_levels, LogLevels}
        ],
    gen_server:start_link(?MODULE, [StartOpts], []).

-spec cache_destroy(pid()) -> ok.
%% @doc
%% Stop the cache without saving the tree to disk.
cache_destroy(Cache) ->
    gen_server:cast(Cache, destroy).

-spec cache_segment_count(pid()) -> non_neg_integer().
%% @doc
%% Report how many segments are currently marked dirty (used for the
%% aae-progress-report).
cache_segment_count(Cache) ->
    gen_server:call(Cache, segment_count, ?SYNC_TIMEOUT).

-spec cache_close(pid()) -> ok.
%% @doc
%% Stop the cache, saving the tree to disk first (when safe to do so).
cache_close(Cache) ->
    gen_server:call(Cache, close, ?SYNC_TIMEOUT).

-spec cache_alter(pid(), binary(), integer() | none, integer() | none) -> ok.
%% @doc
%% Apply a change to the tree: add the current hash for a key and remove the
%% old one (either may be `none` for a pure add or a pure remove).
cache_alter(Cache, Key, CurrentHash, OldHash) ->
    gen_server:cast(Cache, {alter, Key, CurrentHash, OldHash}).

-spec cache_root(pid()) -> binary().
%% @doc
%% Return the root of the cached tree, for comparison in an exchange.
cache_root(CachePid) ->
    gen_server:call(CachePid, fetch_root, infinity).

-spec cache_leaves(pid(), list(integer())) -> list().
%% @doc
%% Return the leaves for the given branch IDs.
cache_leaves(CachePid, BranchIDs) ->
    gen_server:call(CachePid, {fetch_leaves, BranchIDs}, infinity).

-spec cache_markdirtysegments(pid(), list(integer()), string()) -> ok.
%% @doc
%% Mark segments as dirty because they are subject to a fetch_clocks fold.
%% If a segment is untouched until that fold completes, it may safely be
%% replaced with the value computed by the fold.
%%
%% The FoldGUID identifies the requesting fold and becomes the active_fold,
%% superseding any previous marking - only the most recent active fold may
%% replace dirty segments. This avoids races between overlapping markings
%% (and between updates that clear dirty segments).
cache_markdirtysegments(CachePid, SegmentIDs, FoldGUID) ->
    gen_server:cast(CachePid, {mark_dirtysegments, SegmentIDs, FoldGUID}).

-spec cache_replacedirtysegments(
    pid(),
    list({integer(), integer()}),
    string()
) -> ok.
%% @doc
%% On completion of a fold_clocks, replace those dirty segments which have
%% not been touched by other activity in the meantime.
cache_replacedirtysegments(CachePid, ReplacementSegments, FoldGUID) ->
    Msg = {replace_dirtysegments, ReplacementSegments, FoldGUID},
    gen_server:cast(CachePid, Msg).

-spec cache_startload(pid()) -> ok.
%% @doc
%% Put the cache into loading state: alongside maintaining the current tree,
%% queue every subsequent change.
%%
%% Eventually cache_completeload/2 should be called with a tree built from a
%% keystore fold snapshotted at this point, and the queued changes will be
%% replayed onto it.
cache_startload(CachePid) ->
    gen_server:cast(CachePid, start_load).

-spec cache_completeload(pid(), leveled_tictac:tictactree()) -> ok.
%% @doc
%% Install a tree produced by a fold of the KeyStore as the new cached tree
%% (queued changes are replayed onto it first).
cache_completeload(CachePid, LoadedTree) ->
    gen_server:cast(CachePid, {complete_load, LoadedTree}).

-spec cache_loglevel(pid(), aae_util:log_levels()) -> ok.
%% @doc
%% Change the log level at runtime.
cache_loglevel(CachePid, LogLevels) ->
    gen_server:cast(CachePid, {log_levels, LogLevels}).
%%%============================================================================
%%% gen_server callbacks
%%%============================================================================

%% @doc
%% Validate the start options, attempt to restore a saved tree from disk
%% (unless ignore_disk is set), and start hibernated with the resulting
%% state. Option validation is by matching - an unexpected value crashes
%% init rather than starting with a bad configuration.
init([Opts]) ->
    PartitionID =
        case aae_util:get_opt(partition_id, Opts) of
            {Idx, NVal} when is_integer(Idx), is_integer(NVal) ->
                {Idx, NVal};
            Idx when is_integer(Idx) ->
                Idx
        end,
    RootPath =
        case aae_util:get_opt(root_path, Opts) of
            RPOpt when is_list(RPOpt) ->
                aae_util:check_rootpath(RPOpt)
        end,
    IgnoreDisk =
        case aae_util:get_opt(ignore_disk, Opts, false) of
            IgnoreOpt when is_boolean(IgnoreOpt) ->
                IgnoreOpt
        end,
    case aae_util:get_opt(log_levels, Opts) of
        LLOpt when is_list(LLOpt) ->
            aae_util:set_loglevel(LLOpt);
        undefined ->
            ok
    end,
    CachePath = filename:join(RootPath, flatten_id(PartitionID)) ++ "/",
    % open_from_disk/1 must run even when its result is to be ignored, as
    % any files present on disk must still be cleared
    DiskResult = open_from_disk(CachePath),
    {StartTree, SaveSQN, IsRestored} =
        case DiskResult of
            {Tree, SQN} when Tree =/= none, not IgnoreDisk ->
                {Tree, SQN, true};
            _ ->
                {
                    leveled_tictac:new_tree(PartitionID, ?TREE_SIZE),
                    ?START_SQN,
                    false
                }
        end,
    ?STD_LOG(c0005, [IsRestored, PartitionID]),
    process_flag(trap_exit, true),
    InitialState =
        #state{
            save_sqn = SaveSQN,
            tree = StartTree,
            is_restored = IsRestored,
            root_path = CachePath,
            partition_id = PartitionID,
            % a restored (or deliberately fresh) tree is trustworthy enough
            % to be saved again on close
            safe_save = IsRestored orelse IgnoreDisk
        },
    {ok, InitialState, hibernate}.
handle_call(is_restored, _From, State) ->
    {reply, State#state.is_restored, State};
handle_call(fetch_root, _From, State = #state{tree = Tree}) when
    Tree =/= undefined
->
    {reply, leveled_tictac:fetch_root(State#state.tree), State};
handle_call({fetch_leaves, BranchIDs}, _From, State = #state{tree = Tree}) when
    Tree =/= undefined
->
    {reply, leveled_tictac:fetch_leaves(State#state.tree, BranchIDs), State};
handle_call(segment_count, _From, State = #state{dirty_segments = A}) ->
    {reply, length(A), State};
handle_call(close, _From, State) ->
    % Only persist on close when safe_save is set (tree restored from disk,
    % started fresh deliberately, or rebuilt via complete_load)
    case {State#state.safe_save, State#state.tree, State#state.root_path} of
        {true, Tree, RP} when Tree =/= undefined, RP =/= undefined ->
            save_to_disk(
                RP, State#state.save_sqn, Tree
            );
        _ ->
            ok
    end,
    {stop, normal, ok, State}.

%% alter: apply the hash change to the tree; while loading also queue the
%% change for replay, and clear the altered segment from dirty_segments
handle_cast(
    {alter, Key, CurrentHash, OldHash}, State = #state{change_queue = CQ}
) when is_list(CQ) ->
    {Tree0, Segment} =
        leveled_tictac:add_kv(
            State#state.tree,
            Key,
            {CurrentHash, OldHash},
            fun alterhash_fun/2,
            true
        ),
    State0 =
        case State#state.loading of
            true ->
                QCnt = State#state.queued_changes,
                State#state{
                    change_queue = [{Key, CurrentHash, OldHash} | CQ],
                    queued_changes = QCnt + 1
                };
            false ->
                State
        end,
    case State#state.dirty_segments of
        [] ->
            {noreply, State0#state{tree = Tree0}};
        DirtyList ->
            % The segment has been touched, so it may no longer be replaced
            % by the active fold's result
            DirtyList0 = lists:delete(Segment, DirtyList),
            {noreply, State0#state{tree = Tree0, dirty_segments = DirtyList0}}
    end;
handle_cast(start_load, State = #state{loading = Loading}) when
    Loading == false
->
    {noreply, State#state{
        loading = true,
        change_queue = [],
        queued_changes = 0,
        dirty_segments = [],
        active_fold = undefined
    }};
%% complete_load: replay the queued changes (oldest first, hence foldr over
%% the prepend-accumulated queue) onto the fold-built tree, and make it safe
%% to save
handle_cast({complete_load, Tree}, State = #state{loading = Loading}) when
    Loading == true
->
    LoadFun =
        fun({Key, CH, OH}, AccTree) ->
            leveled_tictac:add_kv(
                AccTree, Key, {CH, OH}, fun alterhash_fun/2
            )
        end,
    Tree0 = lists:foldr(LoadFun, Tree, State#state.change_queue),
    ?STD_LOG(c0008, [length(State#state.change_queue)]),
    {noreply,
        State#state{
            loading = false,
            change_queue = [],
            queued_changes = 0,
            tree = Tree0,
            safe_save = true
        },
        hibernate};
handle_cast({mark_dirtysegments, SegmentList, FoldGUID}, State) ->
    case State#state.loading of
        true ->
            % don't mess about with dirty segments, loading anyway
            {noreply, State};
        false ->
            {noreply, State#state{
                dirty_segments = SegmentList,
                active_fold = FoldGUID
            }}
    end;
%% replace_dirtysegments: only honoured when the FoldGUID matches the
%% currently active fold, and only for segments still marked dirty
handle_cast({replace_dirtysegments, SegmentMap, FoldGUID}, State) ->
    ChangeSegmentFoldFun =
        fun({SID, NewHash}, TreeAcc) ->
            case lists:member(SID, State#state.dirty_segments) of
                true ->
                    ?STD_LOG(c0006, [State#state.partition_id, SID, NewHash]),
                    leveled_tictac:alter_segment(SID, NewHash, TreeAcc);
                false ->
                    TreeAcc
            end
        end,
    case State#state.active_fold of
        FoldGUID ->
            UpdTree =
                lists:foldl(
                    ChangeSegmentFoldFun,
                    State#state.tree,
                    SegmentMap
                ),
            {noreply, State#state{tree = UpdTree}};
        _ ->
            {noreply, State}
    end;
handle_cast(destroy, State) ->
    ?STD_LOG(c0004, [State#state.partition_id]),
    {stop, normal, State};
handle_cast({log_levels, LogLevels}, State) ->
    ok = aae_util:set_loglevel(LogLevels),
    {noreply, State}.

%% NOTE(review): any unexpected message stops the cache (normal exit) rather
%% than being ignored - confirm this is intentional, as most gen_servers
%% would {noreply, State} here
handle_info(_Info, State) ->
    {stop, normal, State}.
%% @doc
%% Redact the (potentially large) change_queue from termination reports
format_status(Status) ->
    case maps:get(reason, Status, normal) of
        terminate ->
            State = maps:get(state, Status),
            maps:update(
                state,
                State#state{change_queue = redacted},
                Status
            );
        _ ->
            Status
    end.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%============================================================================
%%% Internal functions
%%%============================================================================

-spec flatten_id(partition_id()) -> list().
%% @doc
%% Flatten partition ID to make a folder name
flatten_id({Index, N}) ->
    integer_to_list(Index) ++ "_" ++ integer_to_list(N);
flatten_id(ID) ->
    integer_to_list(ID).

-spec save_to_disk(list(), integer(), leveled_tictac:tictactree()) -> ok.
%% @doc
%% Save the TreeCache to disk, with a checksum so that it can be
%% validated on read. Written first under ?PENDING_EXT then renamed to
%% the final name, so a crash mid-write never leaves a corrupt final file.
%% NOTE(review): the file payload construction is mangled in this copy
%% (`<>`) - it should be the CRC32 followed by the serialised tree
save_to_disk(RootPath, SaveSQN, TreeCache) ->
    Serialised = term_to_binary(leveled_tictac:export_tree(TreeCache)),
    CRC32 = erlang:crc32(Serialised),
    ok = filelib:ensure_dir(RootPath),
    PendingName = integer_to_list(SaveSQN) ++ ?PENDING_EXT,
    ?STD_LOG(c0003, [RootPath, PendingName]),
    ok = file:write_file(
        filename:join(RootPath, PendingName),
        <>,
        [raw]
    ),
    ok =
        file:rename(
            filename:join(RootPath, PendingName),
            form_cache_filename(RootPath, SaveSQN)
        ),
    ok.

-spec open_from_disk(list()) -> {leveled_tictac:tictactree() | none, integer()}.
%% @doc
%% Open most recently saved TicTac tree cache file on disk, deleting all
%% others both used and unused - to save an out of date tree from being used
%% following a subsequent crash
open_from_disk(RootPath) ->
    ok = filelib:ensure_dir(RootPath),
    {ok, Filenames} = file:list_dir(RootPath),
    % Collect the SQNs of final (.aae) files; delete pending (.pnd) files
    % outright; leave unrelated files alone
    FileFilterFun =
        fun(FN, FinalFiles) ->
            case filename:extension(FN) of
                ?PENDING_EXT ->
                    ?STD_LOG(c0001, [FN]),
                    ok = file:delete(filename:join(RootPath, FN)),
                    FinalFiles;
                ?FINAL_EXT ->
                    BaseFN =
                        filename:basename(filename:rootname(FN, ?FINAL_EXT)),
                    [list_to_integer(BaseFN) | FinalFiles];
                _ ->
                    FinalFiles
            end
        end,
    SQNList =
        lists:reverse(lists:sort(lists:foldl(FileFilterFun, [], Filenames))),
    case SQNList of
        [] ->
            {none, 1};
        [HeadSQN | Tail] ->
            % Delete all but the highest-SQN file, then consume (and delete)
            % that one; the returned SQN is the next to be used for saving
            DeleteFun =
                fun(SQN) ->
                    ok = file:delete(form_cache_filename(RootPath, SQN))
                end,
            lists:foreach(DeleteFun, Tail),
            FileToUse = form_cache_filename(RootPath, HeadSQN),
            case aae_util:safe_open(FileToUse) of
                {ok, STC} ->
                    ok = file:delete(FileToUse),
                    {
                        leveled_tictac:import_tree(binary_to_term(STC)),
                        HeadSQN + 1
                    };
                {error, Reason} ->
                    ?STD_LOG(c0002, [FileToUse, Reason]),
                    {none, 1}
            end
    end.

-spec form_cache_filename(list(), integer()) -> list().
%% @doc
%% Return the cache filename by combining the Root Path with the SQN
form_cache_filename(RootPath, SaveSQN) ->
    filename:join(RootPath, integer_to_list(SaveSQN) ++ ?FINAL_EXT).

-spec alterhash_fun(term(), term()) -> {binary(), {is_hash, integer()}}.
%% @doc
%% Function to calculate the hash change needed to make an alter into a
%% straight add, used as the BinExtractFun in leveled_tictac
alterhash_fun(Key, {CurrentHash, OldHash}) when
    is_binary(Key),
    is_integer(CurrentHash) orelse CurrentHash == none,
    is_integer(OldHash) orelse OldHash == none
->
    % TODO: Should move this function to leveled_tictac
    % - requires secret knowledge of implementation to perform
    % alter
    %
    % What we know about the addition of a value into a leveled_tictac tree is
    % that an addition is made by doing:
    % SegHash bxor (AltKeyHash bxor ClockHash)
    %
    % The ClockHash in this case is the output of this function. When an
    % alteration is being made the resulting Hash needs to still include the
    % AltKeyHash, so it is necessary to apply bxor AltKeyHash an odd number of
    % times. Hence an alteration or a null change must include the AltKeyHash
    % within the ClockHash
    UpdateHash =
        case {CurrentHash, OldHash} of
            {none, OldHash} when is_integer(OldHash) ->
                % Remove - treat like adding back in
                % the tictac will bxor this with the key - so don't need to
                % bxor this here again
                OldHash;
            {CurrentHash, none} when is_integer(CurrentHash) ->
                % Nothing to remove - straight add
                CurrentHash;
            {none, none} ->
                % This may be prompted in rehash.
                % In this case a neutral update is required (when bxor'd with
                % the key hash it should produce no change) - so return the
                % relevant hash of the key
                {_SegmentHash, AltKeyHash} =
                    leveled_tictac:keyto_doublesegment32(Key),
                AltKeyHash;
            {CurrentHash, OldHash} when
                is_integer(CurrentHash), is_integer(OldHash)
            ->
                % Alter - need to account for hashing with key
                % to remove the original
                {_SegmentHash, AltKeyHash} =
                    leveled_tictac:keyto_doublesegment32(Key),
                CurrentHash bxor (OldHash bxor AltKeyHash)
        end,
    {Key, {is_hash, UpdateHash}}.

%%%============================================================================
%%% Test
%%%============================================================================

-ifdef(TEST).

-include_lib("eunit/include/eunit.hrl").

%% Test helper: write two consecutive saved caches (SQNs 1 and 2) into
%% RootPath, returning the tree matching the most recent save
setup_savedcaches(RootPath) ->
    Tree0 = leveled_tictac:new_tree(test),
    Tree1 = leveled_tictac:add_kv(
        Tree0,
        {<<"K1">>},
        {<<"V1">>},
        fun({K}, {V}) -> {K, V} end
    ),
    Tree2 = leveled_tictac:add_kv(
        Tree1,
        {<<"K2">>},
        {<<"V2">>},
        fun({K}, {V}) -> {K, V} end
    ),
    ok = save_to_disk(RootPath, 1, Tree1),
    ok = save_to_disk(RootPath, 2, Tree2),
    Tree2.

clean_saveopen_test() ->
    % Check that pending files are ignored (and removed), and that the
    % highest SQN that is not pending is the one opened
    CachePath = "test/cache0/",
    aae_util:clean_subdir(CachePath),
    SavedTree = setup_savedcaches(CachePath),
    PendingFN = filename:join(CachePath, integer_to_list(3) ++ ?PENDING_EXT),
    ok = file:write_file(PendingFN, <<"delete">>),
    OtherFN = filename:join(CachePath, "alt.file"),
    ok = file:write_file(OtherFN, <<"no_delete">>),

    % Crash (via the unmatched case) rather than proceed if no tree opened
    {OpenedTree, NextSQN} =
        case open_from_disk(CachePath) of
            {T, SQN} when T =/= none ->
                {T, SQN}
        end,
    ?assertMatch(3, NextSQN),
    ?assertMatch([], leveled_tictac:find_dirtyleaves(SavedTree, OpenedTree)),
    % The opened file was deleted on open - a second open finds nothing
    ?assertMatch({none, 1}, open_from_disk(CachePath)),

    % Unrelated files survive; the pending file has been deleted
    ?assertMatch({ok, <<"no_delete">>}, file:read_file(OtherFN)),
    ?assertMatch({error, enoent}, file:read_file(PendingFN)),
    aae_util:clean_subdir(CachePath).

clear_old_cache_test() ->
    % Starting a new cache clears previously saved cache files; closing it
    % persists a single fresh file
    RootPath = "test/oldcache0/",
    PartitionID = 1,
    PartitionPath =
        filename:join(RootPath, integer_to_list(PartitionID)) ++ "/",
    aae_util:clean_subdir(PartitionPath),
    _Tree = setup_savedcaches(PartitionPath),
    {ok, BeforeFNs} = file:list_dir(PartitionPath),
    ?assertMatch(2, length(BeforeFNs)),
    {ok, Cache} = cache_new(RootPath, 1, undefined),
    {ok, DuringFNs} = file:list_dir(PartitionPath),
    ?assertMatch(0, length(DuringFNs)),
    ok = cache_close(Cache),
    {ok, AfterFNs} = file:list_dir(PartitionPath),
    ?assertMatch(1, length(AfterFNs)),
    aae_util:clean_subdir(RootPath).

dirty_saveopen_test() ->
    % A cache saved at close can be re-opened ({true, Pid}); if the saved
    % file has been removed, cache_open returns {false, Pid} - and such a
    % cache is not saved again at close until a load has completed
    RootPath = "test/dirtycache0/",
    aae_util:clean_subdir(RootPath),
    RP0 = filename:join(RootPath, integer_to_list(1)) ++ "/",
    {ok, Cpid0} = cache_new(RootPath, 1, undefined),
    Hash0 = erlang:phash2({<<"K1">>, <<"C1">>}),
    cache_alter(Cpid0, <<"K1">>, Hash0, none),
    ok = cache_close(Cpid0),
    % Close saves the cache at SQN 1
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 1))),
    {true, Cpid1} = cache_open(RootPath, 1, undefined),
    Hash1 = erlang:phash2({<<"K1">>, <<"C2">>}),
    cache_alter(Cpid1, <<"K1">>, Hash1, Hash0),
    ok = cache_close(Cpid1),
    % Re-saving advances the save SQN to 2
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 2))),
    aae_util:clean_subdir(RootPath),
    % Saved files removed - open reports false (no tree recovered)
    {false, Cpid2} = cache_open(RootPath, 1, undefined),
    Hash2 = erlang:phash2({<<"K1">>, <<"C3">>}),
    cache_alter(Cpid2, <<"K1">>, Hash2, Hash1),
    ok = cache_close(Cpid2),
    % No cache file is written at close when the open recovered nothing
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 1))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 2))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 3))),
    {false, Cpid3} = cache_open(RootPath, 1, undefined),
    Hash3 = erlang:phash2({<<"K1">>, <<"C4">>}),
    cache_alter(Cpid3, <<"K1">>, Hash3, Hash2),
    ok = cache_close(Cpid3),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 1))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 2))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 3))),
    ?assertMatch(false, filelib:is_file(form_cache_filename(RP0, 4))),
    {false, Cpid4} = cache_open(RootPath, 1, undefined),
    % Completing a load (startload -> completeload) makes the cache
    % saveable again - the save SQN restarts at 1
    cache_startload(Cpid4),
    cache_alter(Cpid4, <<"K1">>, Hash3, none),
    T0 = leveled_tictac:new_tree(raw, ?TREE_SIZE),
    cache_completeload(Cpid4, T0),
    ok = cache_close(Cpid4),
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 1))),
    {true, Cpid5} =
        cache_open(RootPath, 1, undefined),
    R0 = cache_root(Cpid5),
    % The reopened cache should differ from the empty load tree T0 in
    % exactly one branch - the one touched by the alter during the load
    [BranchID] =
        leveled_tictac:find_dirtysegments(R0, leveled_tictac:fetch_root(T0)),
    [{BranchID, Branch5}] = cache_leaves(Cpid5, [BranchID]),
    [{BranchID, Branch0}] = leveled_tictac:fetch_leaves(T0, [BranchID]),
    [SegmentID] =
        leveled_tictac:find_dirtysegments(Branch0, Branch5),
    % Each segment hash is a 32-bit integer within the leaf binary
    Pos = SegmentID * 4,
    <<_Pre:Pos/binary, HashToCheck:32/integer, _Post/binary>> = Branch5,
    {_SegmentHash, AltHash} = leveled_tictac:keyto_doublesegment32(<<"K1">>),
    % The stored segment is the clock hash bxor'd with the key hash - see
    % alterhash_fun
    ?assertMatch(Hash3, HashToCheck bxor AltHash),
    ok = cache_close(Cpid5),
    ?assertMatch(true, filelib:is_file(form_cache_filename(RP0, 2))),
    aae_util:clean_subdir(RootPath).

corrupt_save_test_() ->
    % Test generator - flipping every byte of the saved file can be slow,
    % so allow a 60s timeout
    {timeout, 60, fun corrupt_save_tester/0}.

corrupt_save_tester() ->
    % If any byte is corrupted on disk - then the result should be a failure
    % to open and the TreeCache reverting to empty
    RootPath = "test/cachecs/",
    aae_util:clean_subdir(RootPath),
    _Tree2 = setup_savedcaches(RootPath),
    BestFN = form_cache_filename(RootPath, 2),
    {ok, LatestCache} = file:read_file(BestFN),
    FlipByteFun =
        fun(Offset) ->
            aae_util:flip_byte(LatestCache, 1, Offset)
        end,
    BrokenCaches =
        lists:map(FlipByteFun, lists:seq(1, byte_size(LatestCache) - 1)),
    BrokenCacheCheckFun =
        fun(BrokenCache) ->
            ok = file:write_file(BestFN, BrokenCache),
            R = open_from_disk(RootPath),
            ?assertMatch({none, 1}, R)
        end,
    ok = lists:foreach(BrokenCacheCheckFun, BrokenCaches),
    aae_util:clean_subdir(RootPath).

format_status_test() ->
    % format_status should redact the change_queue when the reason is
    % terminate, but leave it intact for a normal status request
    RootPath = "test/formatstatus/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),
    {ok, C0} = cache_new(RootPath, PartitionID, undefined),
    % Dig the #state{} record out of the sys:get_status/1 response
    {status, _C0, {module, gen_server}, SItemL} = sys:get_status(C0),
    {data, [{"State", S}]} = lists:nth(3, lists:nth(5, SItemL)),
    ?assert(is_list(S#state.change_queue)),
    RedactedStatus = format_status(#{reason => terminate, state => S}),
    RST = maps:get(state, RedactedStatus),
    ?assertMatch(redacted, RST#state.change_queue),
    NormStatus = format_status(#{reason => normal, state => S}),
    NST = maps:get(state, NormStatus),
    ?assert(is_list(NST#state.change_queue)),
    ok = cache_destroy(C0).

simple_test() ->
    % Add, alter and remove keys through the cache process, then compare the
    % cache root to a tree built directly with the net result
    RootPath = "test/cache1/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),

    GenerateKeyFun = aae_util:test_key_generator(hash),

    InitialKeys = lists:map(GenerateKeyFun, lists:seq(1, 100)),
    AlternateKeys = lists:map(GenerateKeyFun, lists:seq(61, 80)),
    RemoveKeys = lists:map(GenerateKeyFun, lists:seq(81, 100)),

    {ok, AAECache0} = cache_new(RootPath, PartitionID, undefined),

    {AddFun, AlterFun, RemoveFun} = test_setup_funs(InitialKeys),

    lists:foreach(AddFun(AAECache0), InitialKeys),

    % Close and re-open so the alters are applied to a recovered cache
    ok = cache_close(AAECache0),

    {true, AAECache1} = cache_open(RootPath, PartitionID, undefined),

    lists:foreach(AlterFun(AAECache1), AlternateKeys),
    lists:foreach(RemoveFun(AAECache1), RemoveKeys),

    %% Now build the equivalent outside of the process
    %% Accounting up-front for the removals and the alterations
    KHL0 = lists:sublist(InitialKeys, 60) ++ AlternateKeys,
    DirectAddFun =
        fun({K, H}, TreeAcc) ->
            leveled_tictac:add_kv(
                TreeAcc,
                K,
                H,
                fun(Key, Value) ->
                    {Key, {is_hash, Value}}
                end
            )
        end,
    CompareTree =
        lists:foldl(
            DirectAddFun,
            leveled_tictac:new_tree(raw, ?TREE_SIZE),
            KHL0
        ),
    CompareRoot = leveled_tictac:fetch_root(CompareTree),
    Root = cache_root(AAECache1),
    ?assertMatch(Root, CompareRoot),

    ok = cache_destroy(AAECache1).

replace_test() ->
    % As simple_test, but with alters/removes queued during a load - the
    % cache is replaced by the load tree plus the queued changes
    RootPath = "test/cache1/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),
    GenerateKeyFun = aae_util:test_key_generator(hash),

    InitialKeys = lists:map(GenerateKeyFun, lists:seq(1, 100)),
    AlternateKeys = lists:map(GenerateKeyFun, lists:seq(61, 80)),
    RemoveKeys = lists:map(GenerateKeyFun, lists:seq(81, 100)),

    {ok, AAECache0} = cache_new(RootPath, PartitionID, undefined),

    {AddFun, AlterFun, RemoveFun} = test_setup_funs(InitialKeys),

    lists:foreach(AddFun(AAECache0), InitialKeys),
    ok = cache_startload(AAECache0),

    lists:foreach(AlterFun(AAECache0), AlternateKeys),
    lists:foreach(RemoveFun(AAECache0), RemoveKeys),

    %% Now build the equivalent outside of the process
    %% Accounting up-front for the removals and the alterations
    KHL0 = lists:sublist(InitialKeys, 60) ++ AlternateKeys,
    DirectAddFun =
        fun({K, H}, TreeAcc) ->
            leveled_tictac:add_kv(
                TreeAcc,
                K,
                H,
                fun(Key, Value) -> {Key, {is_hash, Value}} end
            )
        end,
    CompareTree =
        lists:foldl(
            DirectAddFun, leveled_tictac:new_tree(raw, ?TREE_SIZE), KHL0
        ),

    %% The load tree is a tree as would have been produced by a fold over a
    %% snapshot taken at the time all the initial keys added.
    %%
    %% If we now complete the load using this tree, the comparison should
    %% still match.  The cache should be replaced by one playing the stored
    %% alterations on the load tree.

    LoadTree =
        lists:foldl(
            DirectAddFun,
            leveled_tictac:new_tree(raw, ?TREE_SIZE),
            InitialKeys
        ),

    ok = cache_completeload(AAECache0, LoadTree),

    CompareRoot = leveled_tictac:fetch_root(CompareTree),
    Root = cache_root(AAECache0),
    ?assertMatch(Root, CompareRoot),

    cache_alter(AAECache0, <<"K_With0Hash">>, 0, none),
    % Key added with a Vclock that hashes to 0
    cache_alter(AAECache0, <<"K_With0Hash">>, (1 bsl 27) - 1, 0),
    % Key now has a Vclock that hashes to 2 ^ 27 -1 (the top of the hash range)
    CompareTree1 = DirectAddFun(
        {<<"K_With0Hash">>, (1 bsl 27) - 1}, CompareTree
    ),
    AlterRoot = cache_root(AAECache0),
    AlterComapreRoot = leveled_tictac:fetch_root(CompareTree1),
    % Altering a key which had a hash of 0 has the same impact as inserting from scratch
    ?assertMatch(AlterRoot, AlterComapreRoot),

    cache_alter(AAECache0, <<"K_With0Hash">>, none, (1 bsl 27) - 1),

    % Removing the key => as if it was never there
    NewRoot = cache_root(AAECache0),
    ?assertMatch(Root, NewRoot),

    % A none -> none alter (as in rehash) is neutral - the add, the neutral
    % change and the remove together leave the root unchanged
    cache_alter(AAECache0, <<"K_WithNeutralChange">>, 1, none),
    cache_alter(AAECache0, <<"K_WithNeutralChange">>, none, none),
    cache_alter(AAECache0, <<"K_WithNeutralChange">>, none, 1),

    UnchangedRoot = cache_root(AAECache0),
    ?assertMatch(Root, UnchangedRoot),

    ok = cache_destroy(AAECache0).

dirty_segment_test() ->
    % Segments marked dirty may only be replaced when the GUID matches, and
    % any alter or load between the mark and the replace voids the replace
    GetSegFun =
        fun(BinaryKey) ->
            % Derive the tree segment from the 48-bit key segment, as the
            % keystore does
            SegmentID = leveled_tictac:keyto_segment48(BinaryKey),
            aae_keystore:generate_treesegment(SegmentID)
        end,
    % Have clashes with keys of integer_to_binary/1 and integers -
    % [4241217,2576207,2363385]
    RootPath = "test/dirtysegment/",
    PartitionID = 99,
    aae_util:clean_subdir(RootPath ++ "/" ++ integer_to_list(PartitionID)),

    {ok, AAECache0} = cache_new(RootPath, PartitionID, undefined),
    AddFun =
        fun(I) ->
            K = integer_to_binary(I),
            H = erlang:phash2(rand:uniform(100000)),
            cache_alter(AAECache0, K, H, none)
        end,

    % Populate a range that includes one of the clashing keys (2363385)
    lists:foreach(AddFun, lists:seq(2350000, 2380000)),

    K0 = integer_to_binary(2363385),
    K1 = integer_to_binary(2576207),
    K2 = integer_to_binary(4241217),
    S0 = GetSegFun(K0),
    S1 = GetSegFun(K1),
    S2 = GetSegFun(K2),
    % All three keys map to the same tree segment
    ?assertMatch(true, S0 == S1),
    ?assertMatch(true, S0 == S2),
    % The segment splits into a branch (high bits) and leaf (low byte)
    BranchID = S0 bsr 8,
    LeafID = S0 band 255,

    Leaf0 = get_leaf(AAECache0, BranchID, LeafID),

    ?assertMatch(false, Leaf0 == 0),

    H1 = erlang:phash2(rand:uniform(100000)),
    H2 = erlang:phash2(rand:uniform(100000)),
    {_HK1, TTH1} = leveled_tictac:tictac_hash(K1, {is_hash, H1}),
    {_HK2, TTH2} = leveled_tictac:tictac_hash(K2, {is_hash, H2}),

    cache_alter(AAECache0, K1, H1, none),

    % An add changes the leaf by bxor of the tictac hash
    Leaf1 = get_leaf(AAECache0, BranchID, LeafID),
    ?assertMatch(Leaf1, Leaf0 bxor TTH1),

    GUID0 = leveled_util:generate_uuid(),
    NOTGUID = "NOT GUID",

    cache_markdirtysegments(AAECache0, [S0], GUID0),
    % Replace with wrong GUID ignored
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], NOTGUID),
    ?assertMatch(Leaf1, get_leaf(AAECache0, BranchID, LeafID)),

    % Replace with right GUID succeeds
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], GUID0),
    ?assertMatch(Leaf0, get_leaf(AAECache0, BranchID, LeafID)),

    GUID1 = leveled_util:generate_uuid(),
    cache_markdirtysegments(AAECache0, [S0], GUID1),
    cache_alter(AAECache0, K2, H2, none),
    Leaf2 = get_leaf(AAECache0, BranchID, LeafID),
    ?assertMatch(Leaf2, Leaf0 bxor TTH2),
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], GUID1),
    % Replace has been ignored due to update - so still Leaf2
    ?assertMatch(Leaf2, get_leaf(AAECache0, BranchID, LeafID)),

    GUID2 = leveled_util:generate_uuid(),
    cache_markdirtysegments(AAECache0, [S0], GUID2),
    cache_startload(AAECache0),
    cache_replacedirtysegments(AAECache0, [{S0, Leaf0}], GUID2),
    % Replace has been ignored due to load - so still Leaf2
    ?assertMatch(Leaf2, get_leaf(AAECache0, BranchID, LeafID)),

    ok = cache_destroy(AAECache0).

% Extract the 32-bit hash for a single leaf from the given branch of the
% cache's tree
get_leaf(AAECache0, BranchID, LeafID) ->
    [{BranchID, LeafBin}] = cache_leaves(AAECache0, [BranchID]),
    LeafStartPos = LeafID * 4,
    <<_Pre:LeafStartPos/binary, Leaf:32/integer, _Rest/binary>> = LeafBin,
    Leaf.

% Exercise callbacks not otherwise reached by the tests, for coverage
coverage_cheat_test() ->
    {ok, _State1} = code_change(null, #state{}, null),
    {stop, normal, _State2} = handle_info({'EXIT', self(), "Test"}, #state{}).

% Return {AddFun, AlterFun, RemoveFun} closures over a cache pid; the alter
% and remove funs look up the old hash from InitialKeys
test_setup_funs(InitialKeys) ->
    AddFun =
        fun(CachePid) ->
            fun({K, H}) ->
                cache_alter(CachePid, K, H, none)
            end
        end,
    AlterFun =
        fun(CachePid) ->
            fun({K, H}) ->
                {K, OH} = lists:keyfind(K, 1, InitialKeys),
                cache_alter(CachePid, K, H, OH)
            end
        end,
    RemoveFun =
        fun(CachePid) ->
            fun({K, _H}) ->
                {K, OH} = lists:keyfind(K, 1, InitialKeys),
                cache_alter(CachePid, K, none, OH)
            end
        end,
    {AddFun, AlterFun, RemoveFun}.

-endif.