├── .github └── workflows │ └── erlang.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── c_src └── riak_ensemble_clock.c ├── doc ├── Readme.md ├── cluster.png ├── cluster.svg ├── hierarchy.png ├── hierarchy.svg ├── peer_states.dot ├── peer_states.png ├── peer_states.svg ├── round_trip.png ├── round_trip.svg └── update_line_numbers.erl ├── eqc ├── sc.erl └── synctree_eqc.erl ├── include └── riak_ensemble_types.hrl ├── rebar.config ├── rebar3 ├── src ├── riak_ensemble.app.src ├── riak_ensemble_app.erl ├── riak_ensemble_backend.erl ├── riak_ensemble_basic_backend.erl ├── riak_ensemble_client.erl ├── riak_ensemble_clock.erl ├── riak_ensemble_config.erl ├── riak_ensemble_exchange.erl ├── riak_ensemble_lease.erl ├── riak_ensemble_manager.erl ├── riak_ensemble_msg.erl ├── riak_ensemble_peer.erl ├── riak_ensemble_peer_sup.erl ├── riak_ensemble_peer_tree.erl ├── riak_ensemble_peer_worker.erl ├── riak_ensemble_root.erl ├── riak_ensemble_router.erl ├── riak_ensemble_router_sup.erl ├── riak_ensemble_save.erl ├── riak_ensemble_state.erl ├── riak_ensemble_storage.erl ├── riak_ensemble_sup.erl ├── riak_ensemble_test.erl ├── riak_ensemble_util.erl ├── synctree.erl ├── synctree_ets.erl ├── synctree_leveldb.erl └── synctree_orddict.erl └── test ├── TESTS ├── basic_test.erl ├── corrupt_exchange_test.erl ├── corrupt_follower_test.erl ├── corrupt_segment_test.erl ├── corrupt_upper_test.erl ├── drop_write_test.erl ├── ens_test.erl ├── ensemble_tests_pure.erl ├── expand_test.erl ├── intercept.erl ├── leadership_watchers.erl ├── lease_test.erl ├── read_tombstone_test.erl ├── replace_members_test.erl ├── riak_ensemble_basic_backend_intercepts.erl ├── riak_ensemble_peer_intercepts.erl ├── rt_intercept.erl ├── run.sh ├── synctree_intercepts.erl ├── synctree_path_test.erl ├── synctree_pure.erl └── synctree_remote.erl /.github/workflows/erlang.yml: -------------------------------------------------------------------------------- 1 | name: Erlang CI 2 | 3 | on: 4 | 
push: 5 | branches: [ develop ] 6 | pull_request: 7 | branches: [ develop ] 8 | 9 | 10 | jobs: 11 | 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | otp: 20 | - "25.1" 21 | - "24.3" 22 | - "22.3" 23 | 24 | container: 25 | image: erlang:${{ matrix.otp }} 26 | 27 | steps: 28 | - uses: lukka/get-cmake@latest 29 | - uses: actions/checkout@v2 30 | - name: Compile 31 | run: ./rebar3 compile 32 | - name: Run xref and dialyzer 33 | run: ./rebar3 do xref, dialyzer 34 | - name: Run eunit 35 | run: ./rebar3 as gha do eunit 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.beam 2 | *.eqc 3 | .eunit/* 4 | ebin 5 | .local_dialyzer_plt 6 | deps/* 7 | edoc 8 | .rebar/ 9 | deps.test/* 10 | _build 11 | .rebar3 12 | rebar.lock 13 | priv/ 14 | c_src/ 15 | .DS_Store 16 | log/*\.log 17 | .eqc-info 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: erlang 2 | otp_release: 3 | - 20.3.8 4 | - 21.3 5 | - 22.3 6 | script: 7 | - chmod u+x rebar3 8 | - ./rebar3 do upgrade, compile, xref, dialyzer, eunit 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: compile rel cover test dialyzer 2 | REBAR=./rebar3 3 | 4 | compile: 5 | $(REBAR) compile 6 | 7 | clean: 8 | $(REBAR) clean 9 | 10 | cover: test 11 | $(REBAR) cover 12 | 13 | test: compile 14 | $(REBAR) as test do eunit 15 | 16 | dialyzer: 17 | $(REBAR) dialyzer 18 | 19 | xref: 20 | $(REBAR) xref 21 | 22 | check: test dialyzer xref 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Erlang CI Actions Status](https://github.com/basho/riak_ensemble/workflows/Erlang%20CI/badge.svg)](https://github.com/basho/riak_ensemble/actions) 2 | 3 | (Note: Work-in-progress documentation [here](https://github.com/basho/riak_ensemble/blob/develop/doc/Readme.md)) 4 | 5 | `riak_ensemble` is a consensus library that supports creating multiple 6 | consensus groups (ensembles). Each ensemble is a separate Multi-Paxos 7 | instance with its own leader, set of members, and state. 8 | 9 | Each ensemble also supports an extended API that provides consistent 10 | key/value operations. Conceptually, this is identical to treating each 11 | key as a separate Paxos entity. However, this isn't accomplished by 12 | having each key maintain its own Paxos group. Instead, an ensemble 13 | emulates per-key consensus through a combination of per-key and 14 | per-ensemble state. 15 | 16 | As mentioned, `riak_ensemble` supports multiple independent consensus 17 | groups. Ensembles are created dynamically, allowing applications to 18 | use `riak_ensemble` in whatever way best fits that 19 | application. Each ensemble also supports dynamic ensemble membership, 20 | using joint consensus to guarantee consistent membership transitions. 
21 | 22 | A given ensemble is configured to use a particular "ensemble backend". 23 | An ensemble backend is an implemented behavior that defines how a given 24 | peer actually stores/retrieves data. For example, `riak_ensemble` 25 | ships with the `riak_ensemble_basic_backend` which stores data as an 26 | `orddict` that is saved to disk using `term_to_binary`, whereas 27 | `riak_kv` implements the `riak_kv_ensemble_backend` that is used to 28 | interface with Riak's vnodes. 29 | 30 | Better documentation is coming soon. For now, this talk from RICON 31 | West 2013 discusses the design of `riak_ensemble` in the context of 32 | adding consistency to Riak 2.0: 33 | http://www.youtube.com/watch?v=gXJxbhca5Xg 34 | -------------------------------------------------------------------------------- /c_src/riak_ensemble_clock.c: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * 3 | * Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved. 4 | * 5 | * This file is provided to you under the Apache License, 6 | * Version 2.0 (the "License"); you may not use this file 7 | * except in compliance with the License. You may obtain 8 | * a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | * 19 | ********************************************************************/ 20 | #include "erl_nif.h" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #if defined(__MACH__) && defined(__APPLE__) 28 | #include 29 | #include 30 | #endif 31 | 32 | static ERL_NIF_TERM ATOM_OK; 33 | static ERL_NIF_TERM ATOM_ERROR; 34 | 35 | #if defined(__MACH__) && defined(__APPLE__) 36 | static mach_timebase_info_data_t timebase_info; 37 | #endif 38 | 39 | /*********************************************************************/ 40 | 41 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 42 | uint64_t posix_get_clock(clockid_t clock) 43 | { 44 | struct timespec ts; 45 | if(clock_gettime(clock, &ts) == -1) 46 | return 0; 47 | return ((uint64_t)ts.tv_sec * 1000000000) + ts.tv_nsec; 48 | } 49 | 50 | /* Note: Prefer CLOCK_BOOTTIME on Linux where supported, as this 51 | includes time spent in suspend. CLOCK_MONOTONIC may or may 52 | not include time spent in suspend -- it's CPU dependent. In 53 | practice, this shouldn't matter -- people don't typically 54 | suspend/resume production servers while under client load. 55 | Likewise, client TCP connections are unlikely to survive 56 | across reasonable suspend durations. 57 | */ 58 | 59 | uint64_t posix_monotonic_time(void) 60 | { 61 | uint64_t time; 62 | #if defined(CLOCK_BOOTTIME) 63 | if((time = posix_get_clock(CLOCK_BOOTTIME))) 64 | return time; 65 | #elif defined(CLOCK_MONOTONIC) 66 | if((time = posix_get_clock(CLOCK_MONOTONIC))) 67 | return time; 68 | #endif 69 | return 0; 70 | } 71 | #endif 72 | 73 | /********************************************************************* 74 | * See Apple technical note: * 75 | * https://developer.apple.com/library/mac/qa/qa1398/_index.html * 76 | *********************************************************************/ 77 | 78 | /* Note: mach_absolute_time() is based on the CPU timestamp counter, 79 | which is synchronized across all CPUs since Intel Nehalem. 
80 | Earlier CPUs do not provide this guarantee. It's unclear if 81 | Apple provides any correction for this behavior on older CPUs. 82 | We assume this doesn't matter in practice -- people don't use 83 | ancient OS X machines as production servers. 84 | */ 85 | 86 | #if defined(__MACH__) && defined(__APPLE__) 87 | uint64_t osx_monotonic_time(void) 88 | { 89 | uint64_t time; 90 | uint64_t timeNano; 91 | 92 | time = mach_absolute_time(); 93 | 94 | // Do the maths. We hope that the multiplication doesn't 95 | // overflow; the price you pay for working in fixed point. 96 | 97 | timeNano = time * timebase_info.numer / timebase_info.denom; 98 | 99 | return timeNano; 100 | } 101 | #endif 102 | 103 | /*********************************************************************/ 104 | 105 | static uint64_t get_monotonic_time() 106 | { 107 | uint64_t time = 0; 108 | 109 | #if defined(__MACH__) && defined(__APPLE__) 110 | time = osx_monotonic_time(); 111 | #endif 112 | 113 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 114 | time = posix_monotonic_time(); 115 | #endif 116 | 117 | return time; 118 | } 119 | 120 | /*********************************************************************/ 121 | 122 | static ERL_NIF_TERM monotonic_time(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 123 | { 124 | uint64_t time = get_monotonic_time(); 125 | 126 | if(time) { 127 | return enif_make_tuple2(env, ATOM_OK, enif_make_uint64(env, time)); 128 | } 129 | else { 130 | return ATOM_ERROR; 131 | } 132 | } 133 | 134 | /*********************************************************************/ 135 | 136 | static ERL_NIF_TERM monotonic_time_ms(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 137 | { 138 | uint64_t time = get_monotonic_time() / 1000000; 139 | 140 | if(time) { 141 | return enif_make_tuple2(env, ATOM_OK, enif_make_uint64(env, time)); 142 | } 143 | else { 144 | return ATOM_ERROR; 145 | } 146 | } 147 | 148 | /*********************************************************************/ 149 | 
150 | /* Setup shared by the load and upgrade callbacks: creates the "ok" and "error" atoms and, on Apple/Mach builds, fills timebase_info via mach_timebase_info() (its return value is discarded). */ static void init(ErlNifEnv *env) 151 | { 152 | ATOM_OK = enif_make_atom(env, "ok"); 153 | ATOM_ERROR = enif_make_atom(env, "error"); 154 | 155 | #if defined(__MACH__) && defined(__APPLE__) 156 | (void) mach_timebase_info(&timebase_info); 157 | #endif 158 | } 159 | 160 | /* Load callback: runs init() and returns 0; priv_data and load_info are not used. */ static int on_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info) 161 | { 162 | init(env); 163 | return 0; 164 | } 165 | 166 | /* Upgrade callback: runs init() again and returns 0; no private data is carried over. */ static int on_upgrade(ErlNifEnv* env, void** priv_data, void** old_priv_data, 167 | ERL_NIF_TERM load_info) 168 | { 169 | init(env); 170 | return 0; 171 | } 172 | 173 | /* Unload callback: empty body, nothing is done on unload. */ static void on_unload(ErlNifEnv *env, void *priv_data) 174 | { 175 | } 176 | 177 | /*********************************************************************/ 178 | 179 | /* NIF table: each entry is {Erlang function name, arity, C function}; both functions have arity 0. */ static ErlNifFunc nif_funcs[] = { 180 | {"monotonic_time", 0, monotonic_time}, 181 | {"monotonic_time_ms", 0, monotonic_time_ms} 182 | }; 183 | 184 | /* Registers nif_funcs for the module riak_ensemble_clock with the load, upgrade and unload callbacks above; the fourth argument is NULL. */ ERL_NIF_INIT(riak_ensemble_clock, nif_funcs, &on_load, NULL, &on_upgrade, &on_unload) 185 | -------------------------------------------------------------------------------- /doc/cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basho/riak_ensemble/d57c457ee738a60153d9307a12f4bc86d10c85bd/doc/cluster.png -------------------------------------------------------------------------------- /doc/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basho/riak_ensemble/d57c457ee738a60153d9307a12f4bc86d10c85bd/doc/hierarchy.png -------------------------------------------------------------------------------- /doc/hierarchy.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 46 | 48 | 49 | 51 | image/svg+xml 52 | 54 | 55 | 56 | 57 | 58 | 63 | 68 | 74 | 79 | 85 | 90 | 96 | 102 | 107 | 112 | 115 | 122 | riak_ensemble_sup 133 | 134 | 137 | 145 | riak_ensemble_router_sup 157 | 158 | 161
| 169 | riak_ensemble_peer_sup 180 | 181 | 184 | 186 | 194 | riak_ensemble_storage 206 | 217 | 218 | 219 | 222 | 230 | riak_ensemble_manager 242 | 243 | 263 | 271 | 279 | 287 | riak_ensemble_router_1 298 | 306 | 314 | 322 | riak_ensemble_peer 333 | 336 | 344 | 352 | 360 | peer workers 371 | 372 | 375 | 383 | lease worker 394 | 395 | 398 | 406 | sync tree 417 | 418 | 419 | 420 | -------------------------------------------------------------------------------- /doc/peer_states.dot: -------------------------------------------------------------------------------- 1 | digraph PeerStates 2 | { 3 | graph [ fontname = "Verdana", pad = 1 , rankdir=LR, ranksep=.50 ]; 4 | node [fontname = "Verdana", style=filled, fillcolor = "#ffccaa" ]; 5 | edge [fontname = "Verdana"]; 6 | Setup -> Probe; 7 | Probe -> Pending [label="Is pending member"] ; 8 | Probe -> Election [label="Quorum, no leader"]; 9 | Probe -> Following [label="Quorum, found leader"]; 10 | Probe -> Exchange [label="Quorum, tree untrusted"]; 11 | Election -> Prefollow [label="Got prepare"]; 12 | Election -> Following [label="Got commit"]; 13 | Election -> Prepare [label="Random time-out, send prepare"]; 14 | Pending -> Probe [label="Time out"]; 15 | Pending -> Prefollow [label="Got prepare"]; 16 | Pending -> Following [label="Got commit"]; 17 | Prefollow -> Following [label="Got epoch"]; 18 | Prefollow -> Probe [label="Timed out or got unexpected epoch"]; 19 | Following -> Probe [label="Time out or exchanged failed"]; 20 | Prepare -> Probe [label="Time out"]; 21 | Prepare -> Prelead [label="Quorum replied, send new epoch"]; 22 | Prelead -> Leading [label="Quorum accepted new epoch"]; 23 | Prelead -> Probe [label="Epoch replies timed out"]; 24 | Leading -> Probe [label="Most failures"]; 25 | Leading -> Prepare [label="KV failure"]; 26 | Exchange -> Election [label="Exchange complete"]; 27 | Exchange -> Probe [label="Failed"]; 28 | Repair -> Exchange [taillabel="Repair complete"]; 29 | "Any State" -> Repair [label="Tree 
corrupted"]; 30 | } 31 | -------------------------------------------------------------------------------- /doc/peer_states.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basho/riak_ensemble/d57c457ee738a60153d9307a12f4bc86d10c85bd/doc/peer_states.png -------------------------------------------------------------------------------- /doc/round_trip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basho/riak_ensemble/d57c457ee738a60153d9307a12f4bc86d10c85bd/doc/round_trip.png -------------------------------------------------------------------------------- /doc/round_trip.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 44 | 51 | 52 | 54 | 55 | 57 | image/svg+xml 58 | 60 | 61 | 62 | 63 | 64 | 69 | 79 | 87 | 95 | 101 | 107 | 113 | 119 | A 130 | B 141 | C 152 | 162 | 170 | 178 | 184 | 190 | A 201 | B 212 | C 223 | 229 | 230 | 231 | -------------------------------------------------------------------------------- /doc/update_line_numbers.erl: -------------------------------------------------------------------------------- 1 | %% Escript that will update the line numbers for links to functions 2 | %% in the markdown documentation. 3 | %% 4 | %% It expects function references like the following in the text: 5 | %% - [module:function/arity][] 6 | %% - [name for reference][module:function/arity] 7 | %% - [module:function/arity](path/to/source.erl#L34) 8 | %% 9 | %% And writes a markdown reference section at the end of the file. 10 | %% These references are not visible when markdown is rendered. 11 | %% 12 | %% [mod:fun1/arity]: ../src/mod_file.erl#L34 13 | %% [mod:fun2/arity]: ../src/mod_file.erl#L56 14 | %% ... 
%%
%% To update the docs, run this after compiling the library:
%%
%% $ cd riak_ensemble/doc
%% $ escript update_line_numbers.erl ../ebin *.md

-module(update_line_numbers).
-mode(compile).
-export([main/1]).

%% Escript entry point. The first argument is the directory holding the
%% compiled beam files; every remaining argument is a Markdown file to
%% update in place. With no arguments at all, prints a usage line and
%% exits with status 1 instead of crashing with function_clause.
main([Ebin | MdFiles]) ->
    [update_file(Ebin, MdFile) || MdFile <- MdFiles];
main([]) ->
    io:format(standard_error,
              "Usage: escript update_line_numbers.erl <ebin-dir> <file.md>...\n",
              []),
    halt(1).

%% Updates one Markdown file: collects the [mod:fun/arity] references in
%% it, looks their line numbers up in the beam files under Ebin, warns
%% about references that could not be resolved, and rewrites the file.
update_file(Ebin, MdFile) ->
    % Parse function references in file
    {ok, Text} = file:read_file(MdFile),
    WantedFuns = lists:usort(parse_funs(Text)),
    FunSet = sets:from_list(WantedFuns),
    WantedMods = lists:usort([Mod || {Mod, _, _} <- WantedFuns]),
    % Get line numbers from beam files into a dict
    PerModule = [get_line_nums(beam_filename(Ebin, Mod), Mod, FunSet)
                 || Mod <- WantedMods],
    FoundLines = lists:usort(lists:append(PerModule)),
    FoundSet = sets:from_list([MFA || {MFA, _Line} <- FoundLines]),
    MissingSet = sets:subtract(FunSet, FoundSet),
    sets:size(MissingSet) > 0 andalso report_missing(MissingSet),
    LineList = [fun_line_text(E) || E <- FoundLines],
    LineMap = dict:from_list(LineList),
    % Insert line number info into target files.
    % Assert the write succeeded: the result used to be discarded, so a
    % failed write was still reported as "Updated ...".
    ok = update_file(MdFile, Text, LineMap, LineList),
    io:format("Updated function line numbers in ~p\n", [MdFile]).

%% Prints a warning on standard_error listing every {Mod, Fun, Arity}
%% in MissingSet, in sorted order.
report_missing(MissingSet) ->
    MissingList = lists:sort(sets:to_list(MissingSet)),
    io:format(standard_error, "[WARNING] Could not find the following functions:\n", []),
    [io:format(standard_error, "\t~p:~p/~p\n", [M, F, A]) ||
        {M, F, A} <- MissingList].

%% Returns a {Mod, Fun, Arity} tuple for each distinct [mod:fun/arity]
%% reference found in Text, or [] when there is none.
%% Module and function names are turned into atoms straight from the
%% text. NOTE(review): presumably acceptable because this is a
%% short-lived escript run on the project's own docs -- do not reuse on
%% untrusted input.
parse_funs(Text) ->
    {ok, Re} = re:compile("\\[(\\w+):(\\w+)/(\\d+)\\]"),
    case re:run(Text, Re, [{capture, [1,2,3], binary}, global]) of
        nomatch ->
            [];
        {match, Matches} ->
            [{binary_to_atom(Mod, utf8),
              binary_to_atom(Fun, utf8),
              binary_to_integer(Arity)}
             || [Mod, Fun, Arity] <- lists:usort(Matches)]
    end.

%% Path of the beam file for module Mod inside directory Ebin.
beam_filename(Ebin, Mod) ->
    filename:join(Ebin, atom_to_list(Mod) ++ ".beam").
%% Rewrites the function references in Text and writes the result to
%% Filename. Returns the result of file:write_file/2.
update_file(Filename, Text, LineMap, LineList) ->
    NewText = update_lines(Text, LineMap, LineList),
    file:write_file(Filename, NewText).

% Extracts all function line numbers from abstract code chunk in beam file.
%
% Returns [{{Mod, Fun, Arity}, Line}] for the functions of Mod that are
% members of FunSet. When the beam file cannot be read, or was compiled
% without debug_info, a warning is printed and [] is returned instead of
% crashing on a badmatch, so the affected functions can be reported as
% missing by the caller. The line is taken with erl_anno:line/1 because
% the annotation of a form may be {Line, Column} rather than a bare
% integer.
get_line_nums(BeamFile, Mod, FunSet) ->
    case beam_lib:chunks(BeamFile, [abstract_code]) of
        {ok, {_, [{abstract_code, no_abstract_code}]}} ->
            warn_no_lines(BeamFile, no_abstract_code);
        {ok, {_, [{abstract_code, {_, Items}}]}} ->
            [{{Mod, Fun, Arity}, erl_anno:line(Anno)}
             || {function, Anno, Fun, Arity, _} <- Items,
                sets:is_element({Mod, Fun, Arity}, FunSet)];
        {error, beam_lib, Reason} ->
            warn_no_lines(BeamFile, Reason)
    end.

% Prints why no line numbers are available for BeamFile; returns [].
warn_no_lines(BeamFile, Reason) ->
    io:format(standard_error,
              "[WARNING] No line information from ~ts: ~p\n",
              [BeamFile, Reason]),
    [].

% Turns a {{Mod, Fun, Arity}, Line} entry into the pair
% {<<"[mod:fun/arity]">>, <<"../src/mod.erl#LNN">>}.
fun_line_text({{Mod, Fun, Arity}, Line}) ->
    {mfa_bin(Mod, Fun, Arity), line_url(Mod, Line)}.

% Renders a function reference as the binary <<"[M:F/A]">>.
mfa_bin(M, F, A) ->
    BM = atom_to_binary(M, utf8),
    BF = atom_to_binary(F, utf8),
    BA = integer_to_binary(A),
    <<"[", BM/binary, ":", BF/binary,"/", BA/binary, "]">>.

% Relative link to line L of the source file of module M.
line_url(M, L) ->
    list_to_binary("../src/" ++ atom_to_list(M) ++ ".erl#L" ++ integer_to_list(L)).

% Returns the new file contents as a list of binaries: the text with
% inline links rewritten and old reference definitions removed, followed
% by one freshly generated "[mod:fun/arity]: url" line per LineList entry.
update_lines(Text, LineMap, LineList) ->
    % Splits into [Text, FunRef, Suffix, Text, ...] where Suffix is
    % either a ": url" reference definition line or a "(url)" link target.
    Tokens = re:split(Text, "(\\[\\w+:\\w+/\\d+\\])\\s*(:.*$\n|\\([^)]*\\))", [multiline]),
    Lines1 = replace_line_nums(Tokens, LineMap, []),
    RefLines = [Part || {Fun, Line} <- LineList,
                        Part <- [Fun, <<": ">>, Line, <<"\n">>]],
    Lines1 ++ RefLines.

% Walks the token list produced in update_lines/3.
replace_line_nums([], _, Acc) ->
    lists:reverse(Acc);
% An existing reference definition ("[m:f/a]: url"): dropped, the
% reference section is regenerated by update_lines/3.
replace_line_nums([_, <<":", _/binary>> | Rest], LineMap, Acc) ->
    replace_line_nums(Rest, LineMap, Acc);
% An inline link ("[m:f/a](url)"): becomes "[m:f/a][]" when the function
% is known, otherwise both tokens are kept as they are.
replace_line_nums([MaybeFun, Bin = <<"(", _/binary>> | Rest], LineMap, Acc) ->
    case dict:find(MaybeFun, LineMap) of
        {ok, _Line} ->
            NewText = list_to_binary([MaybeFun, "[]"]),
            replace_line_nums(Rest, LineMap, [NewText|Acc]);
        _ ->
            replace_line_nums(Rest, LineMap, [Bin, MaybeFun | Acc])
    end;
% Any other token is plain text and is kept.
replace_line_nums([Bin|Rest], LineMap, Acc) ->
    replace_line_nums(Rest, LineMap, [Bin|Acc]).
108 | 109 | -------------------------------------------------------------------------------- /eqc/synctree_eqc.erl: -------------------------------------------------------------------------------- 1 | %% Port of EQC test from riak_core/hashtree.erl 2 | 3 | -module(synctree_eqc). 4 | -export([prop_correct/0]). 5 | 6 | -include_lib("eqc/include/eqc.hrl"). 7 | -include_lib("eunit/include/eunit.hrl"). 8 | 9 | bin(X) -> 10 | list_to_binary(integer_to_list(X)). 11 | 12 | objects() -> 13 | ?SIZED(Size, objects(Size+3)). 14 | 15 | objects(N) -> 16 | ?LET(Keys, shuffle(lists:seq(1,N)), 17 | [{bin(K), binary(8)} || K <- Keys] 18 | ). 19 | 20 | lengths(N) -> 21 | ?LET(MissingN1, choose(0,N), 22 | ?LET(MissingN2, choose(0,N-MissingN1), 23 | ?LET(DifferentN, choose(0,N-MissingN1-MissingN2), 24 | {MissingN1, MissingN2, DifferentN}))). 25 | 26 | mutate(Binary) -> 27 | L1 = binary_to_list(Binary), 28 | [X|Xs] = L1, 29 | X2 = (X+1) rem 256, 30 | L2 = [X2|Xs], 31 | list_to_binary(L2). 32 | 33 | prop_correct() -> 34 | ?FORALL(Objects, objects(), 35 | ?FORALL({MissingN1, MissingN2, DifferentN}, lengths(length(Objects)), 36 | begin 37 | {RemoteOnly, Objects2} = lists:split(MissingN1, Objects), 38 | {LocalOnly, Objects3} = lists:split(MissingN2, Objects2), 39 | {Different, Same} = lists:split(DifferentN, Objects3), 40 | 41 | Different2 = [{Key, mutate(Hash)} || {Key, Hash} <- Different], 42 | 43 | Insert = fun(Tree, Vals) -> 44 | lists:foldl(fun({Key, Hash}, Acc) -> 45 | synctree:insert(Key, Hash, Acc) 46 | end, Tree, Vals) 47 | end, 48 | 49 | [begin 50 | A1 = synctree:new({0,Id}), 51 | B1 = synctree:new({0,Id}), 52 | 53 | A2 = Insert(A1, Same), 54 | A3 = Insert(A2, LocalOnly), 55 | A4 = Insert(A3, Different), 56 | 57 | B2 = Insert(B1, Same), 58 | B3 = Insert(B2, RemoteOnly), 59 | B4 = Insert(B3, Different2), 60 | 61 | A5 = A4, 62 | B5 = B4, 63 | 64 | Expected = 65 | [{missing, Key} || {Key, _} <- RemoteOnly] ++ 66 | [{remote_missing, Key} || {Key, _} <- LocalOnly] ++ 67 | [{different, 
Key} || {Key, _} <- Different], 68 | 69 | KeyDiff = compare(A5, B5), 70 | 71 | ?assertEqual(lists:usort(Expected), 72 | lists:usort(KeyDiff)), 73 | 74 | %% Reconcile trees 75 | A6 = Insert(A5, RemoteOnly), 76 | B6 = Insert(B5, LocalOnly), 77 | B7 = Insert(B6, Different), 78 | A7 = A6, 79 | B8 = B7, 80 | ?assertEqual([], compare(A7, B8)), 81 | true 82 | end || Id <- lists:seq(0, 10)], 83 | %% close(A0), 84 | %% close(B0), 85 | %% destroy(A0), 86 | %% destroy(B0), 87 | true 88 | end)). 89 | 90 | compare(T1, T2) -> 91 | KeyDiff = synctree:local_compare(T1, T2), 92 | [case Delta of 93 | {Key, {'$none', _}} -> 94 | {missing, Key}; 95 | {Key, {_, '$none'}} -> 96 | {remote_missing, Key}; 97 | {Key, {_, _}} -> 98 | {different, Key} 99 | end || Delta <- KeyDiff]. 100 | -------------------------------------------------------------------------------- /include/riak_ensemble_types.hrl: -------------------------------------------------------------------------------- 1 | -type ensemble_id() :: term(). 2 | -type peer_id() :: {term(), node()}. 3 | -type leader_id() :: undefined | peer_id(). 4 | -type fixme() :: any(). 5 | -type views() :: [[peer_id()]]. 6 | -type peer_change() :: term(). %% FIXME 7 | -type change_error() :: already_member | not_member. 8 | -type std_reply() :: timeout | failed | unavailable | nack | {ok, term()}. 9 | -type maybe_pid() :: pid() | undefined. 10 | -type peer_pids() :: [{peer_id(), maybe_pid()}]. 11 | -type peer_reply() :: {peer_id(), term()}. 12 | -type epoch() :: integer(). 13 | -type seq() :: integer(). 14 | -type vsn() :: {epoch(), seq()}. 15 | -type peer_info() :: nodedown | undefined | {any(), boolean(), epoch()}. 16 | 17 | -type orddict(Key,Val) :: [{Key, Val}]. 18 | -type ordsets(Val) :: [Val]. 19 | 20 | -record(ensemble_info, {vsn :: vsn(), 21 | mod = riak_ensemble_basic_backend :: module(), 22 | args = [] :: [any()], 23 | leader :: leader_id(), 24 | views :: [[peer_id()]], 25 | seq :: {integer(), integer()} | undefined 26 | }). 
27 | -type ensemble_info() :: #ensemble_info{}. 28 | 29 | %% -type ensemble_info() :: {leader_id(), [peer_id()], {integer(), integer()}, module(), [any()]}. 30 | 31 | -define(ENSEMBLE_TICK, riak_ensemble_config:tick()). 32 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {minimum_otp_vsn, "22.0"}. 2 | 3 | {erl_opts, [debug_info, 4 | warnings_as_errors, 5 | warn_untyped_record, 6 | {platform_define, "^[0-9]+", namespaced_types}]}. 7 | 8 | {eunit_opts, [verbose]}. 9 | {edoc_opts, [preprocess, 10 | {dir, "edoc"}]}. 11 | {cover_enabled, true}. 12 | {xref_checks, [undefined_function_calls]}. 13 | {deps, [ 14 | {eleveldb, {git, "https://github.com/basho/eleveldb.git", {branch, "develop"}}} 15 | ]}. 16 | 17 | {profiles, [ 18 | {gha, [{erl_opts, [{d, 'GITHUBEXCLUDE'}]}]} 19 | ]}. 20 | 21 | {plugins, [{eqc_rebar, {git, "https://github.com/Quviq/eqc-rebar", {branch, "master"}}}, pc]}. 22 | {provider_hooks, 23 | [ 24 | {pre, 25 | [ 26 | {compile, {pc, compile}}, 27 | {clean, {pc, clean}} 28 | ] 29 | } 30 | ] 31 | }. 32 | 33 | {port_specs, 34 | [{".*", "priv/riak_ensemble.so", 35 | ["c_src/*.c*"], 36 | [{env, [{"CFLAGS", "$CFLAGS"}]}] 37 | }]}. 
38 | -------------------------------------------------------------------------------- /rebar3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basho/riak_ensemble/d57c457ee738a60153d9307a12f4bc86d10c85bd/rebar3 -------------------------------------------------------------------------------- /src/riak_ensemble.app.src: -------------------------------------------------------------------------------- 1 | {application, riak_ensemble, 2 | [ 3 | {description, "Multi-Paxos framework in Erlang"}, 4 | {vsn, git}, 5 | {registered, []}, 6 | {applications, [ 7 | kernel, 8 | stdlib, 9 | eleveldb 10 | ]}, 11 | {mod, { riak_ensemble_app, []}}, 12 | {env, []} 13 | ]}. 14 | -------------------------------------------------------------------------------- /src/riak_ensemble_app.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(riak_ensemble_app). 21 | 22 | -behaviour(application). 23 | 24 | %% Application callbacks 25 | -export([start/2, stop/1]). 
26 | 27 | %% =================================================================== 28 | %% Application callbacks 29 | %% =================================================================== 30 | 31 | start(_StartType, _StartArgs) -> 32 | riak_ensemble_sup:start_link(). 33 | 34 | stop(_State) -> 35 | ok. 36 | -------------------------------------------------------------------------------- /src/riak_ensemble_backend.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc 22 | %% This module defines riak_ensemble_backend behavior used to implement 23 | %% custom peer behavior. 24 | 25 | -module(riak_ensemble_backend). 26 | 27 | -include_lib("riak_ensemble_types.hrl"). 28 | 29 | %% API 30 | -export([start/4, 31 | get_obj/3, 32 | set_obj/4, 33 | latest_obj/3, 34 | reply/2, 35 | pong/1]). 36 | 37 | %%=================================================================== 38 | 39 | -type state() :: any(). 40 | -type obj() :: any(). 41 | -type key() :: any(). 42 | -type value() :: any(). 
43 | -type from() :: {{_,_}, peer_id()} | 44 | {riak_ensemble_msg:msg_from(), peer_id()}. 45 | 46 | -export_type([from/0]). 47 | 48 | %%=================================================================== 49 | 50 | %% Initialization callback that returns initial module state. 51 | -callback init(ensemble_id(), peer_id(), [any()]) -> state(). 52 | 53 | %% Create a new opaque key/value object using whatever representation 54 | %% the defining module desires. 55 | -callback new_obj(epoch(), seq(), key(), value()) -> obj(). 56 | 57 | %% Accessors to retrieve epoch/seq/key/value from an opaque object. 58 | -callback obj_epoch(obj()) -> epoch(). 59 | -callback obj_seq (obj()) -> seq(). 60 | -callback obj_key (obj()) -> term(). 61 | -callback obj_value(obj()) -> term(). 62 | 63 | %% Setters for epoch/seq/value for opaque objects. 64 | -callback set_obj_epoch(epoch(), obj()) -> obj(). 65 | -callback set_obj_seq (seq(), obj()) -> obj(). 66 | -callback set_obj_value(term(), obj()) -> obj(). 67 | 68 | %% Callback for get operations. Responsible for sending a reply to the 69 | %% waiting `from' process using {@link reply/2}. 70 | -callback get(key(), from(), state()) -> state(). 71 | 72 | %% Callback for put operations. Responsible for sending a reply to 73 | %% the waiting `from' process using {@link reply/2}. 74 | -callback put(key(), obj(), from(), state()) -> state(). 75 | 76 | %% Callback for periodic leader tick. This function is called periodically 77 | %% by an elected leader. Can be used to implement custom housekeeping. 78 | -callback tick(epoch(), seq(), peer_id(), views(), state()) -> state(). 79 | 80 | %% Callback used to ensure that the backend is still healthy. If `async' 81 | %% is returned, backend should eventually call {@link pong/1}. 82 | -callback ping(pid(), state()) -> {ok|async|failed, state()}. 83 | 84 | %% Callback to handle `'DOWN'` messages from monitored, backend related 85 | %% processes. 
Returns `false` to indicate that this is a reference not 86 | %% related to the backend. Returns `{ok, state()}` to indicate that the 87 | %% backend handled the message and that the peer can continue executing as 88 | %% before. Returns `{reset, state()}` to indicate that in flight requests are 89 | %% likely to fail and that any thing the peer needs to do to reset itself, such 90 | %% as restarting workers, should occur. 91 | -callback handle_down(reference(), pid(), term(), state()) -> false | 92 | {ok, state()} | 93 | {reset, state()}. 94 | %% Callback used to determine if peers using this backend can be started. 95 | -callback ready_to_start() -> boolean(). 96 | 97 | %% Callback that allows a backend to override where a peer's synctree 98 | %% is stored. By default, each peer has an entirely independent on-disk 99 | %% synctree. Using this callback, a backend could do a M:1 or M:N 100 | %% style mapping where multiple peers share an on-disk tree. 101 | %% 102 | %% If this function does not return `default`, it must return a tuple 103 | %% where the first element is an unique tree-id (as a binary) and the 104 | %% second element is the filename for the synctree. Multiple trees 105 | %% stored in the same on-disk synctree will be internally partitioned 106 | %% using the provided tree-id. 107 | -callback synctree_path(ensemble_id(), peer_id()) -> default | 108 | {binary(), string()}. 109 | 110 | %%=================================================================== 111 | 112 | start(Mod, Ensemble, Id, Args) -> 113 | Mod:init(Ensemble, Id, Args). 114 | 115 | get_obj(Mod, X, Obj) -> 116 | case X of 117 | epoch -> 118 | Mod:obj_epoch(Obj); 119 | seq -> 120 | Mod:obj_seq(Obj); 121 | key -> 122 | Mod:obj_key(Obj); 123 | value -> 124 | Mod:obj_value(Obj) 125 | end. 
126 | 127 | set_obj(Mod, X, Val, Obj) -> 128 | case X of 129 | epoch -> 130 | Mod:set_obj_epoch(Val, Obj); 131 | seq -> 132 | Mod:set_obj_seq(Val, Obj); 133 | value -> 134 | Mod:set_obj_value(Val, Obj) 135 | end. 136 | 137 | latest_obj(Mod, ObjA, ObjB) -> 138 | A = {Mod:obj_epoch(ObjA), Mod:obj_seq(ObjA)}, 139 | B = {Mod:obj_epoch(ObjB), Mod:obj_seq(ObjB)}, 140 | case B > A of 141 | true -> ObjB; 142 | false -> ObjA 143 | end. 144 | 145 | -spec reply(from(), any()) -> ok. 146 | reply({{To, Tag}, _Id}, Reply) -> 147 | catch To ! {Tag, Reply}, 148 | ok; 149 | reply({From, Id}, Reply) -> 150 | riak_ensemble_msg:reply(From, Id, Reply), 151 | ok. 152 | 153 | -spec pong(pid()) -> ok. 154 | pong(From) -> 155 | riak_ensemble_peer:backend_pong(From). 156 | -------------------------------------------------------------------------------- /src/riak_ensemble_basic_backend.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 
18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc 22 | %% Implementation of the {@link riak_ensemble_backend} behavior that 23 | %% stores simple key/value objects in an in-process orddict that 24 | %% is synchronously written to disk on each put. 25 | %% 26 | %% Note: this is used as the peer type for the built-in root ensemble 27 | %% that stores system-wide metadata, bootstraps additional ensembles, 28 | %% etc. 29 | 30 | -module(riak_ensemble_basic_backend). 31 | -behaviour(riak_ensemble_backend). 32 | 33 | -export([init/3, new_obj/4]). 34 | -export([obj_epoch/1, obj_seq/1, obj_key/1, obj_value/1]). 35 | -export([set_obj_epoch/2, set_obj_seq/2, set_obj_value/2]). 36 | -export([get/3, put/4, tick/5, ping/2, ready_to_start/0]). 37 | -export([synctree_path/2]). 38 | -export([handle_down/4]). 39 | 40 | -include_lib("kernel/include/logger.hrl"). 41 | 42 | -include_lib("riak_ensemble_types.hrl"). 43 | 44 | -record(obj, {epoch :: epoch(), 45 | seq :: seq(), 46 | key :: term(), 47 | value :: term()}). 48 | 49 | -record(state, {savefile :: file:filename(), 50 | id :: peer_id(), 51 | data :: orddict:orddict()}). 52 | 53 | -type obj() :: #obj{}. 54 | -type state() :: #state{}. 55 | -type key() :: any(). 56 | -type value() :: any(). 57 | 58 | %%=================================================================== 59 | 60 | -spec init(ensemble_id(), peer_id(), []) -> state(). 61 | init(Ensemble, Id, []) -> 62 | %% TODO: Any concerns about using hash here? 63 | %% TODO: For root ensemble, should we use different naming scheme? 64 | <<Hash:160/integer>> = riak_ensemble_util:sha(term_to_binary({Ensemble, Id})), % digest read as one big-endian integer; 160 bits assumes sha/1 is SHA-1 -- TODO confirm 65 | Name = integer_to_list(Hash), % decimal digest names the per-peer save file 66 | {ok, Root} = application:get_env(riak_ensemble, data_root), 67 | File = filename:join([Root, "ensembles", Name ++ "_kv"]), 68 | Data = reload_data(File), 69 | #state{savefile=File, data=Data, id=Id}. 
70 | 71 | %%=================================================================== 72 | 73 | -spec new_obj(epoch(), seq(), key(), value()) -> obj(). 74 | new_obj(Epoch, Seq, Key, Value) -> 75 | #obj{epoch=Epoch, seq=Seq, key=Key, value=Value}. 76 | 77 | %%=================================================================== 78 | 79 | -spec obj_epoch(obj()) -> epoch(). 80 | obj_epoch(Obj) -> 81 | Obj#obj.epoch. 82 | 83 | -spec obj_seq(obj()) -> seq(). 84 | obj_seq(Obj) -> 85 | Obj#obj.seq. 86 | 87 | -spec obj_key(obj()) -> key(). 88 | obj_key(Obj) -> 89 | Obj#obj.key. 90 | 91 | -spec obj_value(obj()) -> value(). 92 | obj_value(Obj) -> 93 | Obj#obj.value. 94 | 95 | %%=================================================================== 96 | 97 | -spec set_obj_epoch(epoch(), obj()) -> obj(). 98 | set_obj_epoch(Epoch, Obj) -> 99 | Obj#obj{epoch=Epoch}. 100 | 101 | -spec set_obj_seq(seq(), obj()) -> obj(). 102 | set_obj_seq(Seq, Obj) -> 103 | Obj#obj{seq=Seq}. 104 | 105 | -spec set_obj_value(value(), obj()) -> obj(). 106 | set_obj_value(Value, Obj) -> 107 | Obj#obj{value=Value}. 108 | 109 | %%=================================================================== 110 | 111 | -spec get(key(), riak_ensemble_backend:from(), state()) -> state(). 112 | get(Key, From, State=#state{data=Data}) -> 113 | Reply = case orddict:find(Key, Data) of 114 | {ok, Value} -> 115 | Value; 116 | error -> 117 | notfound 118 | end, 119 | riak_ensemble_backend:reply(From, Reply), 120 | State. 121 | 122 | -spec put(key(), obj(), riak_ensemble_backend:from(), state()) -> state(). 123 | put(Key, Obj, From, State=#state{savefile=File, data=Data}) -> 124 | Data2 = orddict:store(Key, Obj, Data), 125 | save_data(File, Data2), 126 | riak_ensemble_backend:reply(From, Obj), 127 | State#state{data=Data2}. 128 | 129 | %%=================================================================== 130 | 131 | -spec tick(epoch(), seq(), peer_id(), views(), state()) -> state(). 
132 | tick(_Epoch, _Seq, _Leader, _Views, State) -> 133 | State. 134 | 135 | -spec ping(pid(), state()) -> {ok, state()}. 136 | ping(_From, State) -> 137 | {ok, State}. 138 | 139 | ready_to_start() -> 140 | true. 141 | 142 | synctree_path(_Ensemble, _Id) -> 143 | default. 144 | 145 | %%=================================================================== 146 | 147 | -spec handle_down(reference(), pid(), term(), state()) -> false. 148 | handle_down(_Ref, _Pid, _Reason, _State) -> 149 | false. 150 | 151 | %%=================================================================== 152 | 153 | -spec reload_data(file:filename()) -> orddict:orddict(). 154 | reload_data(File) -> 155 | case load_saved_data(File) of 156 | {ok, Data} -> 157 | Data; 158 | not_found -> 159 | [] 160 | end. 161 | 162 | -spec load_saved_data(file:filename()) -> not_found | {ok, orddict:orddict()}. 163 | load_saved_data(File) -> 164 | case riak_ensemble_util:read_file(File) of 165 | {ok, <<CRC:32/integer, Binary/binary>>} -> % on-disk layout written by save_data/2: 32-bit CRC, then the term_to_binary payload 166 | case erlang:crc32(Binary) of 167 | CRC -> 168 | try 169 | {ok, binary_to_term(Binary)} 170 | catch 171 | _:_ -> 172 | ?LOG_WARNING("Corrupted state detected. " 173 | "Reverting to empty state."), 174 | not_found 175 | end; 176 | _ -> 177 | not_found 178 | end; 179 | {error, _} -> 180 | not_found 181 | end. 182 | 183 | -spec save_data(file:filename(), orddict:orddict()) -> ok. 184 | save_data(File, Data) -> 185 | Binary = term_to_binary(Data), 186 | CRC = erlang:crc32(Binary), 187 | ok = filelib:ensure_dir(File), 188 | ok = riak_ensemble_util:replace_file(File, [<<CRC:32/integer>>, Binary]), % CRC prefix lets load_saved_data/1 detect a corrupted payload 189 | ok. 190 | -------------------------------------------------------------------------------- /src/riak_ensemble_client.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 
4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -module(riak_ensemble_client). 22 | -export([kget/3, kupdate/5, kput_once/4, kover/4, kdelete/3, ksafe_delete/4]). 23 | -export([kget/4, kupdate/6, kput_once/5, kover/5, kdelete/4, ksafe_delete/5]). 24 | -export([kget/5]). 25 | 26 | -include_lib("riak_ensemble_types.hrl"). 27 | 28 | -type obj() :: any(). 29 | -type client_reply() :: {ok, obj()} | 30 | {error, failed | timeout | unavailable}. 31 | 32 | %%%=================================================================== 33 | 34 | -spec kget(_,_,timeout()) -> client_reply(). 35 | kget(Ensemble, Key, Timeout) -> 36 | kget(node(), Ensemble, Key, Timeout). 37 | 38 | -spec kget(node(),_,_,timeout()) -> client_reply(). 39 | kget(Node, Ensemble, Key, Timeout) -> 40 | kget(Node, Ensemble, Key, Timeout, []). 41 | 42 | -spec kget(node(),_,_,timeout(),_) -> client_reply(). 43 | kget(Node, Ensemble, Key, Timeout, Opts) -> 44 | maybe_execute(Node, 45 | fun() -> 46 | translate(riak_ensemble_peer:kget(Node, Ensemble, Key, Timeout, Opts)) 47 | end). 48 | 49 | %%%=================================================================== 50 | 51 | -spec kupdate(_,_,_,_,timeout()) -> client_reply(). 52 | kupdate(Ensemble, Key, Obj, NewObj, Timeout) -> 53 | kupdate(node(), Ensemble, Key, Obj, NewObj, Timeout). 
54 | 55 | -spec kupdate(node(),_,_,_,_,timeout()) -> client_reply(). 56 | kupdate(Node, Ensemble, Key, Obj, NewObj, Timeout) -> 57 | maybe_execute(Node, 58 | fun() -> translate(riak_ensemble_peer:kupdate(Node, Ensemble, Key, 59 | Obj, NewObj, Timeout)) 60 | end). 61 | 62 | %%%=================================================================== 63 | 64 | -spec kput_once(_,_,_,timeout()) -> client_reply(). 65 | kput_once(Ensemble, Key, NewObj, Timeout) -> 66 | kput_once(node(), Ensemble, Key, NewObj, Timeout). 67 | 68 | -spec kput_once(node(),_,_,_,timeout()) -> client_reply(). 69 | kput_once(Node, Ensemble, Key, NewObj, Timeout) -> 70 | maybe_execute(Node, 71 | fun() -> translate(riak_ensemble_peer:kput_once(Node, Ensemble, Key, 72 | NewObj, Timeout)) 73 | end). 74 | 75 | %%%=================================================================== 76 | 77 | -spec kover(_,_,_,timeout()) -> client_reply(). 78 | kover(Ensemble, Key, NewObj, Timeout) -> 79 | kover(node(), Ensemble, Key, NewObj, Timeout). 80 | 81 | -spec kover(node(),_,_,_,timeout()) -> client_reply(). 82 | kover(Node, Ensemble, Key, NewObj, Timeout) -> 83 | maybe_execute(Node, 84 | fun() -> 85 | translate(riak_ensemble_peer:kover(Node, Ensemble, Key, NewObj, 86 | Timeout)) 87 | end). 88 | 89 | %%%=================================================================== 90 | 91 | -spec kdelete(_,_,timeout()) -> client_reply(). 92 | kdelete(Ensemble, Key, Timeout) -> 93 | kdelete(node(), Ensemble, Key, Timeout). 94 | 95 | -spec kdelete(node(),_,_,timeout()) -> client_reply(). 96 | kdelete(Node, Ensemble, Key, Timeout) -> 97 | maybe_execute(Node, 98 | fun() -> 99 | translate(riak_ensemble_peer:kdelete(Node, Ensemble, Key, Timeout)) 100 | end). 101 | 102 | %%%=================================================================== 103 | 104 | -spec ksafe_delete(_,_,_,timeout()) -> client_reply(). 105 | ksafe_delete(Ensemble, Key, Obj, Timeout) -> 106 | ksafe_delete(node(), Ensemble, Key, Obj, Timeout). 
107 | 108 | -spec ksafe_delete(node(),_,_,_,timeout()) -> client_reply(). 109 | ksafe_delete(Node, Ensemble, Key, Obj, Timeout) -> 110 | maybe_execute(Node, 111 | fun() -> 112 | translate(riak_ensemble_peer:ksafe_delete(Node, Ensemble, Key, 113 | Obj, Timeout)) 114 | end). 115 | 116 | %%%=================================================================== 117 | 118 | %% TODO: Change riak_ensemble_peer to use {error, X} and remove translation 119 | -spec translate(failed | timeout | unavailable | {ok, obj()}) -> client_reply(). 120 | translate(Result) -> 121 | case Result of 122 | unavailable -> 123 | {error, unavailable}; 124 | timeout -> 125 | {error, timeout}; 126 | failed -> 127 | {error, failed}; 128 | {ok, _Obj} -> 129 | %% TODO: This may be "notfound" object and we check in riak_client. 130 | %% Perhaps build this logic into peer to return {error, notfound}? 131 | Result 132 | end. 133 | 134 | -spec maybe_execute(node(), fun()) -> client_reply(). % not named maybe/2: 'maybe' is a reserved word when the maybe_expr feature is enabled 135 | maybe_execute(Node, Fun) when Node =:= node() -> % local node: run Fun only if riak_ensemble_manager reports enabled 136 | case riak_ensemble_manager:enabled() of 137 | true -> 138 | Fun(); 139 | _ -> 140 | {error, unavailable} 141 | end; 142 | maybe_execute(_Node, Fun) -> % any other node: run Fun without the local enabled check 143 | Fun(). 144 | -------------------------------------------------------------------------------- /src/riak_ensemble_clock.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. 
You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(riak_ensemble_clock). 21 | -on_load(init/0). 22 | -export([monotonic_time/0, monotonic_time_ms/0]). 23 | 24 | monotonic_time() -> 25 | erlang:nif_error({error, not_loaded}). 26 | 27 | monotonic_time_ms() -> 28 | erlang:nif_error({error, not_loaded}). 29 | 30 | init() -> 31 | case code:priv_dir(riak_ensemble) of 32 | {error, bad_name} -> 33 | case code:which(?MODULE) of 34 | Filename when is_list(Filename) -> 35 | SoName = filename:join([filename:dirname(Filename),"../priv", "riak_ensemble"]); 36 | _ -> 37 | SoName = filename:join("../priv", "riak_ensemble") 38 | end; 39 | Dir -> 40 | SoName = filename:join(Dir, "riak_ensemble") 41 | end, 42 | erlang:load_nif(SoName, 0). 43 | -------------------------------------------------------------------------------- /src/riak_ensemble_config.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. 
You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(riak_ensemble_config). 21 | 22 | -compile(nowarn_export_all). 23 | -compile(export_all). 24 | 25 | -include_lib("riak_ensemble_types.hrl"). 26 | 27 | %% @doc 28 | %% The primary ensemble tick that determines the rate at which an elected 29 | %% leader attempts to refresh its lease. 30 | tick() -> 31 | get_env(ensemble_tick, 500). 32 | 33 | %% @doc 34 | %% The leader lease duration. Should be greater than the leader tick to give 35 | %% the leader time to refresh before expiration, but lower than the follower 36 | %% timeout. 37 | lease() -> 38 | get_env(lease_duration, tick() * 3 div 2). 39 | 40 | %% @doc 41 | %% This setting determines if leader leases are trusted or not. Trusting the 42 | %% lease allows a leader to reply to reads without contacting remote peers 43 | %% as long as its lease has not yet expired. 44 | trust_lease() -> 45 | get_env(trust_lease, true). 46 | 47 | %% @doc 48 | %% The follower timeout determines how long a follower waits to hear from 49 | %% the leader before abandoning it. 50 | follower_timeout() -> 51 | get_env(follower_timeout, lease() * 4). 52 | 53 | %% @doc 54 | %% The election timeout used for randomized election. 55 | election_timeout() -> 56 | Timeout = follower_timeout(), 57 | Timeout + rand:uniform(Timeout). 58 | 59 | %% @doc 60 | %% The prefollow timeout determines how long a peer waits to hear from the 61 | %% preliminary leader before abandoning it. 
62 | prefollow_timeout() -> 63 | tick() * 2. 64 | 65 | %% @doc 66 | %% The pending timeout determines how long a pending peer waits in the pending 67 | %% state to hear from an existing leader. 68 | pending_timeout() -> 69 | tick() * 10. 70 | 71 | %% @doc 72 | %% The amount of time between probe attempts. 73 | probe_delay() -> 74 | 1000. 75 | 76 | %% @doc The internal timeout used by peer worker FSMs when performing gets. 77 | local_get_timeout() -> 78 | get_env(peer_get_timeout, 60000). 79 | 80 | %% @doc The internal timeout used by peer worker FSMs when performing puts. 81 | local_put_timeout() -> 82 | get_env(peer_put_timeout, 60000). 83 | 84 | %% @doc 85 | %% The number of leader ticks that can go by without hearing from the ensemble 86 | %% backend. 87 | alive_ticks() -> 88 | get_env(alive_tokens, 2). 89 | 90 | %% @doc The number of peer workers/FSM processes used by the leader. 91 | peer_workers() -> 92 | get_env(peer_workers, 1). 93 | 94 | %% @doc 95 | %% The operation delay used by {@link riak_ensemble_storage} to coalesce 96 | %% multiple local operations into a single disk operation. 97 | storage_delay() -> 98 | get_env(storage_delay, 50). 99 | 100 | %% @doc 101 | %% The periodic tick at which {@link riak_ensemble_storage} flushes operations 102 | %% to disk even if there are no explicit sync requests. 103 | storage_tick() -> 104 | get_env(storage_tick, 5000). 105 | 106 | %% @doc 107 | %% When true, synctrees are not trusted after a peer restart, requiring an 108 | %% exchange with a trusted majority to become trusted. This provides the 109 | %% strongest guarantees against byzantine faults. 110 | tree_validation() -> 111 | get_env(tree_validation, true). 112 | 113 | %% @doc 114 | %% Determines if remote synctree updates are performed synchronously. 115 | %% When true, tree updates are performed before replying to the user. 116 | synchronous_tree_updates() -> 117 | get_env(synchronous_tree_updates, false). 
118 | 119 | %% @doc 120 | %% Determines how long to wait for additional responses to come in on 121 | %% certain reads that may return notfound. If we receive responses from 122 | %% every peer in the ensemble, we do not need to write a tombstone for 123 | %% the notfound key. If set to zero, no additional time will be waited, 124 | %% but it is still possible we may be able to skip writing the tombstone 125 | %% if all the responses arrive within a very close window of time. 126 | %% The default of 1ms should be enough to avoid most tombstones that 127 | %% would otherwise be created, but a higher value may be specified in 128 | %% cases where unpredictable latencies necessitate it. 129 | notfound_read_delay() -> 130 | get_env(notfound_read_delay, 1). 131 | 132 | get_env(Key, Default) -> 133 | application:get_env(riak_ensemble, Key, Default). 134 | -------------------------------------------------------------------------------- /src/riak_ensemble_exchange.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(riak_ensemble_exchange). 
21 | 22 | -compile(nowarn_export_all). 23 | -compile(export_all). 24 | 25 | -compile({nowarn_deprecated_function, 26 | [{gen_fsm, sync_send_event, 3}, 27 | {gen_fsm, send_event, 2}]}). 28 | 29 | start_exchange(Ensemble, Peer, Id, Tree, Peers, Views, Trusted) -> 30 | spawn(fun() -> 31 | try 32 | perform_exchange(Ensemble, Peer, Id, Tree, Peers, Views, Trusted) 33 | catch Class:Reason:Stacktrace -> 34 | io:format("CAUGHT: ~p/~p~n~p~n", [Class, Reason, Stacktrace]), 35 | gen_fsm:send_event(Peer, exchange_failed) 36 | end 37 | end). 38 | 39 | perform_exchange(Ensemble, Peer, Id, Tree, Peers, Views, Trusted) -> 40 | Required = case Trusted of 41 | true -> quorum; 42 | false -> other 43 | end, 44 | RemotePeers = 45 | case trust_majority(Id, Peers, Views, Required) of 46 | {ok, Quorum} -> 47 | Quorum; 48 | failed -> 49 | case all_trust_majority(Id, Peers, Views) of 50 | {ok, All} -> 51 | All; 52 | failed -> 53 | failed 54 | end 55 | end, 56 | case RemotePeers of 57 | failed -> 58 | gen_fsm:send_event(Peer, exchange_failed), 59 | ok; 60 | _ -> 61 | perform_exchange2(Ensemble, Peer, Id, Tree, RemotePeers) 62 | end. 63 | 64 | perform_exchange2(Ensemble, Peer, Id, Tree, RemotePeers) -> 65 | case riak_ensemble_peer_tree:verify_upper(Tree) of 66 | true -> 67 | exchange(Ensemble, Peer, Id, Tree, RemotePeers); 68 | false -> 69 | %% io:format(user, "~p: tree_corrupted (perform_exchange2)~n", [Id]), 70 | gen_fsm:sync_send_event(Peer, tree_corrupted, infinity) 71 | end. 
%% Exchanges the local tree with each remote peer in turn. Once the list of
%% remote peers is exhausted, `Peer' is sent an `exchange_complete' event.
exchange(_Ensemble, Peer, _Id, _Tree, []) ->
    gen_fsm:send_event(Peer, exchange_complete);
exchange(Ensemble, Peer, Id, Tree, [Next | Rest]) ->
    RemotePid = riak_ensemble_manager:get_peer_pid(Ensemble, Next),
    RemoteTree = gen_fsm:sync_send_event(RemotePid, tree_pid, infinity),
    %% Builds the callback synctree:compare/3 uses to read one side's tree.
    Accessor =
        fun(OwnerPid, TreePid) ->
                fun(exchange_get, {Level, Bucket}) ->
                        exchange_get(Level, Bucket, OwnerPid, TreePid);
                   (start_exchange_level, _) ->
                        ok
                end
        end,
    Local = Accessor(Peer, Tree),
    Remote = Accessor(RemotePid, RemoteTree),
    Height = riak_ensemble_peer_tree:height(Tree),
    Diffs = synctree:compare(Height, Local, Remote),
    %% Adopt the remote hash when the key is missing locally, or when
    %% riak_ensemble_peer:valid_obj_hash/2 accepts it over the local one.
    Apply =
        fun(Diff) ->
                case Diff of
                    {Key, {'$none', RemoteHash}} ->
                        riak_ensemble_peer_tree:insert(Key, RemoteHash, Tree);
                    {_Key, {_, '$none'}} ->
                        ok;
                    {Key, {LocalHash, RemoteHash}} ->
                        case riak_ensemble_peer:valid_obj_hash(RemoteHash, LocalHash) of
                            true ->
                                riak_ensemble_peer_tree:insert(Key, RemoteHash, Tree);
                            false ->
                                ok
                        end
                end
        end,
    lists:foreach(Apply, Diffs),
    exchange(Ensemble, Peer, Id, Tree, Rest).

%% Reads the hashes for one level/bucket from `Tree'. When the tree reports
%% `corrupted', the owning peer is told with a synchronous `tree_corrupted'
%% event and the exchange is aborted by throwing `corrupted'.
exchange_get(Level, Bucket, PeerPid, Tree) ->
    case riak_ensemble_peer_tree:exchange_get(Level, Bucket, Tree) of
        corrupted ->
            gen_fsm:sync_send_event(PeerPid, tree_corrupted, infinity),
            throw(corrupted);
        Hashes ->
            Hashes
    end.

%% Sends an `exchange' request to the peers and waits, in a linked helper
%% process, for the outcome. Returns `{ok, RespondingPeers}' when the
%% quorum is met and `failed' on timeout.
trust_majority(Id, Peers, Views, Required) ->
    {Future, _} = riak_ensemble_msg:blocking_send_all(exchange, Id, Peers,
                                                      Views, Required),
    Caller = self(),
    Waiter =
        fun() ->
                Verdict =
                    case riak_ensemble_msg:wait_for_quorum(Future) of
                        {quorum_met, Replies} ->
                            {ok, [Peer || {Peer, _} <- Replies]};
                        {timeout, _Replies} ->
                            failed
                    end,
                Caller ! {trust, Verdict}
        end,
    spawn_link(Waiter),
    receive
        {trust, Outcome} ->
            Outcome
    end.
133 | 134 | all_trust_majority(Id, Peers, Views) -> 135 | X = riak_ensemble_msg:blocking_send_all(all_exchange, Id, Peers, 136 | Views, all), 137 | {Future, _} = X, 138 | Parent = self(), 139 | spawn_link(fun() -> 140 | Result = case riak_ensemble_msg:wait_for_quorum(Future) of 141 | {quorum_met, Replies} -> 142 | {ok, [Peer || {Peer,_} <- Replies]}; 143 | {timeout, _Replies} -> 144 | failed 145 | end, 146 | %% io:format(user, "all_trust majority: ~p~n", [Result]), 147 | Parent ! {trust, Result} 148 | end), 149 | receive {trust, Trusted} -> 150 | Trusted 151 | end. 152 | -------------------------------------------------------------------------------- /src/riak_ensemble_lease.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc 22 | %% This module is used by {@link riak_ensemble_peer} to keep track of 23 | %% an established leader lease. The leader is responsible for periodically 24 | %% refreshing its lease, otherwise the lease will timeout. 25 | %% 26 | %% Using a time-based lease in a distributed system is not without issue. 
27 | %% This module does its best to address these concerns as follows: 28 | %% 29 | %% 1. This module uses Erlang based timeouts to trigger lease expiration. 30 | %% Erlang uses time correction to attempt to account for clock issues, 31 | %% as discussed here: 32 | %% http://www.erlang.org/doc/apps/erts/time_correction.html 33 | %% 34 | %% 2. In addition to Erlang time, this module also double checks the 35 | %% lease against the OS monotonic clock. The monotonic clock is 36 | %% not affected by the user/NTP changing the system clock, and 37 | %% is designed to always move forward (although, virtualization 38 | %% sometimes affects this guarantee). 39 | %% 40 | %% Likewise, riak_ensemble is designed such that the lease and leader refresh 41 | %% are much smaller than the follower timeout. All of these factors, along 42 | %% with riak_ensemble being designed to maintain strong leadership (unlike 43 | %% other systems such as Raft) make the use of leader leases safe in practice. 44 | %% As a reminder, Google is also known to use leader leases in its paxos 45 | %% implementation as discussed in their "Paxos Made Live" paper. 46 | %% 47 | %% Of course, users that do not trust leader leases can always set the 48 | %% trust_lease application variable to false, causing riak_ensemble to ignore 49 | %% leader leases and always perform full quorum operations. 50 | %% 51 | 52 | -module(riak_ensemble_lease). 53 | 54 | -export([start_link/0, 55 | check_lease/1, 56 | lease/2, 57 | unlease/1]). 58 | 59 | %% internal exports 60 | -export([init/2, loop/2]). 61 | 62 | -type lease_ref() :: {pid(), ets:tid()}. 63 | -export_type([lease_ref/0]). 64 | 65 | %%%=================================================================== 66 | 67 | -spec start_link() -> {ok, lease_ref()}. 68 | start_link() -> 69 | Ref = make_ref(), 70 | spawn_link(?MODULE, init, [self(), Ref]), 71 | receive 72 | {Ref, Reply} -> 73 | Reply 74 | end. 75 | 76 | -spec check_lease(lease_ref()) -> boolean().
%% Returns `true' only while a lease is recorded in the table and the
%% monotonic clock reads earlier than the recorded deadline. A clock read
%% failure is treated as "no lease".
check_lease({_, T}) ->
    case ets:lookup_element(T, lease, 2) of
        undefined ->
            false;
        Until ->
            %% `Until' is either an integer deadline in milliseconds or the
            %% atom `infinity'. Numbers sort before atoms in Erlang's term
            %% order, so `Time < infinity' holds for every clock reading.
            case riak_ensemble_clock:monotonic_time_ms() of
                {ok, Time} when Time < Until ->
                    true;
                _ ->
                    false
            end
    end.

%% Establishes or refreshes the lease for `Duration' milliseconds, or
%% indefinitely when `Duration' is `infinity'.
-spec lease(lease_ref(), timeout()) -> ok.
lease({Pid,_}, Duration) ->
    ok = call(Pid, {lease, Duration}).

%% Drops the lease immediately.
-spec unlease(lease_ref()) -> ok.
unlease({Pid,_}) ->
    ok = call(Pid, unlease).

%%%===================================================================

%% Entry point of the lease process: creates the protected table that
%% holds the lease deadline, hands `{ok, {self(), Table}}' back to
%% `Parent' tagged with `Ref', then enters the receive loop.
init(Parent, Ref) ->
    T = ets:new(?MODULE, [protected, set, {read_concurrency, true}]),
    ets:insert(T, {lease, undefined}),
    Reply = {ok, {self(), T}},
    Parent ! {Ref, Reply},
    loop(T, infinity).

%%%===================================================================

%% Lease process loop. `Timeout' is the time to wait for the next request
%% before clearing the lease: the last granted duration, or `infinity'
%% when no lease is held.
loop(T, Timeout) ->
    receive
        {{lease, Duration}, From} ->
            ets:insert(T, {lease, lease_deadline(Duration)}),
            reply(From, ok),
            ?MODULE:loop(T, Duration);
        {unlease, From} ->
            ets:insert(T, {lease, undefined}),
            reply(From, ok),
            ?MODULE:loop(T, infinity)
    after Timeout ->
            ets:insert(T, {lease, undefined}),
            ?MODULE:loop(T, infinity)
    end.

%% Computes the value stored as the lease deadline. `infinity' is stored
%% as-is, because adding it to the clock reading would raise `badarith'
%% and take the lease process down. If the monotonic clock cannot be read
%% no lease is recorded at all.
lease_deadline(Duration) ->
    case riak_ensemble_clock:monotonic_time_ms() of
        {ok, _Time} when Duration =:= infinity ->
            infinity;
        {ok, Time} ->
            Time + Duration;
        error ->
            undefined
    end.

%%%===================================================================

%% Synchronous request to the lease process. The process is monitored for
%% the duration of the call, so the caller exits with the process' exit
%% reason instead of waiting forever if it dies.
call(Pid, Msg) ->
    MRef = monitor(process, Pid),
    From = {self(), MRef},
    Pid ! {Msg, From},
    receive
        {MRef, Reply} ->
            erlang:demonitor(MRef, [flush]),
            Reply;
        {'DOWN', MRef, _, _, Reason} ->
            exit(Reason)
    end.

%% Sends `Reply' back to the caller identified by `{Pid, Ref}'.
reply({Pid, Ref}, Reply) ->
    Pid ! {Ref, Reply},
    ok.
146 | -------------------------------------------------------------------------------- /src/riak_ensemble_peer_sup.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(riak_ensemble_peer_sup). 21 | -behaviour(supervisor). 22 | -export([start_link/0, init/1]). 23 | -export([start_peer/4, stop_peer/2, peers/0]). 24 | -export([get_peer_pid/2, register_peer/4]). 25 | 26 | -include_lib("riak_ensemble_types.hrl"). 27 | -define(ETS, riak_ensemble_peers). 28 | 29 | start_link() -> 30 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 31 | 32 | init([]) -> 33 | %% Owned by riak_ensemble_peer_sup to ensure table has lifetime that 34 | %% is greater than or equal to all riak_ensemble_peers. 35 | ?ETS = ets:new(?ETS, [named_table, public, 36 | {read_concurrency, true}, 37 | {write_concurrency, true}]), 38 | {ok, {{one_for_one, 5, 10}, []}}. 39 | 40 | -spec start_peer(module(), ensemble_id(), peer_id(), [any()]) -> pid(). 
%% Starts the peer under this supervisor and returns its pid. An already
%% running peer is reused; a stale, stopped child spec is deleted and the
%% start is retried.
start_peer(Mod, Ensemble, Id, Args) ->
    ChildSpec = peer_ref(Mod, Ensemble, Id, Args),
    case supervisor:start_child(?MODULE, ChildSpec) of
        {ok, Started} ->
            Started;
        {error, {already_started, Running}} ->
            Running;
        {error, already_present} ->
            ok = supervisor:delete_child(?MODULE, {Ensemble, Id}),
            start_peer(Mod, Ensemble, Id, Args)
    end.

%% Terminates the peer, removes its registry entries and deletes its child
%% spec. Supervisor errors (e.g. unknown child) are ignored.
-spec stop_peer(ensemble_id(), peer_id()) -> ok.
stop_peer(Ensemble, Id) ->
    ChildId = {Ensemble, Id},
    _ = supervisor:terminate_child(?MODULE, ChildId),
    ok = unregister_peer(Ensemble, Id),
    _ = supervisor:delete_child(?MODULE, ChildId),
    ok.

%% Lists the worker children that currently have a live pid.
-spec peers() -> [{{ensemble_id(), peer_id()}, pid()}].
peers() ->
    Running = fun({Id, Pid, worker, _}) when is_pid(Pid) ->
                      {true, {Id, Pid}};
                 (_) ->
                      false
              end,
    lists:filtermap(Running, supervisor:which_children(?MODULE)).

%% Looks up the registered pid of a peer; `undefined' when the lookup
%% fails for any reason.
-spec get_peer_pid(ensemble_id(), peer_id()) -> pid() | undefined.
get_peer_pid(Ensemble, Id) ->
    try ets:lookup_element(?ETS, {pid, {Ensemble, Id}}, 2) of
        Pid ->
            Pid
    catch
        _:_ ->
            undefined
    end.

%% Records the pid and the ETS table of a peer in the registry table.
-spec register_peer(ensemble_id(), peer_id(), pid(), ets:tid()) -> ok.
register_peer(Ensemble, Id, Pid, ETS) ->
    PeerKey = {Ensemble, Id},
    Entries = [{{pid, PeerKey}, Pid},
               {{ets, PeerKey}, ETS}],
    true = ets:insert(?ETS, Entries),
    ok.

%% @private
unregister_peer(Ensemble, Id) ->
    PeerKey = {Ensemble, Id},
    lists:foreach(fun(Tag) ->
                          true = ets:delete(?ETS, {Tag, PeerKey})
                  end, [pid, ets]),
    ok.

%% @private
%% Child specification for one peer, identified by `{Ensemble, Id}'.
peer_ref(Mod, Ensemble, Id, Args) ->
    ChildId = {Ensemble, Id},
    StartMFA = {riak_ensemble_peer, start_link, [Mod, Ensemble, Id, Args]},
    {ChildId, StartMFA, permanent, 5000, worker, [riak_ensemble_peer]}.
91 | -------------------------------------------------------------------------------- /src/riak_ensemble_peer_tree.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(riak_ensemble_peer_tree). 21 | -behaviour(gen_server). 22 | 23 | -compile({nowarn_deprecated_function, 24 | [{gen_fsm, send_event, 2}]}). 25 | 26 | %% API 27 | -export([start_link/3]). 28 | -export([get/2, 29 | insert/3, 30 | rehash_upper/1, 31 | rehash/1, 32 | verify_upper/1, 33 | verify/1, 34 | exchange_get/3, 35 | top_hash/1, 36 | height/1, 37 | repair/1]). 38 | -export([async_rehash_upper/1, 39 | async_rehash/1, 40 | async_verify_upper/1, 41 | async_verify/1, 42 | async_repair/1]). 43 | 44 | %% gen_server callbacks 45 | -export([init/1, handle_call/3, handle_cast/2, handle_info/2, 46 | terminate/2, code_change/3]). 47 | 48 | -record(state, {tree :: any(), 49 | corrupted :: {integer(), integer()} | undefined 50 | }). 51 | -type state() :: #state{}. 
%%%===================================================================
%%% API
%%%===================================================================

%% TODO:
%% For most of these APIs, really should return {ok, any()} and
%% corrupted or {error, corrupted}. As is, any() | corrupted reduces
%% to any() which gives us zero dialyzer benefits.

%% Starts an unregistered tree server for the given ids and storage path.
start_link(Id, TreeId, Path) ->
    gen_server:start_link(?MODULE, [Id, TreeId, Path], []).

%% Every synchronous operation below is a blocking gen_server call with
%% no timeout.

-spec get(_,pid()) -> any().
get(Key, Pid) ->
    tree_call(Pid, {get, Key}).

-spec insert(_,_,pid()) -> ok | corrupted.
insert(Key, ObjHash, Pid) ->
    tree_call(Pid, {insert, Key, ObjHash}).

-spec rehash_upper(pid()) -> ok.
rehash_upper(Pid) ->
    tree_call(Pid, rehash_upper).

-spec rehash(pid()) -> ok.
rehash(Pid) ->
    tree_call(Pid, rehash).

-spec top_hash(pid()) -> any().
top_hash(Pid) ->
    tree_call(Pid, top_hash).

-spec exchange_get(_,_,pid()) -> any().
exchange_get(Level, Bucket, Pid) ->
    tree_call(Pid, {exchange_get, Level, Bucket}).

-spec height(pid()) -> pos_integer().
height(Pid) ->
    tree_call(Pid, height).

-spec repair(pid()) -> ok.
repair(Pid) ->
    tree_call(Pid, repair).

-spec verify_upper(pid()) -> boolean().
verify_upper(Pid) ->
    tree_call(Pid, verify_upper).

-spec verify(pid()) -> boolean().
verify(Pid) ->
    tree_call(Pid, verify).

%% @private
%% Shared transport for the synchronous API.
tree_call(Pid, Request) ->
    gen_server:call(Pid, Request, infinity).

%%%===================================================================

%% These async operations must only be called by a process that is
%% monitoring or linked to the tree process (eg. riak_ensemble_peer).
%% Each async_* function casts `{Operation, self()}' to the tree server.
%% The tree later answers the calling process with an FSM event.

%% Answered with rehash_complete
-spec async_rehash_upper(pid()) -> ok.
async_rehash_upper(Pid) ->
    async_request(Pid, async_rehash_upper).

%% Answered with rehash_complete
-spec async_rehash(pid()) -> ok.
async_rehash(Pid) ->
    async_request(Pid, async_rehash).

%% Answered with {verify_complete, boolean()}
-spec async_verify_upper(pid()) -> ok.
async_verify_upper(Pid) ->
    async_request(Pid, async_verify_upper).

%% Answered with {verify_complete, boolean()}
-spec async_verify(pid()) -> ok.
async_verify(Pid) ->
    async_request(Pid, async_verify).

%% Answered with repair_complete
-spec async_repair(pid()) -> ok.
async_repair(Pid) ->
    async_request(Pid, async_repair).

%% @private
%% Casts the operation together with the caller's pid, which the tree
%% uses as the reply address.
async_request(TreePid, Operation) ->
    gen_server:cast(TreePid, {Operation, self()}).

%%%===================================================================
%%% gen_server callbacks
%%%===================================================================

%% Opens the synctree for this peer; no corruption is recorded initially.
init([Id, TreeId, Path]) ->
    Options = [{path, Path},
               {tree_id, TreeId}],
    {ok, #state{tree=synctree:newdb(Id, Options)}}.
%% Synchronous requests. Operations that may detect corruption return a
%% `{Reply, NewState}' pair; rehash and repair only produce a new state
%% and answer `ok'; the remaining operations are read-only. Unknown
%% requests are answered with `ok'.
handle_call({get, Key}, _From, State) ->
    reply_with(do_get(Key, State));
handle_call({insert, Key, ObjHash}, _From, State) ->
    reply_with(do_insert(Key, ObjHash, State));
handle_call(rehash_upper, _From, State) ->
    {reply, ok, do_rehash_upper(State)};
handle_call(rehash, _From, State) ->
    {reply, ok, do_rehash(State)};
handle_call(top_hash, _From, State) ->
    {reply, do_top_hash(State), State};
handle_call({exchange_get, Level, Bucket}, _From, State) ->
    reply_with(do_exchange_get(Level, Bucket, State));
handle_call(height, _From, State) ->
    {reply, do_height(State), State};
handle_call(repair, _From, State) ->
    {reply, ok, do_repair(State)};
handle_call(verify_upper, _From, State) ->
    {reply, do_verify_upper(State), State};
handle_call(verify, _From, State) ->
    {reply, do_verify(State), State};
handle_call(_Request, _From, State) ->
    {reply, ok, State}.

%% @private
%% Turns a `{Reply, NewState}' pair into the gen_server reply tuple.
reply_with({Reply, NewState}) ->
    {reply, Reply, NewState}.
%% Asynchronous requests. The work is done first (while the arguments of
%% cast_done/3 are evaluated), then the completion event is sent to the
%% requester. Unknown casts are ignored.
handle_cast({async_rehash_upper, From}, State) ->
    cast_done(From, rehash_complete, do_rehash_upper(State));
handle_cast({async_rehash, From}, State) ->
    cast_done(From, rehash_complete, do_rehash(State));
handle_cast({async_verify_upper, From}, State) ->
    cast_done(From, {verify_complete, do_verify_upper(State)}, State);
handle_cast({async_verify, From}, State) ->
    cast_done(From, {verify_complete, do_verify(State)}, State);
handle_cast({async_repair, From}, State) ->
    cast_done(From, repair_complete, do_repair(State));
handle_cast(_Msg, State) ->
    {noreply, State}.

%% @private
%% Notifies the requester and continues with `NewState'.
cast_done(From, Event, NewState) ->
    async_reply(From, Event),
    {noreply, NewState}.

handle_info(_Info, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%===================================================================
%%% Internal functions
%%%===================================================================

%% Hardcoded to send FSM event as expected by riak_ensemble_peer
async_reply(From, Reply) when is_pid(From) ->
    gen_fsm:send_event(From, Reply).

%% Reads a key from the tree. When the tree reports corruption, its
%% location is remembered in the state and `corrupted' is returned.
-spec do_get(_,state()) -> {any(), state()}.
do_get(Key, State=#state{tree=Tree}) ->
    case synctree:get(Key, Tree) of
        {corrupted, Level, Bucket} ->
            {corrupted, State#state{corrupted={Level, Bucket}}};
        Value ->
            {Value, State}
    end.

-spec do_insert(_,_,state()) -> {ok, state()} | {corrupted, state()}.
%% Inserts a key/hash pair. On success the updated tree is kept in the
%% state; when the tree reports corruption, its location is remembered
%% and `corrupted' is returned.
do_insert(Key, ObjHash, State=#state{tree=Tree}) ->
    case synctree:insert(Key, ObjHash, Tree) of
        {corrupted, Level, Bucket} ->
            State2 = State#state{corrupted={Level, Bucket}},
            {corrupted, State2};
        NewTree ->
            State2 = State#state{tree=NewTree},
            {ok, State2}
    end.

-spec do_rehash_upper(state()) -> state().
do_rehash_upper(State=#state{tree=Tree}) ->
    NewTree = synctree:rehash_upper(Tree),
    State#state{tree=NewTree}.

-spec do_rehash(state()) -> state().
do_rehash(State=#state{tree=Tree}) ->
    NewTree = synctree:rehash(Tree),
    State#state{tree=NewTree}.

-spec do_top_hash(state()) -> any().
do_top_hash(#state{tree=Tree}) ->
    synctree:top_hash(Tree).

%% Fetches the hashes stored at `Level'/`Bucket'. When the tree reports
%% corruption, its location is remembered and `corrupted' is returned.
-spec do_exchange_get(_,_,state()) -> {any(), state()}.
do_exchange_get(Level, Bucket, State=#state{tree=Tree}) ->
    case synctree:exchange_get(Level, Bucket, Tree) of
        %% Fresh variables on purpose: matching against the already bound
        %% `Level'/`Bucket' would only recognise corruption reported at
        %% exactly the requested location, and any other corruption tuple
        %% would be handed back to the caller as if it were hashes.
        {corrupted, CorruptLevel, CorruptBucket} ->
            State2 = State#state{corrupted={CorruptLevel, CorruptBucket}},
            {corrupted, State2};
        Hashes ->
            {Hashes, State}
    end.

-spec do_height(state()) -> pos_integer().
do_height(#state{tree=Tree}) ->
    synctree:height(Tree).

%% Clears a recorded corruption. When it is located one level below the
%% tree height, the corrupted location is deleted and the tree rehashed;
%% corruption recorded at any other level is only forgotten.
do_repair(State=#state{corrupted=undefined}) ->
    State;
do_repair(State=#state{corrupted=Corrupted, tree=Tree}) ->
    Final = synctree:height(Tree) + 1,
    case Corrupted of
        {Final, _Bucket} ->
            Tree2 = synctree:m_flush(synctree:m_batch({delete, Corrupted}, Tree)),
            Tree3 = synctree:rehash(Tree2),
            State#state{tree=Tree3, corrupted=undefined};
        _ ->
            State#state{corrupted=undefined}
    end.

-spec do_verify_upper(state()) -> boolean().
284 | do_verify_upper(#state{tree=Tree}) -> 285 | synctree:verify_upper(Tree). 286 | 287 | -spec do_verify(state()) -> boolean(). 288 | do_verify(#state{tree=Tree}) -> 289 | synctree:verify(Tree). 290 | -------------------------------------------------------------------------------- /src/riak_ensemble_peer_worker.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc 22 | %% Long-lived worker process used by {@link riak_ensemble_peer} to handle 23 | %% asynchronous work such as K/V gets and puts. The peer maintains a pool 24 | %% of workers and routes requests to workers based on object key, ensuring 25 | %% that operations that affect the same key are serialized on the same worker. 26 | %% Thus, workers also enable partitioned queuing. These workers can also be 27 | %% paused and resumed. This is used by the peer when necessary to enforce 28 | %% necessary barriers. For example, a leader will pause all workers before 29 | %% attempting to change ensemble membership, then resume workers after. 
This 30 | %% prevents workers from issuing requests during a changing ownership set, 31 | %% since those requests will likely be rejected. 32 | %% 33 | %% Note: pausing/resuming is best-effort. A worker currently involved 34 | %% in a request will not pause until after completing/failing the request. 35 | %% Thus, pause/resume is not designed to provide guarantees for correctness, 36 | %% but rather as a tool for optimization (eg. to prevent issuing requests 37 | %% that will likely fail because some other correctness mechanism rejects them). 38 | 39 | -module(riak_ensemble_peer_worker). 40 | -export([start/1, pause_workers/2, unpause_workers/2]). 41 | -export([loop/1, maybe_pause/1]). %% For internal use 42 | 43 | %%=================================================================== 44 | 45 | -spec start(ets:tid()) -> {ok, pid()}. 46 | start(ETS) -> 47 | Parent = self(), 48 | Pid = spawn(fun() -> 49 | init(Parent, ETS) 50 | end), 51 | {ok, Pid}. 52 | 53 | -spec pause_workers([pid()], ets:tid()) -> ok. 54 | pause_workers(_Workers, ETS) -> 55 | ets:insert(ETS, {paused, true}), 56 | ok. 57 | 58 | -spec unpause_workers([pid()], ets:tid()) -> ok. 59 | unpause_workers(Workers, ETS) -> 60 | ets:delete(ETS, paused), 61 | _ = [Pid ! unpause || Pid <- Workers], 62 | ok. 63 | 64 | %%=================================================================== 65 | 66 | init(Parent, ETS) -> 67 | monitor(process, Parent), 68 | loop(ETS). 69 | 70 | loop(ETS) -> 71 | receive 72 | {async, Fun} -> 73 | maybe_pause(ETS), 74 | Fun(); 75 | {'DOWN', _, _, _, _} -> 76 | exit(normal); 77 | _ -> 78 | ok 79 | end, 80 | ?MODULE:loop(ETS). 81 | 82 | maybe_pause(ETS) -> 83 | case ets:lookup(ETS, paused) of 84 | [{paused, true}] -> 85 | pause(ETS); 86 | [] -> 87 | ok 88 | end. 89 | 90 | pause(ETS) -> 91 | receive 92 | unpause -> 93 | ok; 94 | {'DOWN', _, _, _, _} -> 95 | exit(normal) 96 | after 5000 -> 97 | ok 98 | end, 99 | ?MODULE:maybe_pause(ETS).
100 | -------------------------------------------------------------------------------- /src/riak_ensemble_root.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | -module(riak_ensemble_root). 22 | -include_lib("riak_ensemble_types.hrl"). 23 | 24 | %% API 25 | -export([update_ensemble/4, set_ensemble/2, join/1, remove/1, gossip/4]). 26 | 27 | %% Exported internal callback functions 28 | -export([do_root_call/3, do_root_cast/3]). 29 | 30 | -include_lib("kernel/include/logger.hrl"). 31 | 32 | %%%=================================================================== 33 | %%% API 34 | %%%=================================================================== 35 | 36 | -spec update_ensemble(ensemble_id(), peer_id(), views(), vsn()) -> ok. 37 | update_ensemble(Ensemble, Leader, Views, Vsn) -> 38 | ok = cast({update_ensemble, Ensemble, Leader, Views, Vsn}). 39 | 40 | -spec set_ensemble(ensemble_id(), ensemble_info()) -> ok | {error, term()}. 
%% Stores `Info' for `Ensemble' in the root cluster state. Anything other
%% than `ok' from the root call is returned wrapped as `{error, Reply}'.
set_ensemble(Ensemble, Info) ->
    case call({set_ensemble, Ensemble, Info}) of
        ok ->
            ok;
        Failure ->
            {error, Failure}
    end.

%% Asks the root ensemble reachable through `Node' to add the local node
%% as a member, waiting up to 60 seconds.
-spec join(node()) -> ok | {error, term()}.
join(Node) ->
    case call(Node, {join, node()}, 60000) of
        ok ->
            ?LOG_INFO("JOIN: success"),
            ok;
        Failure ->
            {error, Failure}
    end.

%% Asks the root ensemble, through the local node, to remove `Node' from
%% the members, waiting up to 60 seconds.
-spec remove(node()) -> ok | {error, term()}.
remove(Node) ->
    case call(node(), {remove, Node}, 60000) of
        ok ->
            ?LOG_INFO("REMOVE: success"),
            ok;
        Failure ->
            {error, Failure}
    end.


%% Fire-and-forget gossip of the root ensemble's version, leader and views,
%% addressed to the peer `Pid' on the local node.
-spec gossip(pid(), vsn(), peer_id(), views()) -> ok.
gossip(Pid, Vsn, Leader, Views) when is_pid(Pid) ->
    ok = cast(node(), Pid, {gossip, Vsn, Leader, Views}).

%%%===================================================================

%% Root call through the local node with a 5 second timeout.
call(Cmd) ->
    call(node(), Cmd, 5000).

%% Runs `Cmd' as a kmodify on the `cluster_state' key of the root ensemble.
%% A successful modification is reduced to `ok'; every other result is
%% passed through unchanged.
call(Node, Cmd, Timeout) ->
    InitialState = root_init(),
    Modifier = {?MODULE, do_root_call, Cmd},
    case riak_ensemble_peer:kmodify(Node, root, cluster_state,
                                    Modifier, InitialState, Timeout) of
        {ok, _Obj} ->
            ok;
        Other ->
            Other
    end.

cast(Cmd) ->
    cast(node(), Cmd).

cast(Node, Cmd) ->
    cast(Node, root, Cmd).

%% Like call/3, but the kmodify runs in a spawned process and its result
%% is discarded. The default state is read in the calling process before
%% the spawn.
cast(Node, Target, Cmd) ->
    InitialState = root_init(),
    Modifier = {?MODULE, do_root_cast, Cmd},
    Job = fun() ->
                  riak_ensemble_peer:kmodify(Node, Target, cluster_state,
                                             Modifier, InitialState, 5000)
          end,
    spawn(Job),
    ok.

%% kmodify callbacks: reorder the arguments for root_call/3 and root_cast/3.
do_root_call(Seq, State, Cmd) ->
    root_call(Cmd, Seq, State).

do_root_cast(Seq, State, Cmd) ->
    root_cast(Cmd, Seq, State).

%%%===================================================================

%% Default value for the `cluster_state' key: the manager's current view.
root_init() ->
    riak_ensemble_manager:get_cluster_state().
%%%===================================================================

%% Synchronous root commands. Each clause returns the updated cluster
%% state, or `failed' when riak_ensemble_state rejects the change.
root_call({join, Node}, Vsn, State) ->
    ?LOG_INFO("join(Vsn): ~p :: ~p :: ~p", [Vsn, Node, riak_ensemble_state:members(State)]),
    state_or_failed(riak_ensemble_state:add_member(Vsn, Node, State));
root_call({remove, Node}, Vsn, State) ->
    ?LOG_INFO("remove(Vsn): ~p :: ~p :: ~p", [Vsn, Node, riak_ensemble_state:members(State)]),
    state_or_failed(riak_ensemble_state:del_member(Vsn, Node, State));
root_call({set_ensemble, Ensemble, Info}, _Vsn, State) ->
    state_or_failed(riak_ensemble_state:set_ensemble(Ensemble, Info, State)).

%%%===================================================================

%% Asynchronous root commands. A gossip always triggers
%% maybe_async_gossip/1: with the new state when the update is accepted,
%% with the unchanged state when it is rejected.
root_cast({gossip, Vsn, Leader, Views}, _Vsn, State) ->
    Info = #ensemble_info{vsn=Vsn, leader=Leader, views=Views},
    Update = riak_ensemble_state:set_ensemble(root, Info, State),
    case Update of
        {ok, Accepted} ->
            maybe_async_gossip(Accepted);
        error ->
            maybe_async_gossip(State)
    end,
    state_or_failed(Update);
root_cast({update_ensemble, Ensemble, Leader, Views, Vsn}, _Vsn, State) ->
    state_or_failed(
      riak_ensemble_state:update_ensemble(Vsn, Ensemble, Leader, Views, State)).

%% @private
%% Maps a riak_ensemble_state result onto the kmodify return convention
%% used by this module.
state_or_failed({ok, NewState}) ->
    NewState;
state_or_failed(error) ->
    failed.

%% This function implements a non-blocking w/ backpressure approach to sending
%% a message to the ensemble manager. Directly calling _manager:gossip would
%% block the root leader. Changing _manager:gossip to use a cast would provide
%% no backpressure. Instead, the leader spawns a singleton process that blocks
%% on the call. As long as the singleton helper is still alive, no new process
%% will be spawned.
175 | maybe_async_gossip(State) -> 176 | Async = erlang:get(async_gossip_pid), 177 | CurrentAsync = is_pid(Async) andalso is_process_alive(Async), 178 | case CurrentAsync of 179 | true -> 180 | ok; 181 | false -> 182 | Async2 = spawn(fun() -> 183 | riak_ensemble_manager:gossip(State) 184 | end), 185 | erlang:put(async_gossip_pid, Async2), 186 | ok 187 | end. 188 | -------------------------------------------------------------------------------- /src/riak_ensemble_router.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc 22 | %% The primary purpose of this module is to route requests to ensemble 23 | %% leaders given ensemble names, even if the requests are originating 24 | %% from nodes that are not part of the ensemble system: eg. a remote 25 | %% Erlang node using {@link riak_ensemble_client}. This router also 26 | %% addresses the issue that ensemble and peer names are arbitrary terms 27 | %% and not registered names, and therefore Erlang's built-in messaging 28 | %% cannot directly address ensemble peers. 
%%
%% This routing layer is handled by multiple instances of this module
%% that run on each node in the ensemble cluster. A request is sent to
%% a random router on a given node, which then looks up the ensemble
%% leader using its local `riak_ensemble_manager' state, routing the
%% request directly to a local pid (if the leader is local) or forwarding
%% on to a router on the respective leading node.
%%
%% The reason for running multiple router instances per node is to enable
%% additional concurrency and not have a single router bottleneck traffic.
%%
%% A secondary purpose of this module is to provide an isolated version
%% of `gen_fsm:sync_send_event' that converts timeouts into a `timeout'
%% return value rather than exit conditions, as well as discarding
%% late/delayed messages. This isolation is provided by spawning an
%% intermediary proxy process.

-module(riak_ensemble_router).

-compile(nowarn_export_all).
-compile(export_all).

-compile({nowarn_deprecated_function,
          [{gen_fsm, sync_send_event, 3}]}).

-behaviour(gen_server).

-include_lib("riak_ensemble_types.hrl").

%% API
-export([start_link/1]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

-record(state, {}).

-type target() :: pid() | ensemble_id().
-type msg() :: term().

%%%===================================================================
%%% API
%%%===================================================================

%% Start a router gen_server registered locally under Name.
-spec start_link(atom()) -> ignore | {error, _} | {ok, pid()}.
start_link(Name) ->
    gen_server:start_link({local, Name}, ?MODULE, [], []).

-spec sync_send_event(target(), msg(), timeout()) -> timeout | term().
%% Same as sync_send_event/4 with the local node as the routing node.
sync_send_event(Target, Event, Timeout) ->
    sync_send_event(node(), Target, Event, Timeout).

%% Send Event to Target through a linked proxy process and wait for the
%% proxy's answer. A `nack' answer is reported as `timeout'.
-spec sync_send_event(node(), target(), msg(), timeout()) -> timeout | term().
sync_send_event(_Node, Target, _Event, infinity) when not is_pid(Target) ->
    %% TODO: Consider handling this case
    throw("infinity timeout not currently safe for non-pid target");
sync_send_event(Node, Target, Event, Timeout) ->
    Tag = make_ref(),
    ProxyArgs = [self(), Tag, Node, Target, Event, Timeout],
    _Proxy = spawn_link(?MODULE, sync_proxy, ProxyArgs),
    receive
        {Tag, Reply} ->
            case Reply of
                nack ->
                    timeout;
                _ ->
                    Reply
            end
    end.

%% Proxy entry point: pid targets are called directly, anything else is
%% sent through the router.
-spec sync_proxy(pid(), reference(), node(), target(), msg(), timeout()) -> ok.
sync_proxy(From, Ref, Node, Target, Event, Timeout) ->
    case is_pid(Target) of
        true ->
            sync_proxy_direct(From, Ref, Target, Event, Timeout);
        false ->
            sync_proxy_router(From, Ref, Node, Target, Event, Timeout)
    end.

%% Call the FSM at Pid and forward the result to From; any exception
%% raised by the call is reported as `timeout'.
-spec sync_proxy_direct(pid(), reference(), pid(), msg(), timeout()) -> ok.
sync_proxy_direct(From, Ref, Pid, Event, Timeout) ->
    Outcome = try gen_fsm:sync_send_event(Pid, Event, Timeout) of
                  Result ->
                      Result
              catch
                  _:_ ->
                      timeout
              end,
    From ! {Ref, Outcome},
    ok.

%% Cast the request via the router and relay the first `{Ref, _}' reply to
%% From. A failed cast or no reply within Timeout is reported as `timeout'.
-spec sync_proxy_router(pid(), reference(), node(), ensemble_id(), msg(), timeout()) -> ok.
sync_proxy_router(From, Ref, Node, Target, Event, Timeout) ->
    Request = {sync_send_event, self(), Ref, Event, Timeout},
    Answer = case riak_ensemble_router:cast(Node, Target, Request) of
                 ok ->
                     receive
                         {Ref, _}=Reply ->
                             Reply
                     after Timeout ->
                             {Ref, timeout}
                     end;
                 error ->
                     {Ref, timeout}
             end,
    From ! Answer,
    ok.

%% Deliver Msg to the leader of Ensemble, starting from the local node.
-spec cast(ensemble_id(), msg()) -> error | ok.
cast(Ensemble, Msg) ->
    ensemble_cast(Ensemble, Msg).

%% Deliver Msg for Ensemble via Node. The local node is handled by cast/2;
%% for a remote node the message is handed to one of that node's routers,
%% picked with random/1.
-spec cast(node(), ensemble_id(), msg()) -> error | ok.
cast(Node, Ensemble, Msg) when Node =:= node() ->
    cast(Ensemble, Msg);
cast(Node, Ensemble, Msg) ->
    Routers = routers(),
    Pick = random(tuple_size(Routers)),
    Router = element(Pick + 1, Routers),
    %% Sent with noconnect_cast/2 rather than gen_server:cast/2 so that a
    %% disconnected node is reported back to the sender right away.
    case noconnect_cast({Router, Node}, {ensemble_cast, Ensemble, Msg}) of
        nodedown ->
            _ = fail_cast(Msg),
            ok;
        ok ->
            ok
    end.

%% Send Msg as a gen_server cast using the `noconnect' send option. When the
%% send reports `noconnect', a helper process pings the destination node and
%% `nodedown' is returned. Every other outcome, including an exception
%% raised by the send, is reported as `ok'.
noconnect_cast(Dest, Msg) ->
    try erlang:send(Dest, {'$gen_cast', Msg}, [noconnect]) of
        noconnect ->
            spawn(fun() ->
                          case Dest of
                              {_, Node} ->
                                  net_adm:ping(Node);
                              Pid when is_pid(Pid) ->
                                  net_adm:ping(node(Pid));
                              _ ->
                                  ok
                          end
                  end),
            nodedown;
        _ ->
            ok
    catch
        _:_ ->
            ok
    end.

%% Registered names of the router processes.
%% TODO: Switch to using sidejob_config or copy thereof
routers() ->
    {riak_ensemble_router_1,
     riak_ensemble_router_2,
     riak_ensemble_router_3,
     riak_ensemble_router_4,
     riak_ensemble_router_5,
     riak_ensemble_router_6,
     riak_ensemble_router_7}.

%% @doc Generate "random" number X, such that `0 <= X < N'.
-spec random(pos_integer()) -> non_neg_integer().
random(N) ->
    %% Note: hashing over I/O statistics seems to be fastest option when
    %%       benchmarking with lots of concurrent processes. Inside BEAM,
    %%       querying I/O statistics corresponds to two atomic reads.
    %%
    %% Alternatives that were considered:
    %%   random:uniform_s(os:timestamp()),
    %%   crypto:rand_uniform(0, NumRouters),
    %%   element(3, os:timestamp()) rem N.
    %%   erlang:phash2(make_ref(), N).
    %%   erlang:phash2(erlang:statistics(context_switches), N).
    %%   erlang:phash2(erlang:statistics(garbage_collection), NumRouters),
    erlang:phash2(erlang:statistics(io), N).

%%%===================================================================
%%% gen_server callbacks
%%%===================================================================

init([]) ->
    {ok, #state{}}.

handle_call(_Request, _From, State) ->
    {reply, ok, State}.

%% Messages forwarded by a router on another node arrive here.
handle_cast({ensemble_cast, Ensemble, Msg}, State) ->
    ensemble_cast(Ensemble, Msg),
    {noreply, State};
handle_cast(_Msg, State) ->
    {noreply, State}.

handle_info(_Info, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%===================================================================
%%% Internal functions
%%%===================================================================

%% Look up the leader of Ensemble and deliver Msg to it: directly when the
%% leader is on this node, otherwise via a router on the leader's node.
%% Returns `error' when no leader is known.
-spec ensemble_cast(ensemble_id(), msg()) -> error | ok.
ensemble_cast(Ensemble, Msg) ->
    case riak_ensemble_manager:get_leader(Ensemble) of
        {_, Node}=Leader when Node =:= node() ->
            Pid = riak_ensemble_manager:get_peer_pid(Ensemble, Leader),
            handle_ensemble_cast(Msg, Pid),
            ok;
        {_, Node} ->
            riak_ensemble_router:cast(Node, Ensemble, Msg),
            ok;
        undefined ->
            error
    end.

%% Run a routed `sync_send_event' request against the local peer in a
%% separate process and send the result (or `timeout' on any exception)
%% back to the requester. Other messages are ignored.
-spec handle_ensemble_cast(_,_) -> ok.
handle_ensemble_cast({sync_send_event, From, Ref, Event, Timeout}, Pid) ->
    spawn(fun() ->
                  try
                      Result = gen_fsm:sync_send_event(Pid, Event, Timeout),
                      From ! {Ref, Result}
                  catch
                      _:_ ->
                          From ! {Ref, timeout}
                  end
          end),
    ok;
handle_ensemble_cast(_, _Pid) ->
    ok.

%% Tell the sender of an undeliverable `sync_send_event' request that it
%% timed out. Other messages carry no reply address, so there is nothing to
%% report; they are accepted so that a cast to a down node does not crash,
%% matching handle_ensemble_cast/2 which also ignores them.
fail_cast({sync_send_event, From, Ref, _Event, _Timeout}) ->
    From ! {Ref, timeout},
    ok;
fail_cast(_Msg) ->
    ok.

--------------------------------------------------------------------------------
/src/riak_ensemble_router_sup.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(riak_ensemble_router_sup).
-behaviour(supervisor).

%% API
-export([start_link/0]).

%% Supervisor callbacks
-export([init/1]).

%%%===================================================================
%%% API functions
%%%===================================================================

%% Start the supervisor, registered locally under the module name.
start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

%%%===================================================================
%%% Supervisor callbacks
%%%===================================================================

%% One permanent worker per name returned by riak_ensemble_router:routers/0.
init([]) ->
    RouterNames = tuple_to_list(riak_ensemble_router:routers()),
    Children = lists:map(fun router_spec/1, RouterNames),
    {ok, {{one_for_one, 10, 10}, Children}}.

%% Child specification for the router registered as Name.
router_spec(Name) ->
    {Name, {riak_ensemble_router, start_link, [Name]},
     permanent, 5000, worker, [riak_ensemble_router]}.

--------------------------------------------------------------------------------
/src/riak_ensemble_save.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------

%% @doc
%% Provide a safe method of saving data to disk along with a checksum
%% that is verified on read. Additionally, four replicas of the data
%% are stored across two files for greater redundancy/durability.

-module(riak_ensemble_save).
-export([write/2, read/1]).

%%===================================================================

%% Write Data to File and to File ++ ".backup". Each file holds two copies
%% of the data: the first is preceded by an 8-byte header (32-bit CRC then
%% 32-bit size), the second is followed by the same 8 bytes as a trailer.
%% Failures of either file replacement are returned as `{error, Reason}'.
-spec write(file:filename(), binary()) -> ok | {error, term()}.
write(File, Data) ->
    CRC = erlang:crc32(Data),
    Size = byte_size(Data),
    Meta = <<CRC:32/integer, Size:32/integer>>,
    Out = [Meta, Data,  %% copy 1
           Data, Meta], %% copy 2
    ok = filelib:ensure_dir(File),
    try
        ok = riak_ensemble_util:replace_file(File, Out),
        ok = riak_ensemble_util:replace_file(File ++ ".backup", Out),
        ok
    catch
        _:Err ->
            {error, Err}
    end.

%% Read the data saved under File, falling back to File ++ ".backup" when
%% the primary file yields nothing usable.
-spec read(file:filename()) -> {ok, binary()} | not_found.
read(File) ->
    case do_read(File) of
        not_found ->
            do_read(File ++ ".backup");
        Result ->
            Result
    end.

%%===================================================================

%% Read one file and extract a checksum-verified copy of the data from it.
-spec do_read(file:filename()) -> {ok, binary()} | not_found.
do_read(File) ->
    case riak_ensemble_util:read_file(File) of
        {ok, Binary} ->
            safe_read(Binary);
        {error, _} ->
            not_found
    end.

%% Try the leading copy (32-bit CRC, 32-bit size, then the data). When the
%% header does not parse or the CRC does not match, fall back to the
%% trailing copy.
-spec safe_read(binary()) -> {ok, binary()} | not_found.
safe_read(<<CRC:32/integer, Size:32/integer, Data:Size/binary, Rest/binary>>) ->
    case erlang:crc32(Data) of
        CRC ->
            {ok, Data};
        _ ->
            safe_read_backup(Rest)
    end;
safe_read(Binary) ->
    safe_read_backup(Binary).

%% Try the trailing copy: the last 8 bytes hold the CRC and size, and the
%% data sits immediately before them.
-spec safe_read_backup(binary()) -> {ok, binary()} | not_found.
safe_read_backup(Binary) when byte_size(Binary) =< 8 ->
    not_found;
safe_read_backup(Binary) ->
    BinSize = byte_size(Binary),
    Skip = BinSize - 8,
    <<_:Skip/binary, CRC:32/integer, Size:32/integer>> = Binary,
    %% A corrupt Size can make Skip2 negative; the pattern below then simply
    %% fails to match and the copy is reported as not found.
    Skip2 = Skip - Size,
    case Binary of
        <<_:Skip2/binary, Data:Size/binary, _:8/binary>> ->
            case erlang:crc32(Data) of
                CRC ->
                    {ok, Data};
                _ ->
                    not_found
            end;
        _ ->
            not_found
    end.

--------------------------------------------------------------------------------
/src/riak_ensemble_state.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License.
You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(riak_ensemble_state).

-include_lib("riak_ensemble_types.hrl").

-export([new/1, is_state/1]).
-export([add_member/3,
         del_member/3]).
-export([set_ensemble/3,
         update_ensemble/5,
         set_pending/4]).
-export([enable/1, enabled/1]).
-export([merge/2]).
-export([id/1, members/1, ensembles/1, pending/1]).

-include_lib("kernel/include/logger.hrl").

%% Ensemble id -> ensemble info.
-type ensembles() :: orddict(ensemble_id(), ensemble_info()).
%% Ensemble id -> versioned pending views.
-type pending() :: orddict(ensemble_id(), {vsn(), views()}).

%% Cluster state. The member set carries its own version, and each entry in
%% `ensembles' and `pending' carries a version as well.
-record(cluster_state, {id :: any(),
                        enabled :: boolean(),
                        members :: {vsn(), ordsets(node())},
                        ensembles :: ensembles(),
                        pending :: pending()
                       }).

-type state() :: #cluster_state{}.
-export_type([state/0]).

-define(STATE, #cluster_state).

%%%===================================================================

%% Create a disabled cluster state with the given id, no members (at the
%% initial version), no ensembles and no pending views.
-spec new(term()) -> state().
new(Id) ->
    ?STATE{id = Id,
           enabled = false,
           members = {vsn0(), ordsets:new()},
           ensembles = orddict:new(),
           pending = orddict:new()}.

%% True when the term is a cluster state record.
-spec is_state(term()) -> boolean().
is_state(?STATE{}) ->
    true;
is_state(_) ->
    false.

-spec enable(state()) -> {ok, state()} | error.
%% Mark a disabled state as enabled; enabling twice is an error.
enable(Disabled=?STATE{enabled=false}) ->
    {ok, Disabled?STATE{enabled=true}};
enable(?STATE{enabled=true}) ->
    error.

%% Whether the state has been enabled.
-spec enabled(state()) -> boolean().
enabled(?STATE{enabled=Flag}) ->
    Flag.

%% The id the state was created with.
-spec id(state()) -> term().
id(?STATE{id=ClusterId}) ->
    ClusterId.

%% The current member nodes, without their version.
-spec members(state()) -> ordsets(node()).
members(?STATE{members={_Vsn, MemberNodes}}) ->
    MemberNodes.

%% All known ensembles.
-spec ensembles(state()) -> ensembles().
ensembles(?STATE{ensembles=Known}) ->
    Known.

%% The pending views, including their versions.
-spec pending(state()) -> pending().
pending(?STATE{pending=PendingViews}) ->
    PendingViews.

%% Add Node to the member set, provided Vsn is newer than the version the
%% member set currently carries.
-spec add_member(vsn(), node(), state()) -> {ok, state()} | error.
add_member(Vsn, Node, State) ->
    change_members(Vsn, State,
                   fun(Nodes) -> ordsets:add_element(Node, Nodes) end).

%% Remove Node from the member set, provided Vsn is newer than the version
%% the member set currently carries.
-spec del_member(vsn(), node(), state()) -> {ok, state()} | error.
del_member(Vsn, Node, State) ->
    change_members(Vsn, State,
                   fun(Nodes) -> ordsets:del_element(Node, Nodes) end).

%% Apply Change to the member set and stamp it with Vsn when Vsn is newer
%% than the current member version; otherwise return `error'.
change_members(Vsn, State=?STATE{members={CurVsn, Nodes}}, Change) ->
    case newer(CurVsn, Vsn) of
        true ->
            {ok, State?STATE{members={Vsn, Change(Nodes)}}};
        false ->
            error
    end.

-spec set_ensemble(ensemble_id(), ensemble_info(), state()) -> {ok, state()} |
                                                               error.
%% Store Info for Ensemble when its version is newer than the stored one
%% (an unknown ensemble counts as having no version).
set_ensemble(Ensemble, Info, State=?STATE{ensembles=Ensembles}) ->
    NewVsn = Info#ensemble_info.vsn,
    StoredVsn = case orddict:find(Ensemble, Ensembles) of
                    error ->
                        undefined;
                    {ok, Stored} ->
                        Stored#ensemble_info.vsn
                end,
    case newer(StoredVsn, NewVsn) of
        false ->
            error;
        true ->
            Updated = orddict:store(Ensemble, Info, Ensembles),
            {ok, State?STATE{ensembles=Updated}}
    end.

%% Replace the version, leader and views of an ensemble that is already
%% known, when Vsn is newer than the stored version. Unknown ensembles and
%% stale versions yield `error'.
-spec update_ensemble(vsn(), ensemble_id(), peer_id(), views(), state())
                     -> {ok, state()} | error.
update_ensemble(Vsn, Ensemble, Leader, Views, State=?STATE{ensembles=Ensembles}) ->
    case orddict:find(Ensemble, Ensembles) of
        error ->
            error;
        {ok, Stored} ->
            case newer(Stored#ensemble_info.vsn, Vsn) of
                false ->
                    error;
                true ->
                    Refreshed = Stored#ensemble_info{vsn=Vsn, leader=Leader, views=Views},
                    Updated = orddict:store(Ensemble, Refreshed, Ensembles),
                    {ok, State?STATE{ensembles=Updated}}
            end
    end.

%% Record pending Views for Ensemble when Vsn is newer than the version of
%% the pending entry already stored (if any).
-spec set_pending(vsn(), ensemble_id(), views(), state()) -> {ok, state()} |
                                                             error.
set_pending(Vsn, Ensemble, Views, State=?STATE{pending=Pending}) ->
    StoredVsn = case orddict:find(Ensemble, Pending) of
                    error ->
                        undefined;
                    {ok, {PendingVsn, _}} ->
                        PendingVsn
                end,
    case newer(StoredVsn, Vsn) of
        false ->
            error;
        true ->
            Updated = orddict:store(Ensemble, {Vsn, Views}, Pending),
            {ok, State?STATE{pending=Updated}}
    end.

-spec merge(state(), state()) -> state().
%% Merge cluster state B into A. An enabled A ignores a B with a different
%% id; otherwise members, ensembles and pending views are merged entry by
%% entry, keeping whichever side has the newer version.
merge(A, B) when A?STATE.enabled and (A?STATE.id =/= B?STATE.id) ->
    ?LOG_WARNING("Ignoring cluster state with different id"),
    A;
merge(A=?STATE{}, B=?STATE{}) ->
    Members = merge_members(A?STATE.members, B?STATE.members),
    Ensembles = merge_ensembles(A?STATE.ensembles, B?STATE.ensembles),
    Pending = merge_pending(A?STATE.pending, B?STATE.pending),
    A?STATE{members=Members,
            ensembles=Ensembles,
            pending=Pending}.

%%%===================================================================

%% Keep the member set with the newer version.
merge_members(A={VsnA, _}, B={VsnB, _}) ->
    pick(newer(VsnA, VsnB), A, B).

merge_ensembles(EnsemblesA, EnsemblesB) ->
    orddict:merge(fun merge_ensemble/3, EnsemblesA, EnsemblesB).

%% Keep the ensemble info with the newer version.
merge_ensemble(_, InfoA, InfoB) ->
    pick(newer(InfoA#ensemble_info.vsn, InfoB#ensemble_info.vsn), InfoA, InfoB).

merge_pending(PendingA, PendingB) ->
    orddict:merge(fun merge_pending_views/3, PendingA, PendingB).

%% Keep the pending views with the newer version.
merge_pending_views(_, A={VsnA, _}, B={VsnB, _}) ->
    pick(newer(VsnA, VsnB), A, B).

%% Select B when it was found to be newer, A otherwise.
pick(true, _A, B) ->
    B;
pick(false, A, _B) ->
    A.

%% True when VsnB is strictly greater than VsnA; `undefined' counts as the
%% initial version.
newer(VsnA, VsnB) ->
    ensure_vsn(VsnB) > ensure_vsn(VsnA).

ensure_vsn(undefined) ->
    vsn0();
ensure_vsn(Vsn={_,_}) ->
    Vsn.

%% The initial version.
vsn0() ->
    {-1,0}.

--------------------------------------------------------------------------------
/src/riak_ensemble_storage.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | 21 | %% @doc 22 | %% This module implements a central storage manager for riak_ensemble. 23 | %% Previously, individual ensembles as well as the ensemble manager would 24 | %% independently save their own state to disk. However, such an approach 25 | %% scaled poorly as the number of independent ensembles increased. It was 26 | %% not uncommon to see thousands of synchronous writes issued to disk per 27 | %% second, overwhelming the I/O subsystem. To solve this issue, this storage 28 | %% manager was created. 29 | %% 30 | %% Rather than storing data independently, the storage manager combines the 31 | %% state from multiple ensembles as well as the ensemble manager into a 32 | %% single entity that is stored together in a single file. Since this file 33 | %% is now a critical single point of failure, the storage manager uses the 34 | %% new {@link riak_ensemble_save} logic to save this data to disk such that 35 | %% there are four redundant copies to recover from. 36 | %% 37 | %% This manager is also responsible for coalescing multiple writes together 38 | %% to reduce disk traffic. Individual writes are staged in an ETS table and 39 | %% then flushed to disk after a delay (eg. 50ms). 
%%
%% There are two ways to save data to disk that are used by other components
%% in riak_ensemble: synchronous and asynchronous.
%%
%% For synchronous writes components use the sequence:
%%   riak_ensemble_storage:put(Key, Data),
%%   riak_ensemble_storage:sync().
%% The sync() call then blocks until the data has successfully been written
%% to disk.
%%
%% For asynchronous writes, components simply use put() without sync(). The
%% data will then be written to disk either when another component calls sync,
%% or after next storage manager tick (eg. every 5 seconds).
%%

-module(riak_ensemble_storage).
-behaviour(gen_server).

%% API
-export([start_link/0]).
-export([get/1, put/2, sync/0]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

-define(ETS, ets_riak_ensemble_storage).
-define(SYNC_DELAY, riak_ensemble_config:storage_delay()).
-define(TICK, riak_ensemble_config:storage_tick()).

-type gen_server_from() :: any().

-record(state, {savefile :: file:filename(),
                waiting :: [gen_server_from()],
                previous :: binary() | undefined,
                timer :: reference() | undefined
               }).

-type state() :: #state{}.

%%%===================================================================
%%% API
%%%===================================================================

%% Start the storage server, registered locally under the module name.
start_link() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

%% Ask the server for a sync and wait, without a timeout, for its reply.
-spec sync() -> ok.
sync() ->
    gen_server:call(?MODULE, sync, infinity).

%% Stage Key/Value in the ETS table. This call does not write to disk
%% itself.
-spec put(term(), term()) -> true.
put(Key, Value) ->
    ets:insert(?ETS, {Key, Value}).

-spec get(term()) -> {ok, term()} | not_found.
%% Look Key up directly in the ETS table; when that fails for any reason,
%% ask the server instead.
get(Key) ->
    try ets:lookup_element(?ETS, Key, 2) of
        Value ->
            {ok, Value}
    catch
        _:_ ->
            %% Retry through the server in case data is being loaded
            gen_server:call(?MODULE, {get, Key}, infinity)
    end.

%%%===================================================================
%%% gen_server callbacks
%%%===================================================================

%% Create the ETS table, load any previously saved facts into it and start
%% the periodic tick.
init([]) ->
    {ok, Root} = application:get_env(riak_ensemble, data_root),
    File = filename:join([Root, "ensembles", "ensemble_facts"]),
    TableOpts = [named_table, public, {read_concurrency, true},
                 {write_concurrency, true}],
    _ = ets:new(?ETS, TableOpts),
    ok = restore_saved(riak_ensemble_save:read(File)),
    schedule_tick(),
    {ok, #state{savefile=File, waiting=[], timer=undefined}}.

%% Insert the entries decoded from a successful read; anything else leaves
%% the table empty.
restore_saved({ok, Bin}) ->
    true = ets:insert(?ETS, binary_to_term(Bin)),
    ok;
restore_saved(_) ->
    ok.

handle_call({get, Key}, _From, State) ->
    Found = case ets:lookup(?ETS, Key) of
                [{_, Value}] ->
                    {ok, Value};
                _ ->
                    not_found
            end,
    {reply, Found, State};

%% The caller is parked in `waiting' and answered by do_sync/1.
handle_call(sync, From, State=#state{waiting=Waiting}) ->
    Scheduled = maybe_schedule_sync(State),
    {noreply, Scheduled#state{waiting=[From|Waiting]}};

handle_call(_Request, _From, State) ->
    {reply, ok, State}.

handle_cast(_Msg, State) ->
    {noreply, State}.

handle_info(tick, State) ->
    Ticked = tick(State),
    schedule_tick(),
    {noreply, Ticked};

handle_info(do_sync, State) ->
    {noreply, do_sync(State)};

handle_info(_Info, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%===================================================================
%%% Internal functions
%%%===================================================================

%% Arrange for a `tick' message to arrive after ?TICK.
-spec schedule_tick() -> ok.
schedule_tick() ->
    _TimerRef = erlang:send_after(?TICK, self(), tick),
    ok.

%% Every tick requests a sync.
-spec tick(state()) -> state().
tick(State) ->
    maybe_schedule_sync(State).

%% Arm the sync timer unless one is already recorded in the state.
-spec maybe_schedule_sync(state()) -> state().
maybe_schedule_sync(Idle=#state{timer=undefined}) ->
    Idle#state{timer=erlang:send_after(?SYNC_DELAY, self(), do_sync)};
maybe_schedule_sync(AlreadyScheduled) ->
    AlreadyScheduled.

%% Serialise the whole ETS table, write it out unless it is identical to
%% the previously serialised data, then answer every waiting sync caller.
-spec do_sync(state()) -> state().
do_sync(State=#state{savefile=File, waiting=Waiting, previous=PrevData}) ->
    Data = term_to_binary(ets:tab2list(?ETS)),
    if
        Data =:= PrevData ->
            ok;
        true ->
            ok = riak_ensemble_save:write(File, Data)
    end,
    lists:foreach(fun(From) -> gen_server:reply(From, ok) end, Waiting),
    State#state{waiting=[], timer=undefined, previous=Data}.

--------------------------------------------------------------------------------
/src/riak_ensemble_sup.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied.
See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(riak_ensemble_sup).

-behaviour(supervisor).

%% API
-export([start_link/0, start_link/1]).

%% Supervisor callbacks
-export([init/1]).

%% Helper macro for declaring children of supervisor
-define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}).

%% ===================================================================
%% API functions
%% ===================================================================

%% Record Path as the riak_ensemble `data_root' and start the supervisor.
start_link(Path) ->
    ok = application:set_env(riak_ensemble, data_root, Path),
    start_link().

%% Start the supervisor, registered locally under the module name.
start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

%% ===================================================================
%% Supervisor callbacks
%% ===================================================================

%% Run the test and synctree ETS setup hooks, then declare the children in
%% start order under a rest_for_one strategy.
init([]) ->
    riak_ensemble_test:setup(),
    synctree_leveldb:init_ets(),
    Ordered = [{riak_ensemble_router_sup, supervisor},
               {riak_ensemble_storage, worker},
               {riak_ensemble_peer_sup, supervisor},
               {riak_ensemble_manager, worker}],
    Children = [?CHILD(Mod, Type) || {Mod, Type} <- Ordered],
    {ok, {{rest_for_one, 5, 10}, Children}}.

--------------------------------------------------------------------------------
/src/riak_ensemble_test.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License.
You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------

-module(riak_ensemble_test).

-compile(nowarn_export_all).
-compile(export_all).

-define(ETS_TEST, riak_ensemble_test).

-ifdef(TEST).

%% Create the public ETS table that holds the test hooks.
setup() ->
    TableOpts = [public, named_table, {read_concurrency, true},
                 {write_concurrency, true}],
    _ = ets:new(?ETS_TEST, TableOpts),
    ok.

%% True only when a `{drop, {Id, PeerId}}' entry is present in the test
%% table; a failing lookup counts as false.
maybe_drop(Id, PeerId) ->
    try ets:member(?ETS_TEST, {drop, {Id, PeerId}}) of
        true ->
            true;
        _ ->
            false
    catch
        _:_ ->
            false
    end.

-else.

%% Outside of test builds there is nothing to set up.
setup() ->
    ok.

%% Outside of test builds nothing is ever dropped.
maybe_drop(_, _) ->
    false.

-endif.

--------------------------------------------------------------------------------
/src/riak_ensemble_util.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License.
You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(riak_ensemble_util).
-export([replace_file/2,
         read_file/1,
         sha/1,
         md5/1,
         orddict_delta/2,
         shuffle/1,
         cast_unreliable/2]).

%%===================================================================

-type delta() :: {any(), any()} | {'$none', any()} | {any(), '$none'}.
-type orddict() :: orddict:orddict().

%%===================================================================

%% Replace the contents of FN with Data: write and sync FN ++ ".tmp",
%% rename it over FN, then read FN back and compare it with Data.
%% A failure after the temporary file has been opened is returned as
%% `{error, Reason}'; failing to open the temporary file crashes the caller.
-spec replace_file(file:filename(), iodata()) -> ok | {error, term()}.
replace_file(FN, Data) ->
    TmpFN = FN ++ ".tmp",
    {ok, FH} = file:open(TmpFN, [write, raw]),
    try
        ok = write_and_close(FH, Data),
        ok = file:rename(TmpFN, FN),
        {ok, Contents} = read_file(FN),
        true = (Contents == iolist_to_binary(Data)),
        ok
    catch _:Err ->
            {error, Err}
    end.

%% Write Data to FH, sync it and close it. The handle is closed even when
%% the write or the sync fails, so a failed replacement does not leak it;
%% in that case the write/sync error is the one returned.
write_and_close(FH, Data) ->
    Synced = case file:write(FH, Data) of
                 ok ->
                     file:sync(FH);
                 WriteErr ->
                     WriteErr
             end,
    case Synced of
        ok ->
            file:close(FH);
        _ ->
            _ = file:close(FH),
            Synced
    end.

%%===================================================================

%% @doc Similar to {@link file:read_file/1} but uses raw file I/O
-spec read_file(file:filename()) -> {ok, binary()} | {error, _}.
read_file(FName) ->
    case file:open(FName, [read, raw, binary]) of
        {ok, FD} ->
            Result = read_file(FD, []),
            ok = file:close(FD),
            case Result of
                {ok, IOList} ->
                    {ok, iolist_to_binary(IOList)};
                {error, _}=Err ->
                    Err
            end;
        {error,_}=Err ->
            Err
    end.

%% Read `FD' to end of file in 4096-byte requests. Chunks are pushed
%% onto `Chunks' newest-first and put back in file order at eof.
-spec read_file(file:fd(), [binary()]) -> {ok, [binary()]} | {error,_}.
read_file(FD, Chunks) ->
    collect_chunk(file:read(FD, 4096), FD, Chunks).

%% Dispatch on the outcome of a single file:read/2 call.
collect_chunk({ok, Chunk}, FD, Chunks) ->
    read_file(FD, [Chunk | Chunks]);
collect_chunk(eof, _FD, Chunks) ->
    {ok, lists:reverse(Chunks)};
collect_chunk({error, _} = Failure, _FD, _Chunks) ->
    Failure.

%%===================================================================

-ifndef(old_hash).

%% SHA digest of `Data' via crypto:hash/2.
-spec sha(iolist() | binary()) -> binary().
sha(Data) ->
    crypto:hash(sha, Data).

%% MD5 digest of `Data' via crypto:hash/2.
-spec md5(iolist() | binary()) -> binary().
md5(Data) ->
    crypto:hash(md5, Data).

-else.

%% SHA digest of `Data' via the older crypto:sha/1 entry point.
-spec sha(iolist() | binary()) -> binary().
sha(Data) ->
    crypto:sha(Data).

%% MD5 digest of `Data' via the older crypto:md5/1 entry point.
-spec md5(iolist() | binary()) -> binary().
md5(Data) ->
    crypto:md5(Data).

-endif.

%%===================================================================

%% @doc
%% Report the differences between two orddicts as a list of
%% `{Key, Delta}' pairs, where `Delta' is one of:
%%   {Val, '$none'}  :: key is only in `Left'
%%   {'$none', Val}  :: key is only in `Right'
%%   {Val1, Val2}    :: key is in both orddicts but values differ
%%
-spec orddict_delta(orddict(), orddict()) -> [{any(), delta()}].
orddict_delta(Left, Right) ->
    orddict_delta(Left, Right, []).

-spec orddict_delta(orddict(), orddict(), [{any(), delta()}]) -> [{any(), delta()}].
%% Walk both key-ordered lists in step. `Acc' collects the deltas
%% newest-first and is reversed once both inputs are exhausted.
orddict_delta([{K1,V1}|D1], [{K2,_}=E2|D2], Acc) when K1 < K2 ->
    %% K1 is missing from the second orddict.
    Acc2 = [{K1,{V1,'$none'}} | Acc],
    orddict_delta(D1, [E2|D2], Acc2);
orddict_delta([{K1,_}=E1|D1], [{K2,V2}|D2], Acc) when K1 > K2 ->
    %% K2 is missing from the first orddict.
    Acc2 = [{K2,{'$none',V2}} | Acc],
    orddict_delta([E1|D1], D2, Acc2);
orddict_delta([{K1,V1}|D1], [{_K2,V2}|D2], Acc) -> %K1 == K2
    case V1 of
        V2 ->
            orddict_delta(D1, D2, Acc);
        _ ->
            Acc2 = [{K1,{V1,V2}} | Acc],
            orddict_delta(D1, D2, Acc2)
    end;
orddict_delta([], [{K2,V2}|D2], Acc) ->
    Acc2 = [{K2,{'$none',V2}} | Acc],
    orddict_delta([], D2, Acc2);
orddict_delta([{K1,V1}|D1], [], Acc) ->
    Acc2 = [{K1,{V1,'$none'}} | Acc],
    orddict_delta(D1, [], Acc2);
orddict_delta([], [], Acc) ->
    lists:reverse(Acc).


%% @doc Return the elements of `L' in random order.
%%
%% Each element is tagged with a random float and the list is sorted
%% on that tag only. Float tags make collisions negligible, so the
%% elements' own term order no longer influences the result (integer
%% tags drawn from 1..length(L) collided often and skewed the
%% outcome towards the sorted order).
-spec shuffle([T]) -> [T].
shuffle([]) ->
    [];
shuffle(L=[_]) ->
    L;
shuffle(L) ->
    Tagged = [{rand:uniform(), E} || E <- L],
    [E || {_, E} <- lists:keysort(1, Tagged)].

%% Copied from riak_core_send_msg.erl
%%
%% Send `Request' to `Dest' wrapped as a `$gen_cast' message without
%% connecting to a remote node or suspending the caller.
cast_unreliable(Dest, Request) ->
    bang_unreliable(Dest, {'$gen_cast', Request}).

%% Best-effort send: any exception from erlang:send/3 is swallowed on
%% purpose and `Msg' is returned regardless of delivery.
bang_unreliable(Dest, Msg) ->
    catch erlang:send(Dest, Msg, [noconnect, nosuspend]),
    Msg.
--------------------------------------------------------------------------------
/src/synctree_ets.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License.
%% You may obtain
%% a copy of the License at
%%
%%   http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(synctree_ets).

-export([new/1,
         fetch/3,
         exists/2,
         store/3,
         store/2]).

-record(?MODULE, {ets :: ets:tid()}).
-define(STATE, #?MODULE).
-type state() :: ?STATE{}.

%% @doc Create a fresh, unnamed ETS table wrapped in the state record.
%% The options argument is ignored.
-spec new(_) -> state().
new(_) ->
    T = ets:new(?MODULE, []),
    ?STATE{ets=T}.

%% @doc Look up `Key'; returns `{ok, Value}' when present and
%% `{ok, Default}' otherwise.
-spec fetch(_, _, state()) -> {ok, _}.
fetch(Key, Default, ?STATE{ets=T}) ->
    case ets:lookup(T, Key) of
        [] ->
            {ok, Default};
        [{_, Value}] ->
            {ok, Value}
    end.

%% @doc True when `Key' is present in the table.
-spec exists(_, state()) -> boolean().
exists(Key, ?STATE{ets=T}) ->
    ets:member(T, Key).

%% @doc Store a single `Key'/`Val' pair, replacing any previous value.
-spec store(_, _, state()) -> state().
store(Key, Val, State=?STATE{ets=T}) ->
    _ = ets:insert(T, {Key, Val}),
    State.

%% @doc Apply a batch of `{put, Key, Val}' / `{delete, Key}' updates.
%%
%% Updates are applied one at a time in list order, so when a key
%% occurs more than once the last update wins. (Passing a list with
%% duplicate keys to a single ets:insert/2 call leaves the surviving
%% object undefined for `set' tables.)
-spec store([{put, _, _} | {delete, _}], state()) -> state().
store(Updates, State=?STATE{ets=T}) ->
    lists:foreach(fun({put, Key, Val}) ->
                          _ = ets:insert(T, {Key, Val}),
                          ok;
                     ({delete, Key}) ->
                          _ = ets:delete(T, Key),
                          ok
                  end, Updates),
    State.
67 | -------------------------------------------------------------------------------- /src/synctree_leveldb.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %% ------------------------------------------------------------------- 20 | -module(synctree_leveldb). 21 | 22 | -export([init_ets/0, 23 | new/1, 24 | fetch/3, 25 | exists/2, 26 | store/3, 27 | store/2]). 28 | 29 | -record(?MODULE, {id :: binary(), 30 | db :: any(), 31 | path :: term()}). 32 | 33 | -define(STATE, #?MODULE). 34 | -type state() :: ?STATE{}. 35 | 36 | -define(RETRIES, 10). 37 | 38 | %% ------------------------------------------------------------------- 39 | 40 | %% Prefix bytes used to tag keys stored in LevelDB to allow for easy 41 | %% evolution of the storage format. 42 | 43 | %% Key representing {Bucket, Level} data 44 | -define(K_BUCKET, 0). 45 | 46 | %% ------------------------------------------------------------------- 47 | 48 | %% @doc 49 | %% Called by {@link riak_ensemble_sup} to create the public ETS table used 50 | %% to keep track of shared LevelDB references. 
%% Having riak_ensemble_sup
%% own the ETS table ensures it survives as long as riak_ensemble is up.
%% The table is named after this module and is created `public'.
-spec init_ets() -> ok.
init_ets() ->
    _ = ets:new(?MODULE, [named_table, set, public,
                          {read_concurrency, true},
                          {write_concurrency, true}]),
    ok.

%% @doc Build the backend state from `Opts': resolve the on-disk path,
%% obtain a (possibly shared) LevelDB handle for it and record the
%% tree id. Crashes with badmatch if no handle can be obtained.
-spec new(_) -> state().
new(Opts) ->
    Path = get_path(Opts),
    {ok, DB} = maybe_open_leveldb(Path, ?RETRIES),
    Id = get_tree_id(Opts),
    ?STATE{id=Id, path=Path, db=DB}.

%% Return `{ok, DB}' for `Path', reusing a handle already registered in
%% the shared ETS table when there is one. A failed open is retried
%% after a 100 ms sleep while `Retries' is positive; each retry starts
%% again from the ETS lookup. Once `Retries' reaches 0 a failed open
%% matches no clause and raises case_clause.
maybe_open_leveldb(Path, Retries) ->
    %% Check if we have already opened this LevelDB instance, which can
    %% occur when peers are sharing the same on-disk instance.
    case ets:lookup(?MODULE, Path) of
        [{_, DB}] ->
            {ok, DB};
        _ ->
            ok = filelib:ensure_dir(Path),
            case eleveldb:open(Path, leveldb_opts()) of
                {ok, DB} ->
                    %% If eleveldb:open succeeded, we should have the only ref
                    true = ets:insert_new(?MODULE, {Path, DB}),
                    {ok, DB};
                _ when Retries > 0 ->
                    timer:sleep(100),
                    maybe_open_leveldb(Path, Retries - 1)
            end
    end.


%% Use the `path' option when given; otherwise derive a path under
%% "/tmp/ST" named after the current os:timestamp() in microseconds.
get_path(Opts) ->
    case proplists:get_value(path, Opts) of
        undefined ->
            Base = "/tmp/ST",
            Name = integer_to_list(timestamp(os:timestamp())),
            filename:join(Base, Name);
        Path ->
            Path
    end.

%% Use the `tree_id' option when given (it must be a binary, anything
%% else raises case_clause); defaults to the empty binary.
get_tree_id(Opts) ->
    case proplists:get_value(tree_id, Opts) of
        undefined ->
            <<>>;
        Id when is_binary(Id) ->
            Id
    end.

%% Convenience wrapper accepting the key as a `{Level, Bucket}' tuple.
db_key(Id, {Level, Bucket}) ->
    db_key(Id, Level, Bucket).

%% Build the LevelDB key for a tree id, level and bucket.
%% NOTE(review): the binary construction on the last line of this
%% function appears to have been lost when this text was extracted
%% (only `<>' remains, and `BucketBin' is otherwise unused). Restore
%% it from the upstream source before compiling -- do not guess the
%% layout, it defines the on-disk key format.
db_key(Id, Level, Bucket) when is_integer(Level), is_integer(Bucket) ->
    BucketBin = binary:encode_unsigned(Bucket),
    <>.

-spec fetch(_, _, state()) -> {ok, _}.
%% Read the value stored for `{Level, Bucket}'. `Default' is returned
%% when the key is absent, when the read fails, or when the stored
%% binary cannot be decoded with binary_to_term/1.
fetch({Level, Bucket}, Default, ?STATE{id=Id, db=DB}) ->
    DBKey = db_key(Id, Level, Bucket),
    case eleveldb:get(DB, DBKey, []) of
        {ok, Bin} ->
            try
                {ok, binary_to_term(Bin)}
            catch
                _:_ -> {ok, Default}
            end;
        _ ->
            {ok, Default}
    end.

%% @doc True when a value is stored for `{Level, Bucket}'. Any result
%% other than `{ok, _}' from the read counts as "not present".
-spec exists(_, state()) -> boolean().
exists({Level, Bucket}, ?STATE{id=Id, db=DB}) ->
    DBKey = db_key(Id, Level, Bucket),
    case eleveldb:get(DB, DBKey, []) of
        {ok, _} ->
            true;
        _ ->
            false
    end.

%% @doc Store `Val' (serialised with term_to_binary/1) under
%% `{Level, Bucket}'. The state is returned unchanged.
-spec store(_, _, state()) -> state().
store({Level, Bucket}, Val, State=?STATE{id=Id, db=DB}) ->
    DBKey = db_key(Id, Level, Bucket),
    %% Intentionally ignore errors (TODO: Should we?)
    _ = eleveldb:put(DB, DBKey, term_to_binary(Val), []),
    State.

%% @doc Apply a batch of `{put, Key, Val}' / `{delete, Key}' updates,
%% where each `Key' is a `{Level, Bucket}' tuple, in a single
%% eleveldb:write/3 call. The spec now reflects the update shapes the
%% clauses below actually match.
-spec store([{put, _, _} | {delete, _}], state()) -> state().
store(Updates, State=?STATE{id=Id, db=DB}) ->
    %% TODO: Should we sort first? Doesn't LevelDB do that automatically in memtable?
    DBUpdates = [case Update of
                     {put, Key, Val} ->
                         {put, db_key(Id, Key), term_to_binary(Val)};
                     {delete, Key} ->
                         {delete, db_key(Id, Key)}
                 end || Update <- Updates],
    %% Intentionally ignore errors (TODO: Should we?)
    _ = eleveldb:write(DB, DBUpdates, []),
    State.

%% Convert an os:timestamp()-style `{Mega, Secs, Micro}' triple to
%% microseconds.
timestamp({Mega, Secs, Micro}) ->
    Mega*1000*1000*1000*1000 + Secs * 1000 * 1000 + Micro.

%% Options passed to eleveldb:open/2.
leveldb_opts() ->
    [{is_internal_db, true},
     {write_buffer_size, 4 * 1024 * 1024},
     {use_bloomfilter, true},
     {create_if_missing, true}].

--------------------------------------------------------------------------------
/src/synctree_orddict.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%%   http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(synctree_orddict).

-export([new/1,
         fetch/3,
         exists/2,
         store/3,
         store/2]).

-record(?MODULE, {data :: [{_,_}]}).
-define(STATE, #?MODULE).
-type state() :: ?STATE{}.

%% @doc Create an empty orddict-backed state. The options argument is
%% ignored.
-spec new(_) -> state().
new(_) ->
    L = orddict:new(),
    ?STATE{data=L}.

%% @doc Look up `Key'; returns `{ok, Value}' when present and
%% `{ok, Default}' otherwise.
-spec fetch(_, _, state()) -> {ok,_}.
fetch(Key, Default, ?STATE{data=L}) ->
    case orddict:find(Key, L) of
        error ->
            {ok, Default};
        {ok, Value} ->
            {ok, Value}
    end.

%% @doc True when `Key' is present.
-spec exists(_, state()) -> boolean().
exists(Key, ?STATE{data=L}) ->
    lists:keymember(Key, 1, L).

%% @doc Store a single `Key'/`Val' pair, replacing any previous value.
-spec store(_, _, state()) -> state().
store(Key, Val, State=?STATE{data=L}) ->
    L2 = orddict:store(Key, Val, L),
    State?STATE{data=L2}.

%% @doc Apply a batch of `{put, Key, Val}' / `{delete, Key}' updates.
%%
%% When a key occurs more than once in the batch the last update
%% wins. Deletions are represented internally by the value `deleted'
%% and filtered out after the merge, so a key whose final value is
%% the atom `deleted' is dropped.
-spec store([{put, _, _} | {delete, _}], state()) -> state().
store(Updates, State=?STATE{data=L}) ->
    Inserts = [case Update of
                   {put, Key, Val} ->
                       {Key, Val};
                   {delete, Key} ->
                       {Key, deleted}
               end || Update <- Updates],
    %% lists:ukeymerge/3 requires key-sorted input without duplicate
    %% keys. ukeysort/2 keeps the first tuple of each run of equal
    %% keys, so sorting the reversed batch keeps the LAST update for
    %% every key.
    Latest = lists:ukeysort(1, lists:reverse(Inserts)),
    %% On equal keys ukeymerge/3 picks the tuple from the first list,
    %% i.e. the update overrides the stored entry.
    L2 = lists:ukeymerge(1, Latest, L),
    L3 = [X || X={_, Val} <- L2,
               Val =/= deleted],
    State?STATE{data=L3}.
--------------------------------------------------------------------------------
/test/TESTS:
--------------------------------------------------------------------------------
basic_test
drop_write_test
expand_test
synctree_pure
synctree_remote
synctree_eqc
corrupt_upper_test
corrupt_segment_test
corrupt_exchange_test
corrupt_follower_test
synctree_path_test
lease_test
ensemble_tests_pure
replace_members_test
read_tombstone_test
leadership_watchers
--------------------------------------------------------------------------------
/test/basic_test.erl:
--------------------------------------------------------------------------------
%% Basic put/get scenario that keeps reading while the root leader is
%% suspended and after it is resumed.
-module(basic_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% EUnit generator: runs scenario/0 inside the ens_test fixture with a
%% timeout of 40.
run_test_() ->
    ens_test:run(fun scenario/0, 40).

%% Start a 3-member root ensemble, write and read a key, then check
%% the key is still readable with the leader process suspended and
%% again after resuming it.
scenario() ->
    ens_test:start(3),
    ens_test:wait_stable(root),
    {ok, _} = ens_test:kput(<<"test">>, <<"test">>),
    {ok, _} = ens_test:kget(<<"test">>),
    %% Result deliberately not asserted.
    ens_test:kget(cluster_state),
    Pid = riak_ensemble_manager:get_leader_pid(root),
    ?debugMsg("Suspending leader"),
    erlang:suspend_process(Pid),
    %% Wait until the ensemble reports stable again with the old
    %% leader frozen.
    ens_test:wait_stable(root),
    ?debugMsg("Performing get"),
    {ok, _} = ens_test:kget(<<"test">>),
    ?debugMsg("Resuming leader"),
    erlang:resume_process(Pid),
    ?debugMsg("Performing get"),
    {ok, _} = ens_test:kget(<<"test">>),
    ok.
--------------------------------------------------------------------------------
/test/corrupt_exchange_test.erl:
--------------------------------------------------------------------------------
-module(corrupt_exchange_test).
-compile([export_all, nowarn_export_all]).

%% EUnit generator: runs scenario/0 inside the ens_test fixture with a
%% timeout of 120.
run_test_() ->
    ens_test:run(fun scenario/0, 120).

%% Install the `corrupt_segment_all' intercept on synctree m_store/2,
%% write a key, wait 10 s, restore `m_store_normal' and then poll
%% until the key can be read again.
scenario() ->
    ens_test:start(3),
    rt_intercept:add(node(), {synctree, [{{m_store,2}, corrupt_segment_all}]}),
    io:format(user, "Leader = ~p~n", [riak_ensemble_manager:get_leader(root)]),
    {ok, _} = ens_test:kput(<<"corrupt">>, <<"test">>),
    %% The read result is only printed, not asserted.
    io:format(user, "~p~n", [ens_test:kget(<<"corrupt">>)]),
    timer:sleep(10000),
    rt_intercept:add(node(), {synctree, [{{m_store,2}, m_store_normal}]}),
    ens_test:read_until(<<"corrupt">>),
    ok.

%% detect corruption
%% wait for all to be trusted again / corruption heals
%% wait for successful read
%% fail if we ever get notfound / etc

--------------------------------------------------------------------------------
/test/corrupt_follower_test.erl:
--------------------------------------------------------------------------------
-module(corrupt_follower_test).
-compile([export_all, nowarn_export_all]).

%% EUnit generator: runs scenario/0 inside the ens_test fixture with a
%% timeout of 120.
run_test_() ->
    ens_test:run(fun scenario/0, 120).

%% Write a key twice under the `corrupt_segment_follower' intercept,
%% restore `m_store_normal', suspend the leader for 2 s, and check the
%% key is readable 10 s after the ensemble is stable again.
scenario() ->
    ens_test:start(3),
    rt_intercept:add(node(), {synctree, [{{m_store,2}, corrupt_segment_follower}]}),
    Pid = riak_ensemble_manager:get_leader_pid(root),
    io:format(user, "Leader = ~p~n", [riak_ensemble_manager:get_leader(root)]),
    {ok, _} = ens_test:kput(<<"corrupt">>, <<"test">>),
    {ok, _} = ens_test:kput(<<"corrupt">>, <<"test2">>),
    io:format(user, "~p~n", [ens_test:kget(<<"corrupt">>)]),

    rt_intercept:add(node(), {synctree, [{{m_store,2}, m_store_normal}]}),
    %% Freeze the leader captured above for two seconds.
    erlang:suspend_process(Pid),
    timer:sleep(2000),
    erlang:resume_process(Pid),
    ens_test:wait_stable(root),

    %% timer:sleep(10000),
    %% rt_intercept:add(node(), {synctree, [{{m_store,2}, m_store_normal}]}),
    timer:sleep(10000),
    {ok, _} = ens_test:kget(<<"corrupt">>),
    ok.

%% detect corruption
%% wait for all to be trusted again / corruption heals
%% wait for successful read
%% fail if we ever get notfound / etc

--------------------------------------------------------------------------------
/test/corrupt_segment_test.erl:
--------------------------------------------------------------------------------
-module(corrupt_segment_test).
-compile([export_all, nowarn_export_all]).

%% EUnit generator: runs scenario/0 inside the ens_test fixture with
%% the default timeout of ens_test:run/1.
run_test_() ->
    ens_test:run(fun scenario/0).

%% Write a key under the `corrupt_segment' intercept, print eleven
%% reads (one immediately, ten at one-second intervals), wait another
%% 10 s and then require a successful read. The intercept is left
%% installed for the whole scenario.
scenario() ->
    ens_test:start(3),
    rt_intercept:add(node(), {synctree, [{{m_store,2}, corrupt_segment}]}),
    io:format(user, "Leader = ~p~n", [riak_ensemble_manager:get_leader(root)]),
    {ok, _} = ens_test:kput(<<"corrupt">>, <<"test">>),
    %% rt_intercept:add(node(), {synctree, [{{m_store,2}, m_store_normal}]}),
    io:format(user, "~p~n", [ens_test:kget(<<"corrupt">>)]),
    %% These reads are printed only; their results are not asserted.
    [begin
         timer:sleep(1000),
         io:format(user, "~p~n", [ens_test:kget(<<"corrupt">>)])
     end || _ <- lists:seq(1,10)],

    timer:sleep(10000),
    {ok, _} = ens_test:kget(<<"corrupt">>),
    ok.

%% detect corruption
%% wait for all to be trusted again / corruption heals
%% wait for successful read
%% fail if we ever get notfound / etc

--------------------------------------------------------------------------------
/test/corrupt_upper_test.erl:
--------------------------------------------------------------------------------
-module(corrupt_upper_test).
-compile([export_all, nowarn_export_all]).

%% EUnit generator: runs scenario/0 inside the ens_test fixture with
%% the default timeout of ens_test:run/1.
run_test_() ->
    ens_test:run(fun scenario/0).

%% Write a key on a 5-member ensemble under the `corrupt_upper'
%% intercept, print one read and wait 20 s. Only the put is asserted.
scenario() ->
    ens_test:start(5),
    rt_intercept:add(node(), {synctree, [{{m_store,2}, corrupt_upper}]}),
    {ok, _} = ens_test:kput(<<"corrupt">>, <<"test">>),
    %% rt_intercept:add(node(), {synctree, [{{m_store,2}, m_store_normal}]}),
    io:format(user, "~p~n", [ens_test:kget(<<"corrupt">>)]),
    timer:sleep(20000),
    ok.

%% detect corruption / not-trusted
%% wait until all peers (esp. root) are trusted
%% check some keys?

--------------------------------------------------------------------------------
/test/drop_write_test.erl:
--------------------------------------------------------------------------------
-module(drop_write_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% EUnit generator: runs scenario/0 inside the ens_test fixture with
%% the default timeout of ens_test:run/1.
run_test_() ->
    ens_test:run(fun scenario/0).

%% Install the `drop_put' intercept on riak_ensemble_basic_backend
%% put/4 before starting a 5-member ensemble, write and read a key,
%% then suspend the leader until the ensemble is stable again, resume
%% it and poll until the key is readable.
scenario() ->
    rt_intercept:add(node(), {riak_ensemble_basic_backend, [{{put,4}, drop_put}]}),
    ens_test:start(5),
    Pid = riak_ensemble_manager:get_leader_pid(root),
    {ok, _} = ens_test:kput(<<"drop">>, <<"test">>),
    {ok, _} = ens_test:kget(<<"drop">>),
    erlang:suspend_process(Pid),
    ens_test:wait_stable(root),
    erlang:resume_process(Pid),
    ens_test:read_until(<<"drop">>),
    ok.
--------------------------------------------------------------------------------
/test/ens_test.erl:
--------------------------------------------------------------------------------
%% Shared fixture and helper functions for the ensemble test modules.
-module(ens_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% Data root handed to riak_ensemble; removed at the start of every run.
-define(TEST_DIR, "_build/test/test-tmp").

%% Wrap `Test' in the fixture with a timeout of 45.
run(Test) ->
    %% run(Test, 5*60).
    run(Test, 45).

%% Build an EUnit `setup' fixture around `Test'. Setup loads crypto and
%% riak_ensemble, deletes ?TEST_DIR, points the riak_ensemble
%% `data_root' at it and starts the application with its dependencies.
%% Cleanup stops riak_ensemble. `Test' runs under `{timeout, Timeout, _}'.
run(Test, Timeout) ->
    {setup,
     fun() ->
             application:load(crypto),
             application:load(riak_ensemble),
             os:cmd("rm -rf " ++ ?TEST_DIR),
             application:set_env(riak_ensemble, data_root, ?TEST_DIR),
             {ok, _} = application:ensure_all_started(riak_ensemble),
             ok
     end,
     fun(_) ->
             application:stop(riak_ensemble)
     end,
     {timeout, Timeout, Test}}.

%% Enable the ensemble manager on this node, assert that the root
%% ensemble has exactly the member `{root, node()}' and wait until it
%% is stable.
start() ->
    Node = node(),
    riak_ensemble_manager:enable(),
    [{root, Node}] = riak_ensemble_manager:get_members(root),
    ens_test:wait_stable(root),
    ok.

%% start/0 followed by expand(N).
start(N) ->
    start(),
    expand(N).

%% Add members `{2, node()}' .. `{N, node()}' to the root ensemble via
%% its current leader, then block until the membership is visible and
%% the ensemble is stable.
expand(N) ->
    Self = node(),
    Added = [{Id, Self} || Id <- lists:seq(2,N)],
    Leader = riak_ensemble_manager:get_leader_pid(root),
    riak_ensemble_peer:update_members(Leader, [{add, M} || M <- Added], 5000),
    ens_test:wait_stable(root),

    ens_test:wait_members(root, [{root, Self} | Added]),
    ens_test:wait_stable(root),
    ok.

%% Poll check_stable/1 until it reports `true'.
wait_stable(Ensemble) ->
    case check_stable(Ensemble) of
        false ->
            wait_stable(Ensemble);
        true ->
            ok
    end.

%% `true' only when the ensemble has quorum and its views are reported
%% stable; both checks use a 1000 timeout.
check_stable(Ensemble) ->
    case riak_ensemble_manager:check_quorum(Ensemble, 1000) of
        false ->
            false;
        true ->
            riak_ensemble_peer:stable_views(Ensemble, 1000) =:= {ok, true}
    end.

%% Poll once a second until every member in `Expected' is listed by
%% the manager for `Ensemble'.
wait_members(Ensemble, Expected) ->
    case Expected -- riak_ensemble_manager:get_members(Ensemble) of
        [] ->
            ok;
        [_|_] ->
            timer:sleep(1000),
            wait_members(Ensemble, Expected)
    end.

%% Overwrite `Key' with `Val' in the root ensemble (5000 timeout).
kput(Key, Val) ->
    riak_ensemble_client:kover(root, Key, Val, 5000).

%% Read `Key' from the root ensemble (5000 timeout).
kget(Key) ->
    riak_ensemble_client:kget(root, Key, 5000).

%% Read `Key' from the root ensemble through this node, passing `Opts'.
kget(Key, Opts) ->
    riak_ensemble_client:kget(node(), root, Key, 5000, Opts).

%% Retry the read every 100 ms while it returns `{error, _}'. The
%% first successful read must not carry the value `notfound'.
read_until(Key) ->
    case ens_test:kget(Key) of
        {error, _} ->
            timer:sleep(100),
            read_until(Key);
        {ok, Obj} ->
            Value = riak_ensemble_basic_backend:obj_value(Obj),
            ?assert(Value =/= notfound),
            ok
    end.

%% @doc Helper for building test predicates: calls `Fun' repeatedly
%% until it returns `true' or the retry budget (50 attempts, 100 ms
%% apart) is used up.
wait_until(Fun) when is_function(Fun) ->
    wait_until(Fun, 50, 100).

%% Call `Fun' up to `Retry' times, sleeping `Delay' ms between
%% attempts. Returns `ok' as soon as `Fun' returns `true'; if the last
%% allowed attempt returns anything else, returns `{fail, Res}' with
%% that result. Only defined for `Retry > 0'.
wait_until(Fun, Retry, Delay) when Retry > 0 ->
    Res = Fun(),
    case Res of
        true ->
            ok;
        _ when Retry == 1 ->
            {fail, Res};
        _ ->
            timer:sleep(Delay),
            wait_until(Fun, Retry-1, Delay)
    end.
--------------------------------------------------------------------------------
/test/ensemble_tests_pure.erl:
--------------------------------------------------------------------------------
%% Various pure tests
-module(ensemble_tests_pure).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% Wrap function `X' of this module as an EUnit test with a 60 timeout.
-define(TEST(X), {timeout, 60, {test, ?MODULE, X}}).

run_test_() ->
    [?TEST(test_monotonic_time)].

%% Sample both clock functions, sleep one second and sample again. The
%% first pair must have advanced by at least 1000000000 and the `_ms'
%% pair by at least 1000.
test_monotonic_time() ->
    {ok, N1} = riak_ensemble_clock:monotonic_time(),
    {ok, M1} = riak_ensemble_clock:monotonic_time_ms(),
    timer:sleep(1000),
    {ok, N2} = riak_ensemble_clock:monotonic_time(),
    {ok, M2} = riak_ensemble_clock:monotonic_time_ms(),
    ?assert((N2 - N1) >= 1000000000),
    ?assert((M2 - M1) >= 1000),
    ok.
--------------------------------------------------------------------------------
/test/expand_test.erl:
--------------------------------------------------------------------------------
-module(expand_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% EUnit generator: runs scenario/0 inside the ens_test fixture with a
%% timeout of 40.
run_test_() ->
    ens_test:run(fun scenario/0, 40).

%% Write a key on a single-member ensemble, expand to three members,
%% read with the `read_repair' option, then suspend the leader and
%% check the key is still readable once the ensemble is stable. The
%% leader is not resumed by this scenario.
scenario() ->
    ens_test:start(1),
    ens_test:wait_stable(root),
    {ok, _} = ens_test:kput(<<"test">>, <<"test">>),
    {ok, _} = ens_test:kget(<<"test">>),
    ens_test:expand(3),
    ens_test:wait_stable(root),
    %% Should trigger read repair
    {ok, _} = ens_test:kget(<<"test">>, [read_repair]),
    Pid = riak_ensemble_manager:get_leader_pid(root),
    ?debugMsg("Suspending leader"),
    erlang:suspend_process(Pid),
    ens_test:wait_stable(root),
    {ok, _} = ens_test:kget(<<"test">>),
    ok.
23 | 24 | -------------------------------------------------------------------------------- /test/intercept.erl: -------------------------------------------------------------------------------- 1 | %% ------------------------------------------------------------------- 2 | %% 3 | %% Copyright (c) 2015 Basho Technologies, Inc. 4 | %% 5 | %% This file is provided to you under the Apache License, 6 | %% Version 2.0 (the "License"); you may not use this file 7 | %% except in compliance with the License. You may obtain 8 | %% a copy of the License at 9 | %% 10 | %% http://www.apache.org/licenses/LICENSE-2.0 11 | %% 12 | %% Unless required by applicable law or agreed to in writing, 13 | %% software distributed under the License is distributed on an 14 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | %% KIND, either express or implied. See the License for the 16 | %% specific language governing permissions and limitations 17 | %% under the License. 18 | %% 19 | %%------------------------------------------------------------------- 20 | 21 | -module(intercept). 22 | %% Export explicit API but also send compile directive to export all 23 | %% because some of these private functions are useful in their own 24 | %% right. 25 | -export([add/3, add/4, clean/1]). 26 | -compile([export_all, nowarn_export_all]). 27 | 28 | -type abstract_code() :: term(). 29 | -type form() :: term(). 30 | -type proplist(K, V) :: proplists:proplist(K, V). 31 | -type fun_name() :: atom(). 32 | -type fun_type() :: fun_name() | tuple(). 33 | -type target_fun() :: {fun_name(), arity()}. 34 | -type intercept_fun() :: fun_type(). 35 | -type mapping() :: proplist(target_fun(), intercept_fun()). 36 | -type form_mod() :: fun((form()) -> form()). 37 | -type code_mod() :: fun((form(), abstract_code()) -> abstract_code()). 38 | 39 | %% The "original" is the `Target' module with the suffix `_orig'. It 40 | %% is where original code for the `Target' module resides after 41 | %% intercepts are added. 
-define(ORIGINAL(Mod), list_to_atom(atom_to_list(Mod) ++ "_orig")).
-define(FAKE_LINE_NO,1).

%% @doc Install intercepts on the `Target' module.
%%
%% `Target'    - module whose calls are intercepted,
%%               e.g. `hashtree'.
%%
%% `Intercept' - module holding the intercept functions,
%%               e.g. `hashtree_intercepts'.
%%
%% `Mapping'   - proplist from target `{Function, Arity}' to the
%%               intercept to call instead,
%%               e.g. `[{{update_perform,2}, sleep_update_perform}]'.
%%
%% `OutDir'    - when a string, the generated beam files are also
%%               written to this directory.
-spec add(module(), module(), mapping(), string()) -> ok.
add(Target, Intercept, Mapping, OutDir) ->
    OrigMod = ?ORIGINAL(Target),
    SourceForms = get_abstract_code(Target),
    ProxyForms = make_proxy_abstract_code(Target, Intercept, Mapping,
                                          OrigMod, SourceForms),
    OrigForms = make_orig_abstract_code(Target, OrigMod, SourceForms),
    %% Load order is kept: the renamed original first, then the proxy.
    lists:foreach(fun({Mod, Forms}) ->
                          ok = compile_and_load(Mod, Forms, OutDir)
                  end,
                  [{OrigMod, OrigForms}, {Target, ProxyForms}]).

%% Same as add/4 without writing beam files to disk.
-spec add(module(), module(), mapping()) -> ok.
add(Target, Intercept, Mapping) ->
    add(Target, Intercept, Mapping, undefined).

%% @doc Purge the proxy and the backed-up original module, then reload
%% `Target' from the code path.
-spec clean(module()) -> ok|{error, term()}.
clean(Target) ->
    _ = [code:purge(Mod) || Mod <- [Target, ?ORIGINAL(Target)]],
    case code:load_file(Target) of
        {module, Target} ->
            ok;
        {error, _} = Failure ->
            Failure
    end.

%% @private
%%
%% @doc Compile the abstract code `AC' (with debug_info) and load the
%% result into the code server. When `OutDir' is a list the beam is
%% also written there; the outcome of that write is ignored.
-spec compile_and_load(module(), abstract_code(), undefined | string()) -> ok.
compile_and_load(Module, AC, OutDir) ->
    {ok, Module, Beam} = compile:forms(AC,[debug_info]),
    Name = atom_to_list(Module),
    _ = case is_list(OutDir) of
            true ->
                file:write_file(filename:join(OutDir, Name ++ ".beam"), Beam);
            false ->
                false
        end,
    {module, Module} = code:load_binary(Module, Name, Beam),
    ok.

%% @private
%%
%% @doc Abstract code for the `_orig' copy of `Target': the module is
%% renamed to `OrigName', every function is moved/forwarded by
%% move_all_funs/2 and everything is exported.
-spec make_orig_abstract_code(module(), module(), abstract_code()) ->
                                     abstract_code().
make_orig_abstract_code(Target, OrigName, TargetAC) ->
    Renamed = change_module_name(OrigName, TargetAC),
    Moved = move_all_funs(Target, Renamed),
    export_all(Moved).

%% @private
%%
%% @doc Abstract code for the proxy that takes the place of the
%% original module. Every function first forwards to `Original';
%% functions listed in `Mapping' are then redirected to `Intercept'.
-spec make_proxy_abstract_code(module(), module(), mapping(),
                               module(), abstract_code()) ->
                                      abstract_code().
make_proxy_abstract_code(Target, Intercept, Mapping, Original, TargetAC) ->
    Forwarded = forward_all(Original, TargetAC),
    Exported = export_all(change_module_name(Target, Forwarded)),
    apply_intercepts(Exported, Intercept, Mapping).


%% @private
%%
%% @doc Rewrite the functions of `AC' selected by `Mapping' so that
%% they call into `Intercept'.
-spec apply_intercepts(abstract_code(), module(), mapping()) -> abstract_code().
apply_intercepts(AC, Intercept, Mapping) ->
    Rewrite = mapping_fun(Intercept, Mapping),
    apply_to_funs(Rewrite, AC).

%% @private
%%
%% @doc Build the form modifier used by apply_intercepts/3: a function
%% form whose `{Name, Arity}' appears in `Mapping' is forwarded to the
%% mapped intercept in `Intercept'; any other form is returned as is.
-spec mapping_fun(module(), proplists:proplist()) -> form_mod().
mapping_fun(Intercept, Mapping) ->
    fun(Form) ->
            NameArity = {fun_name(Form), fun_arity(Form)},
            case proplists:get_value(NameArity, Mapping, '$none') of
                '$none' ->
                    Form;
                Replacement ->
                    forward(Intercept, Replacement, Form)
            end
    end.

%% @private
%%
%% @doc Rewrite the abstract code `AC' so that every function forwards
%% to `Module', keeping each original definition under the function's
%% name with an `_orig' suffix.
-spec move_all_funs(module(), abstract_code()) -> abstract_code().
move_all_funs(Module, AC) ->
    Folder = move_all_funs(Module),
    lists:reverse(lists:foldl(Folder, [], AC)).

%% @private
%%
%% @doc Build the fold function used by move_all_funs/2. Non-function
%% forms are passed through. For a function form two forms are
%% emitted: a copy renamed with the `_orig' suffix that keeps the
%% body, and a same-named stub forwarding to `ModuleName'.
-spec move_all_funs(module()) -> code_mod().
move_all_funs(ModuleName) ->
    fun(Form, Acc) ->
            case is_fun(Form) of
                true ->
                    FunName = fun_name(Form),
                    %% Keep the current body under the renamed function ...
                    Kept = setelement(3, Form, ?ORIGINAL(FunName)),
                    %% ... and make the original name call `ModuleName'.
                    Stub = forward(ModuleName, FunName, Form),
                    [Stub, Kept | Acc];
                false ->
                    [Form | Acc]
            end
    end.

%% @private
%%
%% @doc Rewrite every function definition in `AC' to forward to the
%% function of the same name with an `_orig' suffix in `Module'.
-spec forward_all(module(), abstract_code()) -> abstract_code().
forward_all(Module, AC) ->
    apply_to_funs(fun(Form) ->
                          forward(Module, ?ORIGINAL(fun_name(Form)), Form)
                  end,
                  AC).

%% @private
%%
%% @doc Rewrite the function `Form' so that it forwards to `Module:Fun'.
-spec forward(module(), fun_type(), form()) -> form().
%% The result keeps only the first clause of `Form'. Its arguments are
%% replaced by generated variables Arg1..ArgN, its guards are cleared
%% and its body becomes a single call that passes those variables on.
forward(Module, Fun, Form) ->
    Clause = hd(fun_clauses(Form)),
    Args = clause_args(Clause),
    NumArgs = length(Args),
    GenArgs = [{var,?FAKE_LINE_NO,list_to_atom("Arg" ++ integer_to_list(I))}
               || I <- lists:seq(1,NumArgs)],
    Clause2 = clause_set_args(Clause, GenArgs),
    Clause3 = clause_clear_guards(Clause2),
    Body = [{call, 1,
             case Fun of
                 %% An atom names a function in `Module': emit a remote call.
                 Fun when is_atom(Fun) ->
                     {remote,1,{atom,1,Module},{atom,1,Fun}};
                 %% If Fun is a tuple, it's a pair comprising a list of
                 %% local variables to capture and an anonymous function
                 %% that's already in the abstract format. The anonymous
                 %% function uses the local variables.
                 {FreeVars, AnonFun} ->
                     generate_fun_wrapper(FreeVars, AnonFun, NumArgs)
             end, GenArgs}],
    Clause4 = clause_set_body(Clause3, Body),
    fun_set_clauses(Form, [Clause4]).

%% Replace the form found by keying on `module' in element 3 (the
%% `-module' attribute) with one naming `NewName'.
change_module_name(NewName, AC) ->
    lists:keyreplace(module, 3, AC, {attribute,1,module,NewName}).

%% @private
%%
%% @doc Generate an anonymous function wrapper that sets up calls for an
%% anonymous function interceptor.
%%
%% This function returns the abstract code equivalent of the following
%% code. If you change this code, please update this comment.
%% (Text in angle brackets stands for generated abstract code.)
%%
%% fun(__A1_, __A2_, ...) ->
%%     __Bindings0_ = lists:foldl(fun({__Bn_,__Bv_},__Acc_) ->
%%                                    erl_eval:add_binding(__Bn_,__Bv_,__Acc_)
%%                                end,
%%                                erl_eval:new_bindings(),
%%                                <list built by generate_freevars from FreeVars>),
%%     __Bindings_ = lists:foldl(fun({{var,_,__Vn_},__V_},__Acc_) ->
%%                                   erl_eval:add_binding(__Vn_,__V_,__Acc_)
%%                               end,
%%                               __Bindings0_,
%%                               <list pairing each argument variable's
%%                                abstract form with its value>),
%%     erl_eval:expr(<abstract form of the call AnonFun(__A1_, __A2_, ...)>,
%%                   __Bindings_, none, none, value).
%%
%% `FreeVars' is the list of captured variables handed to
%% generate_freevars/2, `AnonFun' is the interceptor fun in the abstract
%% format and `NumArgs' is the number of arguments the wrapper takes.
generate_fun_wrapper(FreeVars, AnonFun, NumArgs) ->
    L = ?FAKE_LINE_NO,
    %% Argument variables are named "__A" ++ [Var+$0] ++ "_", i.e.
    %% '__A1_', '__A2_', ...  For Var >= 10 the middle character is no
    %% longer a digit, but the names are still distinct atoms.
    Args = [{var,L,list_to_atom(lists:flatten(["__A",Var+$0],"_"))} ||
               Var <- lists:seq(1, NumArgs)],
    {'fun',L,
     {clauses,
      [{clause,L,Args,[],
        %% Statement 1: __Bindings0_ = fold of erl_eval:add_binding/3 over
        %% the {Name, Value} pairs built by generate_freevars/2, starting
        %% from erl_eval:new_bindings().
        [{match,L+1,
          {var,L+1,'__Bindings0_'},
          {call,L+1,
           {remote,L+1,{atom,L+1,lists},{atom,L+1,foldl}},
           [{'fun',L+1,
             {clauses,
              [{clause,L+1,
                [{tuple,L+1,[{var,L+1,'__Bn_'},{var,L+1,'__Bv_'}]},
                 {var,L+1,'__Acc_'}],
                [],
                [{call,L+2,
                  {remote,L+2,
                   {atom,L+2,erl_eval},
                   {atom,L+2,add_binding}},
                  [{var,L+2,'__Bn_'},{var,L+2,'__Bv_'},{var,L+2,'__Acc_'}]}]
               }]}},
            {call,L+3,
             {remote,L+3,{atom,L+3,erl_eval},{atom,L+3,new_bindings}},[]},
            generate_freevars(FreeVars,L+3)]}},
         %% Statement 2: __Bindings_ = __Bindings0_ extended with one
         %% binding per wrapper argument.  The folded list pairs the
         %% abstract form of each `{var,_,Name}' tuple with the variable
         %% itself, so at run time each element is {{var,_,Name}, Value}.
         {match,L+4,
          {var,L+4,'__Bindings_'},
          {call,L+4,
           {remote,L+4,{atom,L+4,lists},{atom,L+4,foldl}},
           [{'fun',L+4,
             {clauses,
              [{clause,L+4,
                [{tuple,L+4,[{tuple,L+4,[{atom,L+4,var},{var,L+4,'_'},
                                         {var,L+4,'__Vn_'}]},{var,L+4,'__V_'}]},
                 {var,L+4,'__Acc_'}],
                [],
                [{call,L+5,
                  {remote,L+5,
                   {atom,L+5,erl_eval},
                   {atom,L+5,add_binding}},
                  [{var,L+5,'__Vn_'},{var,L+5,'__V_'},{var,L+5,'__Acc_'}]}]
               }]}},
            {var,L+6,'__Bindings0_'},
            lists:foldl(fun(V,Acc) ->
                                AV = erl_parse:abstract(V),
                                {cons,L+6,{tuple,L+6,[AV,V]},Acc}
                        end,{nil,L+6},Args)]}},
         %% Statement 3: evaluate the call AnonFun(Args...) with
         %% erl_eval:expr/5 under the bindings assembled above.
         {call,L+7,
          {remote,L+7,
           {atom,L+7,erl_eval},
           {atom,L+7,expr}},
          [erl_parse:abstract({call,L+7,AnonFun,
                               [{var,L+7,V} || {var,_,V} <- Args]},L+7),
           {var,L+7,'__Bindings_'},
           {atom,L+7,none},
           {atom,L+7,none},
           {atom,L+7,value}]}]}]}}.
%% @private
%%
%% @doc Turn the free variables given to generate_fun_wrapper/3 into the
%% abstract code of a list, one element per free variable.
generate_freevars([], L) ->
    {nil, L};
generate_freevars([Head | Tail], L) ->
    AbsHead = generate_freevar(Head, L),
    AbsTail = generate_freevars(Tail, L),
    {cons, L, AbsHead, AbsTail}.

%% @private
%%
%% @doc Convert a single free variable to abstract code.
%%
%% The result is the abstract format of a `{VarName, VarValue}' tuple.
%% For a fun value, the fun's env is inspected for its own free
%% variables and a wrapper is generated for it; a fun with an empty env
%% raises `{badarg, Fun}'.  Pids, ports and references have no abstract
%% format, so they are serialised with term_to_binary/1 and the
%% generated code calls erlang:binary_to_term/1 to turn them back into
%% terms when accessed.  Everything else goes straight through
%% erl_parse:abstract/2.
generate_freevar({Name, Var}, L) when is_function(Var) ->
    {env, Env} = erlang:fun_info(Var, env),
    case Env of
        [FreeVars, _, _, Clauses] ->
            {arity, Arity} = erlang:fun_info(Var, arity),
            Wrapper = generate_fun_wrapper(FreeVars,
                                           {'fun', L, {clauses, Clauses}},
                                           Arity),
            {tuple, L, [{atom, L, Name}, Wrapper]};
        [] ->
            error({badarg, Var})
    end;
generate_freevar({Name, Var}, L)
  when is_pid(Var); is_port(Var); is_reference(Var) ->
    Encoded = erl_parse:abstract(term_to_binary(Var)),
    Decode = {remote, L, {atom, L, erlang}, {atom, L, binary_to_term}},
    {tuple, L, [{atom, L, Name}, {call, L, Decode, [Encoded]}]};
generate_freevar(NameVar, L) ->
    erl_parse:abstract(NameVar, L).

%% @private
%%
%% @doc Insert an `export_all' compile attribute after the first two
%% forms of the abstract code `AC'.
export_all(AC) ->
    Directive = {attribute, 2, compile, export_all},
    [First, Second | Others] = AC,
    [First, Second, Directive | Others].

%% @private
%%
%% @doc Apply the form modifier `F' to all forms in `AC' that are
%% function definitions.
-spec apply_to_funs(form_mod(), abstract_code()) -> abstract_code().
apply_to_funs(F, AC) ->
    lists:map(apply_if_fun_def(F), AC).

%% @private
%%
%% @doc Fetch the abstract code of `Module' from its object code.  The
%% module must have been compiled with `debug_info'; otherwise the
%% match on the `abstract_code' chunk fails.
-spec get_abstract_code(module()) -> abstract_code().
get_abstract_code(Module) ->
    {_Mod, Beam, _File} = code:get_object_code(Module),
    {ok, {_, [{abstract_code, {_Vsn, Forms}}]}} =
        beam_lib:chunks(Beam, [abstract_code]),
    Forms.

%% @private
%% Wrap `Fun' so that it is applied to function definitions only; every
%% other form is returned unchanged.
apply_if_fun_def(Fun) ->
    fun(Form) ->
            case Form of
                _ when element(1, Form) == function -> Fun(Form);
                _ -> Form
            end
    end.

%% @private
%% True when the form is tagged `function'.
is_fun(Form) ->
    function == element(1, Form).

%% @private
%% Argument patterns of a clause (3rd element).
clause_args(Clause) ->
    element(3, Clause).

%% @private
%% Replace the argument patterns of a clause.
clause_set_args(Clause, Args) ->
    setelement(3, Clause, Args).

%% @private
%% Drop all guards of a clause (4th element).
clause_clear_guards(Clause) ->
    setelement(4, Clause, []).

%% @private
%% Replace the body of a clause (5th element).
clause_set_body(Clause, Body) ->
    setelement(5, Clause, Body).

%% @private
%% Arity of a function form (4th element).
fun_arity(FunForm) ->
    element(4, FunForm).

%% @private
%% Clause list of a function form (5th element).
fun_clauses(FunForm) ->
    element(5, FunForm).

%% @private
%% Replace the clause list of a function form.
fun_set_clauses(FunForm, Clauses) ->
    setelement(5, FunForm, Clauses).

%% @private
%% Name of a function form (3rd element).
fun_name(FunForm) ->
    element(3, FunForm).

--------------------------------------------------------------------------------
/test/leadership_watchers.erl:
--------------------------------------------------------------------------------
-module(leadership_watchers).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% Run scenario/0 through the shared ensemble test harness, passing 40
%% as the harness's second argument.
run_test_() ->
    ens_test:run(fun scenario/0, 40).
%% Exercise the leader-status watcher API of riak_ensemble_peer against
%% the leader of the root ensemble: register, unregister and
%% re-register the test process as a watcher, force a leader change by
%% suspending the leader, and check that a dead external watcher is
%% removed from the peer's watcher list.
scenario() ->
    ens_test:start(3),
    ens_test:wait_stable(root),

    Pid = riak_ensemble_manager:get_leader_pid(root),

    ?assertEqual(0, length(riak_ensemble_peer:get_watchers(Pid))),
    ?debugMsg("Watching leader"),
    riak_ensemble_peer:watch_leader_status(Pid),
    ?assertEqual(1, length(riak_ensemble_peer:get_watchers(Pid))),
    ?debugMsg("Waiting for is_leading notification"),
    wait_status(is_leading, Pid),

    ?debugMsg("Stopping watching leader"),
    riak_ensemble_peer:stop_watching(Pid),
    ?assertEqual(0, length(riak_ensemble_peer:get_watchers(Pid))),

    ?debugMsg("Starting watching leader again"),
    riak_ensemble_peer:watch_leader_status(Pid),
    ?assertEqual(1, length(riak_ensemble_peer:get_watchers(Pid))),
    wait_status(is_leading, Pid),

    ?debugMsg("Suspending leader, and waiting for new leader to be elected"),
    erlang:suspend_process(Pid),
    ens_test:wait_stable(root),

    ?debugMsg("Resuming former leader, and waiting for is_not_leading notification"),
    erlang:resume_process(Pid),
    wait_status(is_not_leading, Pid),

    ?debugMsg("Watching leader in external process"),
    Watcher = spawn_link(fun() -> watcher(Pid) end),
    %% The test process is still registered, so the external watcher
    %% makes two.
    wait_until_n_watchers(2, Pid),

    ?debugMsg("Killing external watcher process and checking peer state"),
    Watcher ! die,
    wait_until_n_watchers(1, Pid).

%% Block until a 5-tuple tagged `Status' from peer `Pid' arrives in the
%% mailbox; throw `timeout_waiting_for_leader_status' after 5 seconds.
wait_status(Status, Pid) ->
    receive
        {Status, Pid, _, _, _} ->
            ok
    after
        5000 ->
            throw(timeout_waiting_for_leader_status)
    end.

%% Just a fun to spawn a process that will exit when we tell it to, so we
%% can test that the watcher gets removed from the ensemble peer state when
%% a watcher process dies.
watcher(PeerPid) ->
    riak_ensemble_peer:watch_leader_status(PeerPid),
    receive
        die ->
            ok
    end.
%% Assert that ens_test:wait_until/1 returns `ok' for a check that
%% compares the peer's watcher count with `N'.
wait_until_n_watchers(N, Pid) ->
    WatcherCountCheck = fun() -> N =:= length(riak_ensemble_peer:get_watchers(Pid)) end,
    ?assertEqual(ok, ens_test:wait_until(WatcherCountCheck)).

--------------------------------------------------------------------------------
/test/lease_test.erl:
--------------------------------------------------------------------------------
-module(lease_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

run_test_() ->
    ens_test:run(fun scenario/0).

%% Read the same key under different lease settings: lease trusted,
%% lease not trusted, followers intercepted to reject the epoch check,
%% and a lease duration of zero.  Each step pattern-matches the
%% expected result, so a mismatch fails the test with a badmatch.
scenario() ->
    ens_test:start(3),
    {ok, _} = ens_test:kput(<<"test">>, <<"test">>),

    %% Test with lease trusted
    {ok, _} = ens_test:kget(<<"test">>),

    %% Test with lease not trusted
    ok = application:set_env(riak_ensemble, trust_lease, false),
    {ok, _} = ens_test:kget(<<"test">>),

    %% Test with lease not trusted and followers intercepted to
    %% nack epoch check.
    rt_intercept:add(node(), {riak_ensemble_peer, [{{check_epoch,3}, check_epoch_false}]}),
    {error, timeout} = ens_test:kget(<<"test">>),

    %% Test with lease trusted again
    ok = application:set_env(riak_ensemble, trust_lease, true),
    %% Because of error above, leader may have changed. Wait until stable.
    ens_test:wait_stable(root),
    %% Read twice because leader change forces first read to rewrite, which
    %% ignores the lease entirely.
    {ok, _} = ens_test:kget(<<"test">>),
    {ok, _} = ens_test:kget(<<"test">>),

    %% Test with simulated expired lease
    ok = application:set_env(riak_ensemble, follower_timeout, 1000),
    ok = application:set_env(riak_ensemble, lease_duration, 0),
    timer:sleep(1000),
    {error, _} = ens_test:kget(<<"test">>),

    %% Remove intercept and test that all is well
    %% (the `check_epoch' intercept simply calls the original function)
    rt_intercept:add(node(), {riak_ensemble_peer, [{{check_epoch,3}, check_epoch}]}),
    ens_test:wait_stable(root),
    {ok, _} = ens_test:kget(<<"test">>),
    {ok, _} = ens_test:kget(<<"test">>),

    ok.

--------------------------------------------------------------------------------
/test/read_tombstone_test.erl:
--------------------------------------------------------------------------------
-module(read_tombstone_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

%% This test specifically targets an optimization in riak_ensemble that is
%% intended to avoid writing tombstones when possible on reads that return
%% notfound. Normally reads that return notfound require a tombstone to be
%% written in case there's a partial write somewhere in the ensemble that
%% hasn't been read yet. If we wait an extra, say, 1ms though, then it's
%% very likely we will see replies from every peer in the ensemble, in
%% which case we can definitively see whether any partial writes exist or
%% not, and potentially avoid the need to write a tombstone.

run_test_() ->
    ens_test:run(fun scenario/0).
%% First read a missing key with a long `notfound_read_delay' and check
%% that no peer stored a tombstone; then read another missing key with
%% the delay set to 0 and one follower suspended, and check that the
%% leader and the remaining follower did store one.
scenario() ->
    ens_test:start(3),
    ens_test:wait_stable(root),
    application:set_env(riak_ensemble, notfound_read_delay, 3000),

    Peers = riak_ensemble_manager:get_members(root),
    Leader = riak_ensemble_manager:get_leader(root),
    %% With three members, exactly two followers remain.
    [Follow1, Follow2] = Peers -- [Leader],

    ?debugMsg("Running kget on a nonexistent key"),
    {ok, _} = ens_test:kget(<<"test">>),

    ?debugMsg("Testing that no peers have tombstones"),
    ?assert(is_notfound(Leader, <<"test">>)),
    ?assert(is_notfound(Follow1, <<"test">>)),
    ?assert(is_notfound(Follow2, <<"test">>)),

    ?debugMsg("Running kget with one member suspended"),
    application:set_env(riak_ensemble, notfound_read_delay, 0),
    Peer2Pid = riak_ensemble_manager:get_peer_pid(root, Follow2),
    erlang:suspend_process(Peer2Pid),
    {ok, _} = ens_test:kget(<<"test2">>),
    erlang:resume_process(Peer2Pid),

    ?debugMsg("Testing that active peers have tombstones"),
    ?assertNot(is_notfound(Leader, <<"test2">>)),
    ?assertNot(is_notfound(Follow1, <<"test2">>)),

    ok.

%% This can be used to check for tombstones, because if a key
%% has a tombstone, we'll see an object returned that wraps
%% the 'notfound' value rather than just the atom 'notfound' alone.
is_notfound(Member, Key) ->
    Pid = riak_ensemble_manager:get_peer_pid(root, Member),
    Res = riak_ensemble_peer:debug_local_get(Pid, Key),
    Res =:= notfound.

--------------------------------------------------------------------------------
/test/replace_members_test.erl:
--------------------------------------------------------------------------------
-module(replace_members_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

-define(TIMEOUT, 60). %% seconds
run_test_() ->
    ens_test:run(fun scenario/0, ?TIMEOUT).
%% Replace the root ensemble members root/2/3 with 4/5/6 in one update,
%% observe that a read of previously written data fails on the new
%% members, then switch back to the original members and read the data
%% successfully.
scenario() ->
    ens_test:start(3),
    ens_test:wait_stable(root),
    {ok, V} = ens_test:kput(<<"test">>, <<"test">>),
    %% `V' is already bound, so this asserts the read returns what was put.
    {ok, V} = ens_test:kget(<<"test">>),
    Pid = riak_ensemble_manager:get_leader_pid(root),
    Adds = [{I, node()} || I <- [4,5,6]],
    Dels = [{I, node()} || I <- [root, 2, 3]],
    Changes = [{add, Member} || Member <- Adds] ++
              [{del, Member} || Member <- Dels],
    ?debugMsg("Replacing Root Members: root/2/3 -> 4/5/6"),
    ok = riak_ensemble_peer:update_members(Pid, Changes, 10000),

    ?debugMsg("Waiting for root ensemble to stabilize"),
    ens_test:wait_members(root, Adds),
    ens_test:wait_stable(root),
    ?debugMsg("Done waiting for root ensemble to stabilize"),

    %% This fails because we sync metadata trees, but not the data itself.
    %% Riak KV hands off data before the transition so this shouldn't happen in
    %% practice.
    ?debugMsg("Performing failing get"),
    {error, failed} = ens_test:kget(<<"test">>),

    %% The failure above causes the leader to step down. We need to wait for a
    %% new election or the call to update_members/3 below will fail.
    ?debugMsg("Waiting for root ensemble to stabilize"),
    ens_test:wait_members(root, Adds),
    ens_test:wait_stable(root),
    ?debugMsg("Done waiting for root ensemble to stabilize"),

    ?debugMsg("Replacing root members with original members"),
    Changes2 = [{add, M} || M <- Dels] ++ [{del, M} || M <- Adds],
    %% Look the leader up again: `Pid' refers to the leader from before
    %% the membership change.
    Leader = riak_ensemble_manager:get_leader_pid(root),
    ok = riak_ensemble_peer:update_members(Leader, Changes2, 10000),

    ?debugMsg("Waiting for root ensemble to stabilize"),
    ens_test:wait_members(root, Dels),
    ens_test:wait_stable(root),
    ?debugMsg("Done waiting for root ensemble to stabilize"),

    %% The following get succeeds because the data is still stored on root/2/3
    ?debugMsg("Performing successful get"),
    {ok, _} = ens_test:kget(<<"test">>),
    ok.

--------------------------------------------------------------------------------
/test/riak_ensemble_basic_backend_intercepts.erl:
--------------------------------------------------------------------------------
-module(riak_ensemble_basic_backend_intercepts).
-compile([export_all, nowarn_export_all]).
-include("riak_ensemble_types.hrl").

-define(M, riak_ensemble_basic_backend_orig).

%% copied from riak_ensemble_basic_backend.erl
-record(state, {savefile :: file:filename(),
                id       :: peer_id(),
                data     :: orddict:orddict()}).

%% Intercept for put/4.  For keys starting with "drop", any peer whose
%% id is not `{root, _}' replies with `Obj' but returns its state
%% unchanged, i.e. the write is acknowledged without being applied.
%% All other cases are delegated to the original put.
drop_put(Key, Obj, From, State=#state{id=Id}) ->
    case Key of
        <<"drop",_/binary>> ->
            case Id of
                {root, _} ->
                    ?M:put_orig(Key, Obj, From, State);
                _ ->
                    riak_ensemble_backend:reply(From, Obj),
                    State
            end;
        _ ->
            ?M:put_orig(Key, Obj, From, State)
    end.

%% Intercept for synctree_path/2.  For the `root' ensemble it returns a
%% per-peer tree id (the serialised `Id') together with the fixed path
%% "root"; every other ensemble gets `default'.
synctree_path_shared(root, Id) ->
    TreeId = term_to_binary(Id),
    Path = "root",
    {TreeId, Path};
synctree_path_shared(_, _) ->
    default.
--------------------------------------------------------------------------------
/test/riak_ensemble_peer_intercepts.erl:
--------------------------------------------------------------------------------
-module(riak_ensemble_peer_intercepts).
-compile([export_all, nowarn_export_all]).
-include("riak_ensemble_types.hrl").

-define(M, riak_ensemble_peer_orig).

%% Intercept for check_epoch/3 that rejects every epoch check.
check_epoch_false(_Peer, _Epoch, _State) ->
    false.

%% Pass-through intercept: delegates to the original check_epoch/3.
%% Installing it undoes the effect of check_epoch_false/3.
check_epoch(Peer, Epoch, State) ->
    ?M:check_epoch_orig(Peer, Epoch, State).

--------------------------------------------------------------------------------
/test/rt_intercept.erl:
--------------------------------------------------------------------------------
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013-2014 Basho Technologies, Inc.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License.  You may obtain
%% a copy of the License at
%%
%%   http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied.  See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(rt_intercept).
-compile([export_all, nowarn_export_all]).

-include_lib("kernel/include/logger.hrl").

%% Name of the intercept module used when none is given explicitly:
%% the target module's name with "_intercepts" appended.
-define(DEFAULT_INTERCEPT(Target),
        list_to_atom(atom_to_list(Target) ++ "_intercepts")).

%% Map source file paths to module names: the basename of each path
%% with its ".erl" extension removed, as an atom.
files_to_mods(Files) ->
    ToModule = fun(File) ->
                       list_to_atom(filename:basename(File, ".erl"))
               end,
    lists:map(ToModule, Files).
%% Glob matching every intercept source file under the "intercepts"
%% directory of rt_local:home_dir().
default_intercept_path_glob() ->
    Home = rt_local:home_dir(),
    filename:join([Home, "intercepts", "*.erl"]).

%% Intercept source files found through the default glob.
intercept_files() ->
    intercept_files([default_intercept_path_glob()]).

%% All files matched by any of `Globs', in glob order.
intercept_files(Globs) ->
    lists:append([filelib:wildcard(Glob) || Glob <- Globs]).

%% @doc Load the intercepts on the nodes under test.
-spec load_intercepts([node()]) -> ok.
load_intercepts(Nodes) ->
    load_intercepts(Nodes, [default_intercept_path_glob()]).

%% Does nothing when the `load_intercepts' config setting is false.
%% Otherwise the intercept code is loaded on every node first, and the
%% configured `intercepts' are added on every node afterwards.
-spec load_intercepts([node()], [string()]) -> ok.
load_intercepts(Nodes, Globs) ->
    case rt_config:get(load_intercepts, true) of
        true ->
            Intercepts = rt_config:get(intercepts, []),
            LoadCode = fun(Node) -> load_code(Node, Globs) end,
            AddIntercepts = fun(Node) -> add(Node, Intercepts) end,
            rt:pmap(LoadCode, Nodes),
            rt:pmap(AddIntercepts, Nodes),
            ok;
        false ->
            ok
    end.

%% Compile and load the default intercept files on `Node'.
load_code(Node) ->
    load_code(Node, [default_intercept_path_glob()]).

%% Wait for `Node' to answer pings, then compile and load on it every
%% file matched by `Globs'.
load_code(Node, Globs) ->
    rt:wait_until_pingable(Node),
    lists:foreach(fun(File) ->
                          ok = remote_compile_and_load(Node, File)
                  end,
                  intercept_files(Globs)),
    ok.

%% Like add/2, but passes the node's "basho-patches" code path as the
%% output directory.  Exactly one code path must end in
%% "basho-patches".
add_and_save(Node, Intercepts) ->
    CodePaths = rpc:call(Node, code, get_path, []),
    IsPatchesDir = fun(Path) -> lists:suffix("basho-patches", Path) end,
    [PatchesDir] = lists:filter(IsPatchesDir, CodePaths),
    add(Node, Intercepts, PatchesDir).

%% Add intercepts on `Node' with the output directory `undefined'.
add(Node, Intercepts) ->
    add(Node, Intercepts, undefined).

%% Add one intercept spec, or a list of them, on `Node'.  A spec is
%% either `{Target, Mapping}', in which case the intercept module
%% defaults to `?DEFAULT_INTERCEPT(Target)', or
%% `{Target, Intercept, Mapping}'.
add(Node, Intercepts, OutDir) when is_list(Intercepts) ->
    lists:foreach(fun(Spec) -> ok = add(Node, Spec, OutDir) end, Intercepts),
    ok;

add(Node, {Target, Mapping}, OutDir) ->
    Intercept = ?DEFAULT_INTERCEPT(Target),
    add(Node, {Target, Intercept, Mapping}, OutDir);

add(Node, {Target, Intercept, Mapping}, OutDir) ->
    %% Anonymous funs in the mapping are converted to their abstract form
    NMapping = lists:map(fun transform_anon_fun/1, Mapping),
    ok = rpc:call(Node, intercept, add, [Target, Intercept, NMapping, OutDir]).
%% Call intercept:clean/1 on `Node' for one target or a list of targets.
clean(Node, Targets) when is_list(Targets) ->
    lists:foreach(fun(Target) -> ok = clean(Node, Target) end, Targets),
    ok;
clean(Node, Target) ->
    ok = rpc:call(Node, intercept, clean, [Target]).

%% The following function transforms anonymous function mappings passed
%% from an Erlang shell. Anonymous intercept functions from compiled code
%% require the developer to supply free variables themselves, and also
%% require use of the rt_intercept_pt parse transform.
transform_anon_fun({FunArity, Intercept}=Mapping) when is_function(Intercept) ->
    {env, Env} = erlang:fun_info(Intercept, env),
    case Env of
        [FreeVars, _, _, Clauses] ->
            AbstractFun = {'fun', 1, {clauses, Clauses}},
            {FunArity, {FreeVars, AbstractFun}};
        [] ->
            error({badarg, Mapping})
    end;
transform_anon_fun(Mapping) ->
    Mapping.

%% Compile the source file `F' to a binary on `Node' and load it there
%% under the module name derived from the file name.
remote_compile_and_load(Node, F) ->
    ?LOG_DEBUG("Compiling and loading file ~s on node ~s", [F, Node]),
    {ok, _, Beam} = rpc:call(Node, compile, file, [F, [binary]]),
    Module = list_to_atom(filename:basename(F, ".erl")),
    {module, _} = rpc:call(Node, code, load_binary, [Module, F, Beam]),
    ok.

%% Wait until the default intercept modules are loaded on `Node'.
wait_until_loaded(Node) ->
    wait_until_loaded(Node, 0).

%% Gives up on the sixth attempt by RETURNING
%% `{failed_to_load_intercepts, Node}' (it does not raise).  Returns
%% `ok' immediately when the `load_intercepts' setting is false.
wait_until_loaded(Node, 5) ->
    {failed_to_load_intercepts, Node};

wait_until_loaded(Node, Tries) ->
    case rt_config:get(load_intercepts, true) of
        true ->
            poll_loaded(Node, Tries);
        false ->
            ok
    end.

%% One polling step of wait_until_loaded/2: succeed if everything is
%% loaded, otherwise sleep half a second and try again.
poll_loaded(Node, Tries) ->
    case are_intercepts_loaded(Node) of
        true ->
            ok;
        false ->
            timer:sleep(500),
            wait_until_loaded(Node, Tries + 1)
    end.

%% True when every default intercept module is loaded on `Node'.
are_intercepts_loaded(Node) ->
    are_intercepts_loaded(Node, [default_intercept_path_glob()]).

%% True when every module derived from the files matching `Globs' is
%% loaded on `Node'.  All modules are queried before the results are
%% inspected.
are_intercepts_loaded(Node, Globs) ->
    Modules = files_to_mods(intercept_files(Globs)),
    Results = lists:map(fun(Mod) ->
                                rpc:call(Node, code, is_loaded, [Mod])
                        end,
                        Modules),
    lists:all(fun is_loaded/1, Results).
%% Interpret a result of code:is_loaded/1: `{file, _}' means the module
%% is loaded; anything else (including an RPC error) counts as not loaded.
is_loaded({file,_}) -> true;
is_loaded(_) -> false.

--------------------------------------------------------------------------------
/test/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

## Run each module separately to provide isolation
## Assumption: 1 test per module

ran=0
failed=0
failures=""

## Every line of test/TESTS that does not contain a '#' names a suite.
for name in $(grep -v '#' test/TESTS); do
    ./rebar -C rebar.test.config skip_deps=true suites=$name eunit
    if [ $? -ne 0 ]; then
        failed=$((failed+1))
        failures="$failures $name"
    fi
    ran=$((ran+1))
done

echo "Results: ran $ran tests of which $failed failed".
## Exit non-zero (after listing the failed suites) if anything failed.
if [ $failed -gt 0 ]; then
    echo "Failed:"
    for name in $failures; do
        echo " $name"
    done
    exit 1
fi

## remove deps.test so top level riak generate doesn't fail
rm -rf deps.test

--------------------------------------------------------------------------------
/test/synctree_intercepts.erl:
--------------------------------------------------------------------------------
-module(synctree_intercepts).
-compile([export_all, nowarn_export_all]).
-define(M, synctree_orig).

%% copied from synctree.erl
%% NOTE(review): this must stay in sync with the record in synctree.erl,
%% since the intercepts below pattern-match on it.
-record(tree, {id        :: term(),
               width     :: pos_integer(),
               segments  :: pos_integer(),
               height    :: pos_integer(),
               shift     :: pos_integer(),
               shift_max :: pos_integer(),
               top_hash  :: any(),
               buffer    :: [_],
               buffered  :: non_neg_integer(),
               mod       :: module(),
               modstate  :: any()
              }).
%% Intercepts for m_store/2.  Each one decides through should_corrupt/2
%% whether this store should be tampered with.  If so, one entry of the
%% REVERSED update list is passed through corrupt_level/1 before the
%% list is handed to the original m_store; note that the list is passed
%% on in reversed order, not restored.  Otherwise the updates are
%% stored untouched.

%% Corrupt two levels above the segment (the third entry of the
%% reversed update list), keyed on this tree's id.
corrupt_upper(Updates, Tree=#tree{id=Id}) ->
    store_maybe_corrupted(Id, Updates, Tree, fun corrupt_third/1).

%% Corrupt the segment itself (the last update), keyed on this tree's id.
corrupt_segment(Updates, Tree=#tree{id=Id}) ->
    store_maybe_corrupted(Id, Updates, Tree, fun corrupt_first/1).

%% Corrupt the segment on every tree, whatever its id.
corrupt_segment_all(Updates, Tree=#tree{}) ->
    store_maybe_corrupted(all, Updates, Tree, fun corrupt_first/1).

%% Corrupt the segment on trees selected by the `{follower, Id}' rule
%% of should_corrupt/2.
corrupt_segment_follower(Updates, Tree=#tree{id=Id}) ->
    store_maybe_corrupted({follower, Id}, Updates, Tree, fun corrupt_first/1).

%% Pass-through intercept: restores the normal m_store behaviour.
m_store_normal(Updates, Tree) ->
    ?M:m_store_orig(Updates, Tree).

%% Shared body of the corrupting intercepts.  `Who' is the selector
%% given to should_corrupt/2 and `Corrupt' rewrites the reversed update
%% list.
store_maybe_corrupted(Who, Updates, Tree, Corrupt) ->
    case should_corrupt(Who, Updates) of
        true ->
            Updates2 = Corrupt(lists:reverse(Updates)),
            ?M:m_store_orig(Updates2, Tree);
        false ->
            ?M:m_store_orig(Updates, Tree)
    end.

%% Corrupt the first entry of the reversed update list.
corrupt_first(Reversed) ->
    [A|Rest] = Reversed,
    [corrupt_level(A)|Rest].

%% Corrupt the third entry of the reversed update list.
corrupt_third(Reversed) ->
    [A,B,C|Rest] = Reversed,
    [A, B, corrupt_level(C) | Rest].
%% Decide whether a store with the given selector should be corrupted.
%% `all' and `{root, {root, _}}' always defer to
%% should_corrupt_updates/1; `{follower, PeerId}' does so only when
%% `PeerId' is not the current root leader.  Anything else is never
%% corrupted.
should_corrupt(Id, Updates) ->
    Leader = riak_ensemble_manager:get_leader(root),
    case Id of
        all ->
            should_corrupt_updates(Updates);
        {root, {root, _}} ->
            should_corrupt_updates(Updates);
        {follower, PeerId} when (PeerId =/= {root, Leader}) ->
            should_corrupt_updates(Updates);
        _ ->
            false
    end.

%% True when the last update is a `put' whose data is a list containing
%% the key <<"corrupt">>.
should_corrupt_updates(Updates) ->
    Segment = hd(lists:reverse(Updates)),
    case Segment of
        {put, _, Data} ->
            is_list(Data) andalso lists:keymember(<<"corrupt">>, 1, Data);
        _ ->
            false
    end.

%% Corrupt the value of the first key/value pair carried by a `put'.
corrupt_level({put, Key, Data}) ->
    [{VictimKey, VictimBin}|T] = Data,
    CorruptBin = corrupt_binary(VictimBin),
    {put, Key, [{VictimKey, CorruptBin}|T]}.

%% Corrupt a binary by incrementing its first byte modulo 256; the rest
%% of the binary is left unchanged.  An empty binary does not match.
corrupt_binary(<<X:8/integer, Bin/binary>>) ->
    Y = (X + 1) rem 256,
    <<Y:8/integer, Bin/binary>>.

--------------------------------------------------------------------------------
/test/synctree_path_test.erl:
--------------------------------------------------------------------------------
-module(synctree_path_test).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

run_test_() ->
    ens_test:run(fun scenario/0, 40).

%% Install the synctree_path_shared intercept before the ensemble
%% starts, then check that reads and writes still work, including
%% across a leader change forced by suspending the leader.
scenario() ->
    rt_intercept:add(node(), {riak_ensemble_basic_backend,
                              [{{synctree_path,2}, synctree_path_shared}]}),
    ens_test:start(3),
    ens_test:wait_stable(root),
    {ok, _} = ens_test:kput(<<"test">>, <<"test">>),
    {ok, _} = ens_test:kget(<<"test">>),
    ens_test:kget(cluster_state),
    Pid = riak_ensemble_manager:get_leader_pid(root),
    erlang:suspend_process(Pid),
    ens_test:wait_stable(root),
    {ok, _} = ens_test:kget(<<"test">>),
    erlang:resume_process(Pid),
    {ok, _} = ens_test:kget(<<"test">>),
    ok.
--------------------------------------------------------------------------------
/test/synctree_pure.erl:
--------------------------------------------------------------------------------
%% Simple synctree tests that are stateless.
-module(synctree_pure).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

-define(TEST(X), {timeout, 60, {test, ?MODULE, X}}).

%% EUnit generator: each backend-specific test runs with its own
%% 60-second timeout, and the whole group with another one.
run_test_() ->
    %% Violating "pure" principle a bit here, alas
    synctree_leveldb:init_ets(),
    {timeout, 60,
     [?TEST(test_basic_orddict),
      ?TEST(test_basic_ets),
      ?TEST(test_basic_leveldb),
      %% Kinda slow with orddict, disabling for now
      %% ?TEST(test_corrupt_orddict),
      ?TEST(test_corrupt_ets),
      ?TEST(test_corrupt_leveldb),
      ?TEST(test_exchange_orddict),
      ?TEST(test_exchange_ets),
      ?TEST(test_exchange_leveldb)]}.

test_basic_orddict() -> test_basic(synctree_orddict).
test_basic_ets()     -> test_basic(synctree_ets).
test_basic_leveldb() -> test_basic(synctree_leveldb).

%% Build a tree holding keys 1..100 (value = key * 10), check key 42,
%% then overwrite key 42 and check that the new value is read back.
test_basic(Mod) ->
    Tree = build(100, Mod),
    ?assertEqual(<<420:64/integer>>, synctree:get(42, Tree)),
    Updated = synctree:insert(42, <<42:64/integer>>, Tree),
    ?assertEqual(<<42:64/integer>>, synctree:get(42, Updated)),
    ok.

test_corrupt_orddict() -> test_corrupt(synctree_orddict).
test_corrupt_ets()     -> test_corrupt(synctree_ets).
test_corrupt_leveldb() -> test_corrupt(synctree_leveldb).
%% Corrupt key 4 of a ten-key tree: the read must report corruption,
%% and after a rehash the key must be gone.
test_corrupt(Mod) ->
    Tree = build(10, Mod),
    ?assertEqual(<<40:64/integer>>, synctree:get(4, Tree)),
    Corrupted = synctree:corrupt(4, Tree),
    ?assertMatch({corrupted, _, _}, synctree:get(4, Corrupted)),
    Rehashed = synctree:rehash(Corrupted),
    ?assertEqual(notfound, synctree:get(4, Rehashed)),
    ok.

test_exchange_orddict() -> test_exchange(synctree_orddict).
test_exchange_ets()     -> test_exchange(synctree_ets).
test_exchange_leveldb() -> test_exchange(synctree_leveldb).

%% Compare a 50-key tree with a 40-key tree; the difference must be
%% exactly the ten keys missing from the smaller tree.
test_exchange(Mod) ->
    Num = 50,
    Diff = 10,
    Bigger = build(Num, Mod),
    Smaller = build(Num - Diff, Mod),
    Delta = synctree:local_compare(Bigger, Smaller),
    ?assertEqual(expected_diff(Num, Diff), lists:sort(Delta)),
    ok.

%% Build an ETS-backed tree holding keys 1..N.
build(N) ->
    build(N, synctree_ets).

%% Build a tree with backend `Mod' holding keys 1..N.
build(N, Mod) ->
    Empty = synctree:new(undefined, default, default, Mod),
    do_build(N, Empty).

%% Insert keys N, N-1, ..., 1 into `T', each with the value key * 10
%% encoded as a 64-bit integer.
do_build(N, T) ->
    lists:foldl(fun(Key, Tree) ->
                        synctree:insert(Key, <<(Key*10):64/integer>>, Tree)
                end,
                T,
                lists:seq(N, 1, -1)).

%% The comparison result expected when the second tree lacks the last
%% `Diff' of `Num' keys: each missing key paired with its value on the
%% first side and '$none' on the other.
expected_diff(Num, Diff) ->
    Missing = lists:seq(Num - Diff + 1, Num),
    lists:map(fun(N) -> {N, {<<(N*10):64/integer>>, '$none'}} end, Missing).

--------------------------------------------------------------------------------
/test/synctree_remote.erl:
--------------------------------------------------------------------------------
-module(synctree_remote).
-compile([export_all, nowarn_export_all]).
-include_lib("eunit/include/eunit.hrl").

run_test_() ->
    {timeout, 60, fun test_remote/0}.
%% Compare a local ten-key tree with a six-key tree owned by another
%% process, fetching the remote side purely through messages, and check
%% that the reported difference is exactly the four missing keys.
test_remote() ->
    Num = 10,
    Diff = 4,

    %% Spawn new process for remote tree
    Other =
        spawn(fun() ->
                      B1 = synctree_pure:build(Num-Diff),
                      message_loop(B1, 0, 0)
              end),

    %% Build local tree
    A1 = synctree_pure:build(Num),
    Local = synctree:direct_exchange(A1),

    %% Compare with remote tree through message passing
    Remote = fun(exchange_get, {L, B}) ->
                     %% Use a bucket that was already streamed to us if
                     %% one is waiting in the mailbox; otherwise request
                     %% it explicitly and block for the reply.
                     receive {get_bucket, B, X} ->
                             X
                     after 0 ->
                             Other ! {get_bucket, self(), L, B},
                             receive {get_bucket, B, X} -> X end
                     end;
                (start_exchange_level, {Level, Buckets}) ->
                     Other ! {start_exchange_level, self(), Level, Buckets},
                     receive {start_exchange_level, X} -> X end
             end,

    KeyDiff = synctree:compare(synctree:height(A1), Local, Remote),
    Expected = synctree_pure:expected_diff(Num, Diff),
    ?assertEqual(Expected, KeyDiff),

    %% Signal spawned process to print stats and exit
    Other ! done,
    ok.

%% Server loop of the remote tree owner.  `Msgs' and `Bytes' count only
%% the explicitly requested (non-streamed) buckets and the size of
%% their replies; they are printed when `done' is received.
message_loop(Tree, Msgs, Bytes) ->
    receive
        {get_bucket, From, L, B} ->
            io:format(user, "Not streamed: ~p/~p~n", [L, B]),
            Size = send_bucket(From, L, B, Tree),
            message_loop(Tree, Msgs+1, Bytes+Size);
        {start_exchange_level, From, Level, Buckets} ->
            io:format(user, "Start streaming for ~p/~p~n", [Level, Buckets]),
            %% Acknowledge first, then push every requested bucket of
            %% this level without waiting to be asked.
            From ! {start_exchange_level, ok},
            _ = [send_bucket(From, Level, B, Tree) || B <- Buckets],
            message_loop(Tree, Msgs, Bytes);
        done ->
            io:format(user, "Exchanged messages: ~b~n", [Msgs]),
            io:format(user, "Exchanged bytes: ~b~n", [Bytes]),
            ok
    end.

%% Send bucket `B' of level `L' to `From' and return the size in bytes
%% of the reply in the external term format.
send_bucket(From, L, B, Tree) ->
    Reply = synctree:exchange_get(L, B, Tree),
    From ! {get_bucket, B, Reply},
    Size = byte_size(term_to_binary(Reply)),
    Size.